diff options
author | Anand Avati <avati@redhat.com> | 2014-01-16 16:14:36 -0800 |
---|---|---|
committer | Vijay Bellur <vbellur@redhat.com> | 2014-03-22 05:25:57 -0700 |
commit | 6d3739292b7b51d2ddbab75b5f884fb38925b943 (patch) | |
tree | cf332a881a49c0904a7e023935750c2d080fc1c5 /xlators | |
parent | eb87c96f49b3dd2c7460e58c54ce909c706cd475 (diff) |
cluster/afr: refactor
- Remove client side self-healing completely (opendir, openfd, lookup)
- Re-work readdir-failover to work reliably in case of NFS
- Remove unused/dead lock recovery code
- Consistently use xdata in both calls and callbacks in all FOPs
- Per-inode event generation, used to force inode ctx refresh
- Implement dirty flag support (in place of pending counts)
- Eliminate inode ctx structure, use read subvol bits + event_generation
- Implement inode ctx refreshing based on event generation
- Provide backward compatibility in transactions
- remove unused variables and functions
- make code more consistent in style and pattern
- regularize and clean up inode-write transaction code
- regularize and clean up dir-write transaction code
- regularize and clean up common FOPs
- reorganize transaction framework code
- skip setting xattrs in pending dict if nothing is pending
- re-write self-healing code using syncops
- re-write simpler self-heal-daemon
Change-Id: I1e4080c9796c8a2815c2dab4be3073f389d614a8
BUG: 1021686
Signed-off-by: Anand Avati <avati@redhat.com>
Reviewed-on: http://review.gluster.org/6010
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Vijay Bellur <vbellur@redhat.com>
Diffstat (limited to 'xlators')
31 files changed, 8805 insertions, 19320 deletions
diff --git a/xlators/cluster/afr/src/Makefile.am b/xlators/cluster/afr/src/Makefile.am index 35d18a6c0da..ea5a90abbdb 100644 --- a/xlators/cluster/afr/src/Makefile.am +++ b/xlators/cluster/afr/src/Makefile.am @@ -2,24 +2,26 @@ xlator_LTLIBRARIES = afr.la pump.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster afr_common_source = afr-dir-read.c afr-dir-write.c afr-inode-read.c \ - afr-inode-write.c afr-open.c afr-transaction.c afr-self-heal-data.c \ - afr-self-heal-common.c afr-self-heal-metadata.c afr-self-heal-entry.c \ - afr-self-heal-algorithm.c afr-lk-common.c afr-self-heald.c \ + afr-inode-write.c afr-open.c afr-transaction.c afr-lk-common.c \ + afr-read-txn.c \ $(top_builddir)/xlators/lib/src/libxlator.c +AFR_SELFHEAL_SOURCES = afr-self-heal-common.c afr-self-heal-data.c \ + afr-self-heal-entry.c afr-self-heal-metadata.c afr-self-heald.c \ + afr-self-heal-name.c + afr_la_LDFLAGS = -module -avoid-version -afr_la_SOURCES = $(afr_common_source) afr.c +afr_la_SOURCES = $(afr_common_source) $(AFR_SELFHEAL_SOURCES) afr.c afr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la pump_la_LDFLAGS = -module -avoid-version -pump_la_SOURCES = $(afr_common_source) pump.c +pump_la_SOURCES = $(afr_common_source) $(AFR_SELFHEAL_SOURCES) pump.c pump_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la noinst_HEADERS = afr.h afr-transaction.h afr-inode-write.h afr-inode-read.h \ - afr-dir-read.h afr-dir-write.h afr-self-heal.h afr-self-heal-common.h \ - afr-self-heal-algorithm.h pump.h afr-mem-types.h afr-common.c \ - afr-self-heald.h $(top_builddir)/xlators/lib/src/libxlator.h \ - $(top_builddir)/glusterfsd/src/glusterfsd.h + afr-dir-read.h afr-dir-write.h afr-self-heal.h afr-mem-types.h \ + afr-common.c afr-self-heald.h pump.h \ + $(top_builddir)/xlators/lib/src/libxlator.h AM_CPPFLAGS = $(GF_CPPFLAGS) \ -I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/xlators/lib/src \ @@ -31,7 +33,6 @@ CLEANFILES = uninstall-local: rm -f $(DESTDIR)$(xlatordir)/replicate.so - rm -f $(DESTDIR)$(xlatordir)/pump.so install-data-hook: ln -sf afr.so $(DESTDIR)$(xlatordir)/replicate.so diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 224d3054626..2bab0f8533d 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -45,787 +45,797 @@ #include "afr-dir-write.h" #include "afr-transaction.h" #include "afr-self-heal.h" -#include "afr-self-heal-common.h" #include "afr-self-heald.h" -#include "pump.h" -#define AFR_ICTX_OPENDIR_DONE_MASK 0x0000000100000000ULL -#define AFR_ICTX_READ_CHILD_MASK 0x00000000FFFFFFFFULL -#define AFR_STATISTICS_HISTORY_SIZE 50 -int -afr_lookup_done_success_action (call_frame_t *frame, xlator_t *this, - gf_boolean_t fail_conflict); -void -afr_children_copy (int32_t *dst, int32_t *src, unsigned int child_count) -{ - int i = 0; - for (i = 0; i < child_count; i++) - dst[i] = src[i]; -} - -void -afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req, const char *path) +call_frame_t * +afr_copy_frame (call_frame_t *base) { - int i = 0; - afr_private_t *priv = NULL; - int ret = 0; + afr_local_t *local = NULL; + call_frame_t *frame = NULL; + int op_errno = 0; - priv = this->private; + frame = copy_frame (base); + if (!frame) + return NULL; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) { + AFR_STACK_DESTROY (frame); + return NULL; + } - for (i = 0; i < priv->child_count; i++) { - ret = dict_set_uint64 (xattr_req, priv->pending_key[i], - 3 * sizeof(int32_t)); - if (ret < 0) - gf_log (this->name, GF_LOG_WARNING, - "%s: Unable to set dict value for %s", - path, priv->pending_key[i]); - /* 3 = data+metadata+entry */ - } - ret = dict_set_int32 (xattr_req, GF_GFIDLESS_LOOKUP, 1); - if (ret) { - gf_log (this->name, GF_LOG_DEBUG, "%s: failed to set gfidless " - "lookup", path); - } + return frame; } +/* + * INODE CTX 64-bit VALUE FORMAT FOR SMALL (<= 16) SUBVOL COUNTS: + * + * |<---------- 64bit ------------>| + * 63 32 31 16 15 0 + * | EVENT_GEN | DATA | METADATA | + * + * + * METADATA (bit-0 .. bit-15): bitmap representing subvolumes from which + * metadata can be attempted to be read. + * + * bit-0 => priv->subvolumes[0] + * bit-1 => priv->subvolumes[1] + * ... etc. till bit-15 + * + * DATA (bit-16 .. bit-31): bitmap representing subvolumes from which data + * can be attempted to be read. + * + * bit-16 => priv->subvolumes[0] + * bit-17 => priv->subvolumes[1] + * ... etc. till bit-31 + * + * EVENT_GEN (bit-32 .. bit-63): event generation (i.e priv->event_generation) + * when DATA and METADATA was last updated. + * + * If EVENT_GEN is < priv->event_generation, + * or is 0, it means afr_inode_refresh() needs + * to be called to recalculate the bitmaps. + */ + int -afr_lookup_xattr_req_prepare (afr_local_t *local, xlator_t *this, - dict_t *xattr_req, loc_t *loc, void **gfid_req) +__afr_inode_read_subvol_get_small (inode_t *inode, xlator_t *this, + unsigned char *data, unsigned char *metadata, + int *event_p) { - int ret = -ENOMEM; + afr_private_t *priv = NULL; + int ret = -1; + uint16_t datamap = 0; + uint16_t metadatamap = 0; + uint32_t event = 0; + uint64_t val = 0; + int i = 0; - GF_ASSERT (gfid_req); + priv = this->private; - *gfid_req = NULL; - local->xattr_req = dict_new (); - if (!local->xattr_req) - goto out; - if (xattr_req) - dict_copy (xattr_req, local->xattr_req); + ret = __inode_ctx_get (inode, this, &val); + if (ret < 0) + return ret; - afr_xattr_req_prepare (this, local->xattr_req, loc->path); - ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_INODELK_COUNT, 0); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "%s: Unable to set dict value for %s", - loc->path, GLUSTERFS_INODELK_COUNT); - } - ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_ENTRYLK_COUNT, 0); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "%s: Unable to set dict value for %s", - loc->path, GLUSTERFS_ENTRYLK_COUNT); - } + metadatamap = (val & 0x000000000000ffff); + datamap = (val & 0x00000000ffff0000) >> 16; + event = (val & 0xffffffff00000000) >> 32; - ret = dict_set_uint32 (local->xattr_req, GLUSTERFS_PARENT_ENTRYLK, 0); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "%s: Unable to set dict value for %s", - loc->path, GLUSTERFS_PARENT_ENTRYLK); - } + for (i = 0; i < priv->child_count; i++) { + if (metadata) + metadata[i] = (metadatamap >> i) & 1; + if (data) + data[i] = (datamap >> i) & 1; + } - ret = dict_get_ptr (local->xattr_req, "gfid-req", gfid_req); - if (ret) { - gf_log (this->name, GF_LOG_DEBUG, - "%s: failed to get the gfid from dict", loc->path); - *gfid_req = NULL; - } else { - if (loc->parent != NULL) - dict_del (local->xattr_req, "gfid-req"); - } - ret = 0; -out: - return ret; + if (event_p) + *event_p = event; + return ret; } -void -afr_lookup_save_gfid (uuid_t dst, void* new, const loc_t *loc) -{ - inode_t *inode = NULL; - - inode = loc->inode; - if (inode && !uuid_is_null (inode->gfid)) - uuid_copy (dst, inode->gfid); - else if (!uuid_is_null (loc->gfid)) - uuid_copy (dst, loc->gfid); - else if (new && !uuid_is_null (new)) - uuid_copy (dst, new); -} int -afr_errno_count (int32_t *children, int *child_errno, - unsigned int child_count, int32_t op_errno) -{ - int i = 0; - int errno_count = 0; - int child = 0; +__afr_inode_read_subvol_set_small (inode_t *inode, xlator_t *this, + unsigned char *data, unsigned char *metadata, + int event) +{ + afr_private_t *priv = NULL; + uint16_t datamap = 0; + uint16_t metadatamap = 0; + uint64_t val = 0; + int i = 0; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (data[i]) + datamap |= (1 << i); + if (metadata[i]) + metadatamap |= (1 << i); + } - for (i = 0; i < child_count; i++) { - if (children) { - child = children[i]; - if (child == -1) - break; - } else { - child = i; - } - if (child_errno[child] == op_errno) - errno_count++; - } - return errno_count; -} + val = ((uint64_t) metadatamap) | + (((uint64_t) datamap) << 16) | + (((uint64_t) event) << 32); -int32_t -afr_set_dict_gfid (dict_t *dict, uuid_t gfid) -{ - int ret = 0; - uuid_t *pgfid = NULL; + return __inode_ctx_set (inode, this, &val); +} - GF_ASSERT (gfid); - pgfid = GF_CALLOC (1, sizeof (uuid_t), gf_common_mt_char); - if (!pgfid) { - ret = -1; - goto out; - } +int +__afr_inode_read_subvol_reset_small (inode_t *inode, xlator_t *this) +{ + int ret = -1; + uint16_t datamap = 0; + uint16_t metadatamap = 0; + uint32_t event = 0; + uint64_t val = 0; - uuid_copy (*pgfid, gfid); + ret = __inode_ctx_get (inode, this, &val); + (void) ret; - ret = dict_set_dynptr (dict, "gfid-req", pgfid, sizeof (uuid_t)); - if (ret) - gf_log (THIS->name, GF_LOG_ERROR, "gfid set failed"); + metadatamap = (val & 0x000000000000ffff) >> 0; + datamap = (val & 0x00000000ffff0000) >> 16; + event = 0; -out: - if (ret && pgfid) - GF_FREE (pgfid); + val = ((uint64_t) metadatamap) | + (((uint64_t) datamap) << 16) | + (((uint64_t) event) << 32); - return ret; + return __inode_ctx_set (inode, this, &val); } -void -afr_inode_ctx_destroy (afr_inode_ctx_t *ctx) -{ - if (!ctx) - return; - GF_FREE (ctx->fresh_children); - GF_FREE (ctx); -} -afr_inode_ctx_t* -__afr_inode_ctx_get (inode_t *inode, xlator_t *this) +int +__afr_inode_read_subvol_get (inode_t *inode, xlator_t *this, + unsigned char *data, unsigned char *metadata, + int *event_p) { - int ret = 0; - uint64_t ctx_addr = 0; - afr_inode_ctx_t *ctx = NULL; - afr_private_t *priv = NULL; + afr_private_t *priv = NULL; + int ret = -1; - priv = this->private; - ret = __inode_ctx_get (inode, this, &ctx_addr); - if (ret < 0) - ctx_addr = 0; - if (ctx_addr != 0) { - ctx = (afr_inode_ctx_t*) (long) ctx_addr; - goto out; - } - ctx = GF_CALLOC (1, sizeof (*ctx), - gf_afr_mt_inode_ctx_t); - if (!ctx) - goto fail; - ctx->fresh_children = GF_CALLOC (priv->child_count, - sizeof (*ctx->fresh_children), - gf_afr_mt_int32_t); - if (!ctx->fresh_children) - goto fail; - ret = __inode_ctx_put (inode, this, (uint64_t)ctx); - if (ret) { - gf_log_callingfn (this->name, GF_LOG_ERROR, "failed to " - "set the inode ctx (%s)", - uuid_utoa (inode->gfid)); - goto fail; - } + priv = this->private; -out: - return ctx; + if (priv->child_count <= 16) + ret = __afr_inode_read_subvol_get_small (inode, this, data, + metadata, event_p); + else + /* TBD: allocate structure with array and read from it */ + ret = -1; -fail: - afr_inode_ctx_destroy (ctx); - return NULL; + return ret; } -afr_inode_ctx_t* -afr_inode_ctx_get (inode_t *inode, xlator_t *this) + +int +__afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, unsigned char *data, + unsigned char *metadata, int event) { - afr_inode_ctx_t *ctx = NULL; + afr_private_t *priv = NULL; + int ret = -1; - LOCK (&inode->lock); - { - ctx = __afr_inode_ctx_get (inode, this); - } - UNLOCK (&inode->lock); - return ctx; + priv = this->private; + + if (priv->child_count <= 16) + ret = __afr_inode_read_subvol_set_small (inode, this, data, + metadata, event); + else + ret = -1; + + return ret; } -void -afr_inode_get_ctx_params (xlator_t *this, inode_t *inode, - afr_inode_params_t *params) + +int +__afr_inode_read_subvol_reset (inode_t *inode, xlator_t *this) { - GF_ASSERT (inode); - GF_ASSERT (params); + afr_private_t *priv = NULL; + int ret = -1; - afr_inode_ctx_t *ctx = NULL; - afr_private_t *priv = NULL; - int i = 0; - int32_t read_child = -1; - int32_t *fresh_children = NULL; + priv = this->private; - priv = this->private; - LOCK (&inode->lock); - { - ctx = __afr_inode_ctx_get (inode, this); - if (!ctx) - goto unlock; - switch (params->op) { - case AFR_INODE_GET_READ_CTX: - fresh_children = params->u.read_ctx.children; - read_child = (int32_t)(ctx->masks & - AFR_ICTX_READ_CHILD_MASK); - params->u.read_ctx.read_child = read_child; - if (!fresh_children) - goto unlock; - for (i = 0; i < priv->child_count; i++) - fresh_children[i] = ctx->fresh_children[i]; - break; - case AFR_INODE_GET_OPENDIR_DONE: - params->u.value = _gf_false; - if (ctx->masks & AFR_ICTX_OPENDIR_DONE_MASK) - params->u.value = _gf_true; - break; - default: - GF_ASSERT (0); - break; - } - } -unlock: - UNLOCK (&inode->lock); + if (priv->child_count <= 16) + ret = __afr_inode_read_subvol_reset_small (inode, this); + else + ret = -1; + + return ret; } -gf_boolean_t -afr_is_split_brain (xlator_t *this, inode_t *inode) + +int +afr_inode_read_subvol_get (inode_t *inode, xlator_t *this, unsigned char *data, + unsigned char *metadata, int *event_p) { - afr_inode_ctx_t *ctx = NULL; - gf_boolean_t spb = _gf_false; + int ret = -1; - ctx = afr_inode_ctx_get (inode, this); - if (!ctx) - goto out; - if ((ctx->mdata_spb == SPB) || (ctx->data_spb == SPB)) - spb = _gf_true; -out: - return spb; + LOCK(&inode->lock); + { + ret = __afr_inode_read_subvol_get (inode, this, data, + metadata, event_p); + } + UNLOCK(&inode->lock); + + return ret; } -gf_boolean_t -afr_is_opendir_done (xlator_t *this, inode_t *inode) + +int +afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, unsigned char *data, + unsigned char *metadata, int event) { - afr_inode_params_t params = {0}; + int ret = -1; + + LOCK(&inode->lock); + { + ret = __afr_inode_read_subvol_set (inode, this, data, metadata, + event); + } + UNLOCK(&inode->lock); - params.op = AFR_INODE_GET_OPENDIR_DONE; - afr_inode_get_ctx_params (this, inode, ¶ms); - return params.u.value; + return ret; } -int32_t -afr_inode_get_read_ctx (xlator_t *this, inode_t *inode, int32_t *fresh_children) + +int +afr_inode_read_subvol_reset (inode_t *inode, xlator_t *this) { - afr_inode_params_t params = {0}; + int ret = -1; + + LOCK(&inode->lock); + { + ret = __afr_inode_read_subvol_reset (inode, this); + } + UNLOCK(&inode->lock); - params.op = AFR_INODE_GET_READ_CTX; - params.u.read_ctx.children = fresh_children; - afr_inode_get_ctx_params (this, inode, ¶ms); - return params.u.read_ctx.read_child; + return ret; } -void -afr_inode_ctx_set_read_child (afr_inode_ctx_t *ctx, int32_t read_child) -{ - uint64_t remaining_mask = 0; - uint64_t mask = 0; - remaining_mask = (~AFR_ICTX_READ_CHILD_MASK & ctx->masks); - mask = (AFR_ICTX_READ_CHILD_MASK & read_child); - ctx->masks = remaining_mask | mask; -} +int +afr_accused_fill (xlator_t *this, dict_t *xdata, unsigned char *accused, + afr_transaction_type type) +{ + afr_private_t *priv = NULL; + int i = 0; + int idx = afr_index_for_transaction_type (type); + void *pending_raw = NULL; + int pending[3]; + int ret = 0; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + ret = dict_get_ptr (xdata, priv->pending_key[i], + &pending_raw); + if (ret) /* no pending flags */ + continue; + memcpy (pending, pending_raw, sizeof(pending)); + + if (ntoh32 (pending[idx])) + accused[i] = 1; + } -void -afr_inode_ctx_set_read_ctx (afr_inode_ctx_t *ctx, int32_t read_child, - int32_t *fresh_children, int32_t child_count) -{ - int i = 0; - - afr_inode_ctx_set_read_child (ctx, read_child); - for (i = 0; i < child_count; i++) { - if (fresh_children) - ctx->fresh_children[i] = fresh_children[i]; - else - ctx->fresh_children[i] = -1; - } + return 0; } -void -afr_inode_ctx_rm_stale_children (afr_inode_ctx_t *ctx, int32_t *stale_children, - int32_t child_count) + +int +afr_accuse_smallfiles (xlator_t *this, struct afr_reply *replies, + unsigned char *data_accused) { - int i = 0; - int32_t read_child = -1; + int i = 0; + afr_private_t *priv = NULL; + uint64_t maxsize = 0; - GF_ASSERT (stale_children); - for (i = 0; i < child_count; i++) { - if (stale_children[i] == -1) - break; - afr_children_rm_child (ctx->fresh_children, - stale_children[i], child_count); - } - read_child = (int32_t)(ctx->masks & AFR_ICTX_READ_CHILD_MASK); - if (!afr_is_child_present (ctx->fresh_children, child_count, - read_child)) - afr_inode_ctx_set_read_child (ctx, ctx->fresh_children[0]); -} + priv = this->private; -void -afr_inode_ctx_set_opendir_done (afr_inode_ctx_t *ctx) -{ - uint64_t remaining_mask = 0; - uint64_t mask = 0; + for (i = 0; i < priv->child_count; i++) { + if (data_accused[i]) + continue; + if (replies[i].poststat.ia_size > maxsize) + maxsize = replies[i].poststat.ia_size; + } - remaining_mask = (~AFR_ICTX_OPENDIR_DONE_MASK & ctx->masks); - mask = (0xFFFFFFFFFFFFFFFFULL & AFR_ICTX_OPENDIR_DONE_MASK); - ctx->masks = remaining_mask | mask; + for (i = 0; i < priv->child_count; i++) { + if (data_accused[i]) + continue; + if (replies[i].poststat.ia_size < maxsize) + data_accused[i] = 1; + } + + return 0; } -void -afr_inode_set_ctx_params (xlator_t *this, inode_t *inode, - afr_inode_params_t *params) -{ - GF_ASSERT (inode); - GF_ASSERT (params); - afr_inode_ctx_t *ctx = NULL; - afr_private_t *priv = NULL; - int32_t read_child = -1; - int32_t *fresh_children = NULL; - int32_t *stale_children = NULL; +int +afr_replies_interpret (call_frame_t *frame, xlator_t *this, inode_t *inode) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + struct afr_reply *replies = NULL; + int event_generation = 0; + int i = 0; + unsigned char *data_accused = NULL; + unsigned char *metadata_accused = NULL; + unsigned char *data_readable = NULL; + unsigned char *metadata_readable = NULL; + int ret = 0; - priv = this->private; - LOCK (&inode->lock); - { - ctx = __afr_inode_ctx_get (inode, this); - if (!ctx) - goto unlock; - switch (params->op) { - case AFR_INODE_SET_READ_CTX: - read_child = params->u.read_ctx.read_child; - fresh_children = params->u.read_ctx.children; - afr_inode_ctx_set_read_ctx (ctx, read_child, - fresh_children, - priv->child_count); - break; - case AFR_INODE_RM_STALE_CHILDREN: - stale_children = params->u.read_ctx.children; - afr_inode_ctx_rm_stale_children (ctx, - stale_children, - priv->child_count); - break; - case AFR_INODE_SET_OPENDIR_DONE: - afr_inode_ctx_set_opendir_done (ctx); - break; - default: - GF_ASSERT (0); - break; - } - } -unlock: - UNLOCK (&inode->lock); -} + local = frame->local; + priv = this->private; + replies = local->replies; + event_generation = local->event_generation; + + data_accused = alloca0 (priv->child_count); + data_readable = alloca0 (priv->child_count); + metadata_accused = alloca0 (priv->child_count); + metadata_readable = alloca0 (priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + data_readable[i] = 1; + metadata_readable[i] = 1; + } -void -afr_set_split_brain (xlator_t *this, inode_t *inode, afr_spb_state_t mdata_spb, - afr_spb_state_t data_spb) -{ - afr_inode_ctx_t *ctx = NULL; + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) { + data_readable[i] = 0; + metadata_readable[i] = 0; + continue; + } + + if (replies[i].op_ret == -1) { + data_readable[i] = 0; + metadata_readable[i] = 0; + continue; + } + + afr_accused_fill (this, replies[i].xdata, data_accused, + (inode->ia_type == IA_IFDIR) ? + AFR_ENTRY_TRANSACTION : AFR_DATA_TRANSACTION); + + afr_accused_fill (this, replies[i].xdata, + metadata_accused, AFR_METADATA_TRANSACTION); + + } - ctx = afr_inode_ctx_get (inode, this); - if (mdata_spb != DONT_KNOW) - ctx->mdata_spb = mdata_spb; - if (data_spb != DONT_KNOW) - ctx->data_spb = data_spb; + if (inode->ia_type != IA_IFDIR) + afr_accuse_smallfiles (this, replies, data_accused); + + for (i = 0; i < priv->child_count; i++) { + if (data_accused[i]) { + data_readable[i] = 0; + ret = 1; + } + if (metadata_accused[i]) { + metadata_readable[i] = 0; + ret = 1; + } + } + + afr_inode_read_subvol_set (inode, this, data_readable, + metadata_readable, event_generation); + return ret; } -void -afr_set_opendir_done (xlator_t *this, inode_t *inode) -{ - afr_inode_params_t params = {0}; - params.op = AFR_INODE_SET_OPENDIR_DONE; - afr_inode_set_ctx_params (this, inode, ¶ms); + +int +afr_refresh_selfheal_done (int ret, call_frame_t *heal, void *opaque) +{ + if (heal) + STACK_DESTROY (heal->root); + return 0; } -void -afr_inode_set_read_ctx (xlator_t *this, inode_t *inode, int32_t read_child, - int32_t *fresh_children) +int +afr_inode_refresh_err (call_frame_t *frame, xlator_t *this) { - afr_inode_params_t params = {0}; - afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + int err = 0; - priv = this->private; - GF_ASSERT (read_child >= 0); - GF_ASSERT (fresh_children); - GF_ASSERT (afr_is_child_present (fresh_children, priv->child_count, - read_child)); - - params.op = AFR_INODE_SET_READ_CTX; - params.u.read_ctx.read_child = read_child; - params.u.read_ctx.children = fresh_children; - afr_inode_set_ctx_params (this, inode, ¶ms); + local = frame->local; + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].valid && !local->replies[i].op_ret) { + err = 0; + goto ret; + } + } + + err = afr_final_errno (local, priv); +ret: + return -err; } -void -afr_inode_rm_stale_children (xlator_t *this, inode_t *inode, - int32_t *stale_children) + +int +afr_refresh_selfheal_wrap (void *opaque) { - afr_inode_params_t params = {0}; + call_frame_t *frame = opaque; + afr_local_t *local = NULL; + xlator_t *this = NULL; + int err = 0; + + local = frame->local; + this = frame->this; - GF_ASSERT (stale_children); + afr_selfheal (frame->this, local->refreshinode->gfid); - params.op = AFR_INODE_RM_STALE_CHILDREN; - params.u.read_ctx.children = stale_children; - afr_inode_set_ctx_params (this, inode, ¶ms); + afr_selfheal_unlocked_discover (frame, local->refreshinode, + local->refreshinode->gfid, + local->replies); + + afr_replies_interpret (frame, this, local->refreshinode); + + err = afr_inode_refresh_err (frame, this); + + afr_replies_wipe (local, this->private); + + local->refreshfn (frame, this, err); + + return 0; } + gf_boolean_t -afr_is_source_child (int32_t *sources, int32_t child_count, int32_t child) +afr_selfheal_enabled (xlator_t *this) { - gf_boolean_t source_xattrs = _gf_false; + afr_private_t *priv = NULL; + gf_boolean_t data = _gf_false; - GF_ASSERT (child < child_count); + priv = this->private; - if ((child >= 0) && (child < child_count) && - sources[child]) { - source_xattrs = _gf_true; - } - return source_xattrs; + gf_string2boolean (priv->data_self_heal, &data); + + return data || priv->metadata_self_heal || priv->entry_self_heal; } -gf_boolean_t -afr_is_child_present (int32_t *success_children, int32_t child_count, - int32_t child) + + +int +afr_inode_refresh_done (call_frame_t *frame, xlator_t *this) { - gf_boolean_t success_child = _gf_false; - int i = 0; + call_frame_t *heal = NULL; + afr_local_t *local = NULL; + int ret = 0; + int err = 0; - GF_ASSERT (child < child_count); + local = frame->local; - for (i = 0; i < child_count; i++) { - if (success_children[i] == -1) - break; - if (child == success_children[i]) { - success_child = _gf_true; - break; - } - } - return success_child; + ret = afr_replies_interpret (frame, this, local->refreshinode); + + err = afr_inode_refresh_err (frame, this); + + afr_replies_wipe (local, this->private); + + if (ret && afr_selfheal_enabled (this)) { + heal = copy_frame (frame); + if (heal) + heal->root->pid = -1; + ret = synctask_new (this->ctx->env, afr_refresh_selfheal_wrap, + afr_refresh_selfheal_done, heal, frame); + if (ret) + goto refresh_done; + } else { + refresh_done: + local->refreshfn (frame, this, err); + } + + return 0; } -gf_boolean_t -afr_is_read_child (int32_t *success_children, int32_t *sources, - int32_t child_count, int32_t child) + +int +afr_inode_refresh_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, struct iatt *par) { - gf_boolean_t success_child = _gf_false; - gf_boolean_t source = _gf_false; + afr_local_t *local = NULL; + int call_child = (long) cookie; + int call_count = 0; - if (child < 0) { - return _gf_false; - } + local = frame->local; - GF_ASSERT (success_children); - GF_ASSERT (child_count > 0); + local->replies[call_child].valid = 1; + local->replies[call_child].op_ret = op_ret; + local->replies[call_child].op_errno = op_errno; + if (op_ret != -1) { + local->replies[call_child].poststat = *buf; + local->replies[call_child].postparent = *par; + local->replies[call_child].xdata = dict_ref (xdata); + } - success_child = afr_is_child_present (success_children, child_count, - child); - if (!success_child) - goto out; - if (NULL == sources) { - source = _gf_true; - goto out; - } - source = afr_is_source_child (sources, child_count, child); -out: - return (success_child && source); + call_count = afr_frame_return (frame); + + if (call_count == 0) + afr_inode_refresh_done (frame, this); + + return 0; } -int32_t -afr_hash_child (int32_t *success_children, int32_t child_count, - unsigned int hmode, uuid_t gfid) + +int +afr_inode_refresh_subvol (call_frame_t *frame, xlator_t *this, int i, + inode_t *inode, dict_t *xdata) { - uuid_t gfid_copy = {0,}; - pid_t pid; + loc_t loc = {0, }; + afr_private_t *priv = NULL; - if (!hmode) { - return -1; - } + priv = this->private; - if (gfid) { - uuid_copy(gfid_copy,gfid); - } - if (hmode > 1) { - /* - * Why getpid? Because it's one of the cheapest calls - * available - faster than gethostname etc. - and returns a - * constant-length value that's sure to be shorter than a UUID. - * It's still very unlikely to be the same across clients, so - * it still provides good mixing. We're not trying for - * perfection here. All we need is a low probability that - * multiple clients won't converge on the same subvolume. - */ - pid = getpid(); - memcpy (gfid_copy, &pid, sizeof(pid)); - } + loc.inode = inode; + uuid_copy (loc.gfid, inode->gfid); - return SuperFastHash((char *)gfid_copy, - sizeof(gfid_copy)) % child_count; + STACK_WIND_COOKIE (frame, afr_inode_refresh_subvol_cbk, + (void *) (long) i, priv->children[i], + priv->children[i]->fops->lookup, &loc, xdata); + return 0; } -/* If sources is NULL the xattrs are assumed to be of source for all - * success_children. - */ + int -afr_select_read_child_from_policy (int32_t *success_children, - int32_t child_count, int32_t prev_read_child, - int32_t config_read_child, int32_t *sources, - unsigned int hmode, uuid_t gfid) +afr_inode_refresh_do (call_frame_t *frame, xlator_t *this) { - int32_t read_child = -1; - int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int i = 0; + dict_t *xdata = NULL; - GF_ASSERT (success_children); + priv = this->private; + local = frame->local; - read_child = config_read_child; - if (afr_is_read_child (success_children, sources, child_count, - read_child)) - goto out; + afr_replies_wipe (local, priv); - read_child = prev_read_child; - if (afr_is_read_child (success_children, sources, child_count, - read_child)) - goto out; + xdata = dict_new (); + if (!xdata) { + afr_inode_refresh_done (frame, this); + return 0; + } - read_child = afr_hash_child (success_children, child_count, - hmode, gfid); - if (afr_is_read_child (success_children, sources, child_count, - read_child)) { - goto out; - } + if (afr_xattr_req_prepare (this, xdata) != 0) { + dict_unref (xdata); + afr_inode_refresh_done (frame, this); + return 0; + } - for (i = 0; i < child_count; i++) { - read_child = success_children[i]; - if (read_child < 0) - break; - if (afr_is_read_child (success_children, sources, child_count, - read_child)) - goto out; - } - read_child = -1; + local->call_count = AFR_COUNT (local->child_up, priv->child_count); -out: - return read_child; + call_count = local->call_count; + for (i = 0; i < priv->child_count; i++) { + if (!local->child_up[i]) + continue; + + afr_inode_refresh_subvol (frame, this, i, local->refreshinode, + xdata); + + if (!--call_count) + break; + } + + dict_unref (xdata); + + return 0; } -/* This function should be used when all the success_children are sources - */ -void -afr_set_read_ctx_from_policy (xlator_t *this, inode_t *inode, - int32_t *fresh_children, int32_t prev_read_child, - int32_t config_read_child, uuid_t gfid) + +int +afr_inode_refresh (call_frame_t *frame, xlator_t *this, inode_t *inode, + afr_inode_refresh_cbk_t refreshfn) { - int read_child = -1; - afr_private_t *priv = NULL; + afr_local_t *local = NULL; - priv = this->private; - read_child = afr_select_read_child_from_policy (fresh_children, - priv->child_count, - prev_read_child, - config_read_child, - NULL, - priv->hash_mode, gfid); - if (read_child >= 0) - afr_inode_set_read_ctx (this, inode, read_child, - fresh_children); + local = frame->local; + + local->refreshfn = refreshfn; + + if (local->refreshinode) { + inode_unref (local->refreshinode); + local->refreshinode = NULL; + } + + local->refreshinode = inode_ref (inode); + + afr_inode_refresh_do (frame, this); + + return 0; } -/* afr_next_call_child () - * This is a common function used by all the read-type fops - * This function should not be called with the inode's read_children array. - * The fop's handler should make a copy of the inode's read_children, - * preferred read_child into the local vars, because while this function is - * in execution there is a chance for inode's read_ctx to change. - */ -int32_t -afr_next_call_child (int32_t *fresh_children, unsigned char *child_up, - size_t child_count, int32_t *last_index, - int32_t read_child) + +int +afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req) { - int next_index = 0; - int32_t next_call_child = -1; + int i = 0; + afr_private_t *priv = NULL; + int ret = 0; - GF_ASSERT (last_index); + priv = this->private; - next_index = *last_index; -retry: - next_index++; - if ((next_index >= child_count) || - (fresh_children[next_index] == -1)) - goto out; - if ((fresh_children[next_index] == read_child) || - (!child_up[fresh_children[next_index]])) - goto retry; - *last_index = next_index; - next_call_child = fresh_children[next_index]; -out: - return next_call_child; + for (i = 0; i < priv->child_count; i++) { + ret = dict_set_uint64 (xattr_req, priv->pending_key[i], + AFR_NUM_CHANGE_LOGS * sizeof(int)); + if (ret < 0) + gf_log (this->name, GF_LOG_WARNING, + "Unable to set dict value for %s", + priv->pending_key[i]); + /* 3 = data+metadata+entry */ + } + ret = dict_set_uint64 (xattr_req, AFR_DIRTY, + AFR_NUM_CHANGE_LOGS * sizeof(int)); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, "failed to set dirty " + "query flag"); + } + + return ret; } - /* This function should not be called with the inode's read_children array. - * The fop's handler should make a copy of the inode's read_children, - * preferred read_child into the local vars, because while this function is - * in execution there is a chance for inode's read_ctx to change. - */ -int32_t -afr_get_call_child (xlator_t *this, unsigned char *child_up, int32_t read_child, - int32_t *fresh_children, - int32_t *call_child, int32_t *last_index) +int +afr_lookup_xattr_req_prepare (afr_local_t *local, xlator_t *this, + dict_t *xattr_req, loc_t *loc) { - int ret = 0; - afr_private_t *priv = NULL; - int i = 0; - - GF_ASSERT (child_up); - GF_ASSERT (call_child); - GF_ASSERT (last_index); - GF_ASSERT (fresh_children); + int ret = -ENOMEM; - if (read_child < 0) { - ret = -EIO; + local->xattr_req = dict_new (); + if (!local->xattr_req) goto out; - } - priv = this->private; - *call_child = -1; - *last_index = -1; + if (xattr_req) + dict_copy (xattr_req, local->xattr_req); - if (child_up[read_child]) { - *call_child = read_child; - } else { - for (i = 0; i < priv->child_count; i++) { - if (fresh_children[i] == -1) - break; - if (child_up[fresh_children[i]]) { - *call_child = fresh_children[i]; - ret = 0; - break; - } - } + ret = afr_xattr_req_prepare (this, local->xattr_req); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "%s: Unable to prepare xattr_req", loc->path); + } - if (*call_child == -1) { - ret = -ENOTCONN; - goto out; - } + ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_INODELK_COUNT, 0); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "%s: Unable to set dict value for %s", + loc->path, GLUSTERFS_INODELK_COUNT); + } + ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_ENTRYLK_COUNT, 0); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "%s: Unable to set dict value for %s", + loc->path, GLUSTERFS_ENTRYLK_COUNT); + } - *last_index = i; + ret = dict_set_uint32 (local->xattr_req, GLUSTERFS_PARENT_ENTRYLK, 0); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "%s: Unable to set dict value for %s", + loc->path, GLUSTERFS_PARENT_ENTRYLK); } + + ret = 0; out: - gf_log (this->name, GF_LOG_DEBUG, "Returning %d, call_child: %d, " - "last_index: %d", ret, *call_child, *last_index); return ret; } -void -afr_reset_xattr (dict_t **xattr, unsigned int child_count) + +int +afr_hash_child (inode_t *inode, int32_t child_count, int hashmode) { - unsigned int i = 0; + uuid_t gfid_copy = {0,}; + pid_t pid; - if (!xattr) - goto out; - for (i = 0; i < child_count; i++) { - if (xattr[i]) { - dict_unref (xattr[i]); - xattr[i] = NULL; - } + if (!hashmode) { + return -1; } -out: - return; -} -void -afr_xattr_array_destroy (dict_t **xattr, unsigned int child_count) -{ - afr_reset_xattr (xattr, child_count); - GF_FREE (xattr); -} + if (inode) { + uuid_copy (gfid_copy, inode->gfid); + } -void -afr_local_sh_cleanup (afr_local_t *local, xlator_t *this) -{ - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; + if (hashmode > 1) { + /* + * Why getpid? Because it's one of the cheapest calls + * available - faster than gethostname etc. - and returns a + * constant-length value that's sure to be shorter than a UUID. + * It's still very unlikely to be the same across clients, so + * it still provides good mixing. We're not trying for + * perfection here. All we need is a low probability that + * multiple clients won't converge on the same subvolume. + */ + pid = getpid(); + memcpy (gfid_copy, &pid, sizeof(pid)); + } - sh = &local->self_heal; - priv = this->private; + return SuperFastHash((char *)gfid_copy, + sizeof(gfid_copy)) % child_count; +} - if (sh->data_sh_info && strcmp (sh->data_sh_info, "")) - GF_FREE (sh->data_sh_info); - if (sh->metadata_sh_info && strcmp (sh->metadata_sh_info, "")) - GF_FREE (sh->metadata_sh_info); +int +afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this, + unsigned char *readable) +{ + afr_private_t *priv = NULL; + int read_subvol = -1; + int i = 0; - GF_FREE (sh->buf); + priv = this->private; - GF_FREE (sh->parentbufs); + /* first preference - explicitly specified or local subvolume */ + if (priv->read_child >= 0 && readable[priv->read_child]) + return priv->read_child; - if (sh->inode) - inode_unref (sh->inode); + /* second preference - use hashed mode */ + read_subvol = afr_hash_child (inode, priv->child_count, + priv->hash_mode); + if (read_subvol >= 0 && readable[read_subvol]) + return read_subvol; - afr_xattr_array_destroy (sh->xattr, priv->child_count); + for (i = 0; i < priv->child_count; i++) { + if (readable[i]) + return i; + } - GF_FREE (sh->child_errno); + /* no readable subvolumes, either split brain or all subvols down */ - afr_matrix_cleanup (sh->pending_matrix, priv->child_count); - afr_matrix_cleanup (sh->delta_matrix, priv->child_count); + return -1; +} - GF_FREE (sh->sources); - GF_FREE (sh->success); +int +afr_inode_read_subvol_type_get (inode_t *inode, xlator_t *this, + unsigned char *readable, int *event_p, + int type) +{ + int ret = -1; - GF_FREE (sh->locked_nodes); + if (type == AFR_METADATA_TRANSACTION) + ret = afr_inode_read_subvol_get (inode, this, 0, readable, + event_p); + else + ret = afr_inode_read_subvol_get (inode, this, readable, 0, + event_p); + return ret; +} - if (sh->healing_fd) { - fd_unref (sh->healing_fd); - sh->healing_fd = NULL; - } - GF_FREE ((char *)sh->linkname); +int +afr_read_subvol_get (inode_t *inode, xlator_t *this, int *subvol_p, + int *event_p, afr_transaction_type type) +{ + afr_private_t *priv = NULL; + unsigned char *data_readable = NULL; + unsigned char *metadata_readable = NULL; + unsigned char *readable = NULL; + unsigned char *intersection = NULL; + int subvol = -1; + int event = 0; - GF_FREE (sh->success_children); + priv = this->private; - GF_FREE (sh->fresh_children); + readable = alloca0 (priv->child_count); + data_readable = alloca0 (priv->child_count); + metadata_readable = alloca0 (priv->child_count); + intersection = alloca0 (priv->child_count); - GF_FREE (sh->fresh_parent_dirs); + afr_inode_read_subvol_type_get (inode, this, readable, &event, type); - loc_wipe (&sh->parent_loc); - loc_wipe (&sh->lookup_loc); + afr_inode_read_subvol_get (inode, this, data_readable, metadata_readable, + &event); - GF_FREE (sh->checksum); + AFR_INTERSECT (intersection, data_readable, metadata_readable, + priv->child_count); - GF_FREE (sh->write_needed); - if (sh->healing_fd) - fd_unref (sh->healing_fd); + if (AFR_COUNT (intersection, priv->child_count) > 0) + subvol = afr_read_subvol_select_by_policy (inode, this, + intersection); + else + subvol = afr_read_subvol_select_by_policy (inode, this, + readable); + if (subvol_p) + *subvol_p = subvol; + if (event_p) + *event_p = event; + return subvol; } @@ -838,8 +848,6 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this) priv = this->private; afr_matrix_cleanup (local->pending, priv->child_count); - afr_matrix_cleanup (local->transaction.txn_changelog, - priv->child_count); GF_FREE (local->internal_lock.locked_nodes); @@ -860,7 +868,25 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this) loc_wipe (&local->transaction.parent_loc); loc_wipe (&local->transaction.new_parent_loc); - GF_FREE (local->transaction.postop_piggybacked); +} + + +void +afr_replies_wipe (afr_local_t *local, afr_private_t *priv) +{ + int i; + + if (!local->replies) + return; + + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].xdata) { + dict_unref (local->replies[i].xdata); + local->replies[i].xdata = NULL; + } + } + + memset (local->replies, 0, sizeof(*local->replies) * priv->child_count); } @@ -872,7 +898,7 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this) if (!local) return; - afr_local_sh_cleanup (local, this); + syncbarrier_destroy (&local->barrier); afr_local_transaction_cleanup (local, this); @@ -890,40 +916,26 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this) if (local->dict) dict_unref (local->dict); + afr_replies_wipe (local, priv); GF_FREE(local->replies); GF_FREE (local->child_up); - GF_FREE (local->child_errno); + GF_FREE (local->read_attempted); - GF_FREE (local->fresh_children); + GF_FREE (local->readable); - { /* lookup */ - if (local->cont.lookup.xattrs) { - afr_reset_xattr (local->cont.lookup.xattrs, - priv->child_count); - GF_FREE (local->cont.lookup.xattrs); - local->cont.lookup.xattrs = NULL; - } + if (local->inode) + inode_unref (local->inode); - if (local->cont.lookup.xattr) { - dict_unref (local->cont.lookup.xattr); - } - - if (local->cont.lookup.inode) { - inode_unref (local->cont.lookup.inode); - } + if (local->parent) + inode_unref (local->parent); - GF_FREE (local->cont.lookup.postparents); + if (local->parent2) + inode_unref (local->parent2); - GF_FREE (local->cont.lookup.bufs); - - GF_FREE (local->cont.lookup.success_children); - - GF_FREE (local->cont.lookup.sources); - afr_matrix_cleanup (local->cont.lookup.pending_matrix, - priv->child_count); - } + if (local->refreshinode) + inode_unref (local->refreshinode); { /* getxattr */ GF_FREE (local->cont.getxattr.name); @@ -1018,67 +1030,29 @@ afr_frame_return (call_frame_t *frame) return call_count; } -int -afr_set_elem_count_get (unsigned char *elems, int child_count) -{ - int i = 0; - int ret = 0; - - for (i = 0; i < child_count; i++) - if (elems[i]) - ret++; - return ret; -} - -/** - * up_children_count - return the number of children that are up - */ - -unsigned int -afr_up_children_count (unsigned char *child_up, unsigned int child_count) -{ - return afr_set_elem_count_get (child_up, child_count); -} - -unsigned int -afr_locked_children_count (unsigned char *children, unsigned int child_count) -{ - return afr_set_elem_count_get (children, child_count); -} - -unsigned int -afr_pre_op_done_children_count (unsigned char *pre_op, - unsigned int child_count) -{ - return afr_set_elem_count_get (pre_op, child_count); -} gf_boolean_t -afr_is_fresh_lookup (loc_t *loc, xlator_t *this) -{ - uint64_t ctx = 0; - int32_t ret = 0; - - GF_ASSERT (loc); - GF_ASSERT (this); - GF_ASSERT (loc->inode); +afr_is_entry_possibly_under_txn (afr_local_t *local, xlator_t *this) +{ + int i = 0; + int tmp = 0; + afr_private_t *priv = NULL; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].xdata) + continue; + if (dict_get_int32 (local->replies[i].xdata, + GLUSTERFS_PARENT_ENTRYLK, + &tmp) == 0) + if (tmp) + return _gf_true; + } - ret = inode_ctx_get (loc->inode, this, &ctx); - if (0 == ret) - return _gf_false; - return _gf_true; + return _gf_false; } -void -afr_update_loc_gfids (loc_t *loc, struct iatt *buf, struct iatt *postparent) -{ - GF_ASSERT (loc); - GF_ASSERT (buf); - - uuid_copy (loc->gfid, buf->ia_gfid); - if (postparent) - uuid_copy (loc->pargfid, postparent->ia_gfid); -} /* * Quota size xattrs are not maintained by afr. There is a @@ -1090,1467 +1064,845 @@ afr_update_loc_gfids (loc_t *loc, struct iatt *buf, struct iatt *postparent) * */ static void -afr_handle_quota_size (afr_local_t *local, xlator_t *this, - dict_t *rsp_dict) +afr_handle_quota_size (call_frame_t *frame, xlator_t *this) { - int32_t *sources = NULL; - dict_t *xattr = NULL; - data_t *max_data = NULL; - int64_t max_quota_size = -1; - data_t *data = NULL; - int64_t *size = NULL; - int64_t quota_size = -1; - afr_private_t *priv = NULL; - int i = 0; - int ret = -1; - gf_boolean_t source_present = _gf_false; - - priv = this->private; - sources = local->cont.lookup.sources; - - if (rsp_dict == NULL) { - gf_log_callingfn (this->name, GF_LOG_ERROR, "%s: Invalid " - "response dictionary", local->loc.path); - return; - } - - for (i = 0; i < priv->child_count; i++) { - if (sources[i]) { - source_present = _gf_true; - break; - } - } - - for (i = 0; i < priv->child_count; i++) { - /* - * If there is at least one source lets check - * for maximum quota sizes among sources, otherwise take the - * maximum of the ones present to be on the safer side. - */ - if (source_present && !sources[i]) - continue; - - xattr = local->cont.lookup.xattrs[i]; - if (!xattr) - continue; - - data = dict_get (xattr, QUOTA_SIZE_KEY); - if (!data) - continue; - - size = (int64_t*)data->data; - quota_size = ntoh64(*size); - gf_log (this->name, GF_LOG_DEBUG, "%s: %d, size: %"PRId64, - local->loc.path, i, quota_size); - if (quota_size > max_quota_size) { - if (max_data) - data_unref (max_data); - - max_quota_size = quota_size; - max_data = data_ref (data); - } - } + unsigned char *readable = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + struct afr_reply *replies = NULL; + int i = 0; + uint64_t size = 0; + uint64_t max_size = 0; + int readable_cnt = 0; - if (max_data) { - ret = dict_set (rsp_dict, QUOTA_SIZE_KEY, max_data); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "%s: Failed to set " - "quota size", local->loc.path); - } + local = frame->local; + priv = this->private; + replies = local->replies; + + readable = alloca0 (priv->child_count); + + afr_inode_read_subvol_get (local->inode, this, readable, 0, 0); + + readable_cnt = AFR_COUNT (readable, priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret == -1) + continue; + if (readable_cnt && !readable[i]) + continue; + if (!replies[i].xdata) + continue; + if (dict_get_uint64 (replies[i].xdata, QUOTA_SIZE_KEY, &size)) + continue; + if (size > max_size) + max_size = size; + } - data_unref (max_data); - } + if (!max_size) + return; + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret == -1) + continue; + if (readable_cnt && !readable[i]) + continue; + if (!replies[i].xdata) + continue; + if (dict_set_uint64 (replies[i].xdata, QUOTA_SIZE_KEY, max_size)) + continue; + } } -int -afr_lookup_build_response_params (afr_local_t *local, xlator_t *this) -{ - struct iatt *buf = NULL; - struct iatt *postparent = NULL; - dict_t **xattr = NULL; - int32_t *success_children = NULL; - int32_t *sources = NULL; - afr_private_t *priv = NULL; - int32_t read_child = -1; - int ret = 0; - int i = 0; - - GF_ASSERT (local); - - buf = &local->cont.lookup.buf; - postparent = &local->cont.lookup.postparent; - xattr = &local->cont.lookup.xattr; - priv = this->private; - - read_child = afr_inode_get_read_ctx (this, local->cont.lookup.inode, - local->fresh_children); - if (read_child < 0) { - ret = -1; - goto out; - } - success_children = local->cont.lookup.success_children; - sources = local->cont.lookup.sources; - memset (sources, 0, sizeof (*sources) * priv->child_count); - afr_children_intersection_get (local->fresh_children, success_children, - sources, priv->child_count); - if (!sources[read_child]) { - read_child = -1; - for (i = 0; i < priv->child_count; i++) { - if (sources[i]) { - read_child = i; - break; - } - } - } - if (read_child < 0) { - ret = -1; - goto out; - } - - gf_log (this->name, GF_LOG_DEBUG, "Building lookup response from %d", - read_child); - if (!*xattr) - *xattr = dict_ref (local->cont.lookup.xattrs[read_child]); - - *buf = local->cont.lookup.bufs[read_child]; - *postparent = local->cont.lookup.postparents[read_child]; - - if (dict_get (local->xattr_req, QUOTA_SIZE_KEY)) - afr_handle_quota_size (local, this, *xattr); - - if (IA_INVAL == local->cont.lookup.inode->ia_type) { - /* fix for RT #602 */ - local->cont.lookup.inode->ia_type = buf->ia_type; - } -out: - return ret; -} static void -afr_lookup_update_lk_counts (afr_local_t *local, xlator_t *this, - int child_index, dict_t *xattr) +afr_lookup_done (call_frame_t *frame, xlator_t *this) { - uint32_t inodelk_count = 0; - uint32_t entrylk_count = 0; - int ret = -1; - uint32_t parent_entrylk = 0; - - GF_ASSERT (local); - GF_ASSERT (this); - GF_ASSERT (xattr); - GF_ASSERT (child_index >= 0); + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = -1; + int op_errno = 0; + int read_subvol = 0; + unsigned char *readable = NULL; + int event = 0; + struct afr_reply *replies = NULL; + uuid_t read_gfid = {0, }; + gf_boolean_t locked_entry = _gf_false; + gf_boolean_t can_interpret = _gf_true; - ret = dict_get_uint32 (xattr, GLUSTERFS_INODELK_COUNT, - &inodelk_count); - if (ret == 0) - local->inodelk_count += inodelk_count; + priv = this->private; + local = frame->local; + replies = local->replies; - ret = dict_get_uint32 (xattr, GLUSTERFS_ENTRYLK_COUNT, - &entrylk_count); - if (ret == 0) - local->entrylk_count += entrylk_count; - ret = dict_get_uint32 (xattr, GLUSTERFS_PARENT_ENTRYLK, - &parent_entrylk); - if (!ret) - local->cont.lookup.parent_entrylk += parent_entrylk; -} + locked_entry = afr_is_entry_possibly_under_txn (local, this); -/* - * It's important to maintain a commutative property on do_*_self_heal and - * found*; once set, they must not be cleared by a subsequent iteration or - * call, so that they represent a logical OR of all iterations and calls - * regardless of child/key order. That allows the caller to call us multiple - * times without having to use a separate variable as a "reduce" accumulator. - */ -static void -afr_lookup_set_self_heal_params_by_xattr (afr_local_t *local, xlator_t *this, - dict_t *xattr) -{ - afr_private_t *priv = NULL; - int i = 0; - int ret = -1; - void *pending_raw = NULL; - int32_t *pending = NULL; + readable = alloca0 (priv->child_count); - GF_ASSERT (local); - GF_ASSERT (this); - GF_ASSERT (xattr); + afr_inode_read_subvol_get (local->loc.parent, this, readable, + NULL, &event); - priv = this->private; - - for (i = 0; i < priv->child_count; i++) { - ret = dict_get_ptr (xattr, priv->pending_key[i], - &pending_raw); - if (ret != 0) { - continue; - } - pending = pending_raw; + /* First, check if we have an ESTALE from somewhere, + If so, propagate that so that a revalidate can be + issued + */ + op_errno = afr_final_errno (frame->local, this->private); + local->op_errno = op_errno; + if (op_errno == ESTALE) { + local->op_errno = op_errno; + local->op_ret = -1; + goto unwind; + } - if (pending[AFR_METADATA_TRANSACTION]) { - gf_log(this->name, GF_LOG_DEBUG, - "metadata self-heal is pending for %s.", - local->loc.path); - local->self_heal.do_metadata_self_heal = _gf_true; - } + read_subvol = -1; + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + + if (locked_entry && replies[i].op_ret == -1 && + replies[i].op_errno == ENOENT) { + /* Second, check entry is still + "underway" in creation */ + local->op_ret = -1; + local->op_errno = ENOENT; + read_subvol = i; + goto unwind; + } - if (pending[AFR_ENTRY_TRANSACTION]) { - gf_log(this->name, GF_LOG_DEBUG, - "entry self-heal is pending for %s.", - local->loc.path); - local->self_heal.do_entry_self_heal = _gf_true; - } + if (replies[i].op_ret == -1) + continue; - if (pending[AFR_DATA_TRANSACTION]) { - gf_log(this->name, GF_LOG_DEBUG, - "data self-heal is pending for %s.", - local->loc.path); - local->self_heal.do_data_self_heal = _gf_true; - } - } -} + if (read_subvol == -1 || !readable[read_subvol]) { + read_subvol = i; + uuid_copy (read_gfid, replies[i].poststat.ia_gfid); + local->op_ret = 0; + } + } -void -afr_lookup_check_set_metadata_split_brain (afr_local_t *local, xlator_t *this) -{ - int32_t *sources = NULL; - afr_private_t *priv = NULL; - int32_t subvol_status = 0; - int32_t *success_children = NULL; - dict_t **xattrs = NULL; - struct iatt *bufs = NULL; - int32_t **pending_matrix = NULL; + if (read_subvol == -1) + goto unwind; + /* We now have a read_subvol, which is readable[] (if there + were any). Next we look for GFID mismatches. We don't + consider a GFID mismatch as an error if read_subvol is + readable[] but the mismatching GFID subvol is not. + */ + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret == -1) { + if (priv->child_up[i]) + can_interpret = _gf_false; + continue; + } - priv = this->private; + if (!uuid_compare (replies[i].poststat.ia_gfid, + read_gfid)) + continue; - sources = GF_CALLOC (priv->child_count, sizeof (*sources), - gf_afr_mt_int32_t); - if (NULL == sources) - goto out; - success_children = local->cont.lookup.success_children; - xattrs = local->cont.lookup.xattrs; - bufs = local->cont.lookup.bufs; - pending_matrix = local->cont.lookup.pending_matrix; - afr_build_sources (this, xattrs, bufs, pending_matrix, - sources, success_children, AFR_METADATA_TRANSACTION, - &subvol_status, _gf_false); - if (subvol_status & SPLIT_BRAIN) - local->cont.lookup.possible_spb = _gf_true; -out: - GF_FREE (sources); -} + can_interpret = _gf_false; -static void -afr_detect_self_heal_by_iatt (afr_local_t *local, xlator_t *this, - struct iatt *buf, struct iatt *lookup_buf) -{ - if (PERMISSION_DIFFERS (buf, lookup_buf)) { - /* mismatching permissions */ - gf_log (this->name, GF_LOG_DEBUG, - "permissions differ for %s ", local->loc.path); - local->self_heal.do_metadata_self_heal = _gf_true; - } + if (locked_entry) + continue; - if (OWNERSHIP_DIFFERS (buf, lookup_buf)) { - /* mismatching permissions */ - local->self_heal.do_metadata_self_heal = _gf_true; - gf_log (this->name, GF_LOG_DEBUG, - "ownership differs for %s ", local->loc.path); - } + /* Now GFIDs mismatch. It's OK as long as this subvol + is not readable[] but read_subvol is */ + if (readable[read_subvol] && !readable[i]) + continue; - if (SIZE_DIFFERS (buf, lookup_buf) - && IA_ISREG (buf->ia_type)) { - gf_log (this->name, GF_LOG_DEBUG, - "size differs for %s ", local->loc.path); - local->self_heal.do_data_self_heal = _gf_true; - } + /* LOG ERROR */ + local->op_ret = -1; + local->op_errno = EIO; + goto unwind; + } - if (uuid_compare (buf->ia_gfid, lookup_buf->ia_gfid)) { - /* mismatching gfid */ - gf_log (this->name, GF_LOG_DEBUG, - "%s: gfid different on subvolume", local->loc.path); - } -} + /* Forth, for the finalized GFID, pick the best subvolume + to return stats from. + */ + if (can_interpret) { + /* It is safe to call afr_replies_interpret() because we have + a response from all the UP subvolumes and all of them resolved + to the same GFID + */ + if (afr_replies_interpret (frame, this, local->inode)) { + read_subvol = afr_data_subvol_get (local->inode, this, + 0, 0); + afr_inode_read_subvol_reset (local->inode, this); + goto cant_interpret; + } else { + read_subvol = afr_data_subvol_get (local->inode, this, + 0, 0); + } + } else { + cant_interpret: + if (read_subvol == -1) + dict_del (replies[0].xdata, GF_CONTENT_KEY); + else + dict_del (replies[read_subvol].xdata, GF_CONTENT_KEY); + } -static void -afr_detect_self_heal_by_split_brain_status (afr_local_t *local, xlator_t *this) -{ - gf_boolean_t split_brain = _gf_false; - afr_self_heal_t *sh = NULL; + afr_handle_quota_size (frame, this); - sh = &local->self_heal; +unwind: + if (read_subvol == -1) + read_subvol = 0; - split_brain = afr_is_split_brain (this, local->cont.lookup.inode); - split_brain = split_brain || local->cont.lookup.possible_spb; - if ((local->success_count > 0) && split_brain && - IA_ISREG (local->cont.lookup.inode->ia_type)) { - sh->force_confirm_spb = _gf_true; - gf_log (this->name, GF_LOG_DEBUG, - "split brain detected during lookup of %s.", - local->loc.path); - } + AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, + local->inode, &local->replies[read_subvol].poststat, + local->replies[read_subvol].xdata, + &local->replies[read_subvol].postparent); } -static void -afr_detect_self_heal_by_lookup_status (afr_local_t *local, xlator_t *this) -{ - GF_ASSERT (local); - GF_ASSERT (this); - - if ((local->success_count > 0) && (local->enoent_count > 0)) { - local->self_heal.do_metadata_self_heal = _gf_true; - local->self_heal.do_data_self_heal = _gf_true; - local->self_heal.do_entry_self_heal = _gf_true; - local->self_heal.do_gfid_self_heal = _gf_true; - local->self_heal.do_missing_entry_self_heal = _gf_true; - gf_log(this->name, GF_LOG_DEBUG, - "entries are missing in lookup of %s.", - local->loc.path); - } - - return; -} +/* + * During a lookup, some errors are more "important" than + * others in that they must be given higher priority while + * returning to the user. + * + * The hierarchy is ESTALE > ENOENT > others + */ -gf_boolean_t -afr_can_self_heal_proceed (afr_self_heal_t *sh, afr_private_t *priv) +int +afr_higher_errno (int32_t old_errno, int32_t new_errno) { - GF_ASSERT (sh); - GF_ASSERT (priv); + if (old_errno == ENODATA || new_errno == ENODATA) + return ENODATA; + if (old_errno == ESTALE || new_errno == ESTALE) + return ESTALE; + if (old_errno == ENOENT || new_errno == ENOENT) + return ENOENT; - if (sh->force_confirm_spb) - return _gf_true; - return (sh->do_gfid_self_heal - || sh->do_missing_entry_self_heal - || (afr_data_self_heal_enabled (priv->data_self_heal) && - sh->do_data_self_heal) - || (priv->metadata_self_heal && sh->do_metadata_self_heal) - || (priv->entry_self_heal && sh->do_entry_self_heal)); + return new_errno; } -afr_transaction_type -afr_transaction_type_get (ia_type_t ia_type) -{ - afr_transaction_type type = AFR_METADATA_TRANSACTION; - GF_ASSERT (ia_type != IA_INVAL); +int +afr_final_errno (afr_local_t *local, afr_private_t *priv) +{ + int i = 0; + int op_errno = 0; + int tmp_errno = 0; + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret == 0) + continue; + tmp_errno = local->replies[i].op_errno; + op_errno = afr_higher_errno (op_errno, tmp_errno); + } - if (IA_ISDIR (ia_type)) { - type = AFR_ENTRY_TRANSACTION; - } else if (IA_ISREG (ia_type)) { - type = AFR_DATA_TRANSACTION; - } - return type; + return op_errno; } -int -afr_lookup_select_read_child (afr_local_t *local, xlator_t *this, - int32_t *read_child) +static int +get_pathinfo_host (char *pathinfo, char *hostname, size_t size) { - ia_type_t ia_type = IA_INVAL; - int32_t source = -1; - int ret = -1; - dict_t **xattrs = NULL; - int32_t *success_children = NULL; - afr_transaction_type type = AFR_METADATA_TRANSACTION; - uuid_t *gfid = NULL; - - GF_ASSERT (local); - GF_ASSERT (this); - GF_ASSERT (local->success_count > 0); + char *start = NULL; + char *end = NULL; + int ret = -1; + int i = 0; - success_children = local->cont.lookup.success_children; - /*We can take the success_children[0] only because we already - *handle the conflicting children other wise, we could select the - *read_child based on wrong file type - */ - ia_type = local->cont.lookup.bufs[success_children[0]].ia_type; - type = afr_transaction_type_get (ia_type); - xattrs = local->cont.lookup.xattrs; - gfid = &local->cont.lookup.buf.ia_gfid; - source = afr_lookup_select_read_child_by_txn_type (this, local, xattrs, - type, *gfid); - if (source < 0) { - gf_log (this->name, GF_LOG_DEBUG, "failed to select source " - "for %s", local->loc.path); + if (!pathinfo) + goto out; + + start = strchr (pathinfo, ':'); + if (!start) + goto out; + end = strrchr (pathinfo, ':'); + if (start == end) goto out; - } - gf_log (this->name, GF_LOG_DEBUG, "Source selected as %d for %s", - source, local->loc.path); - *read_child = source; + memset (hostname, 0, size); + i = 0; + while (++start != end) + hostname[i++] = *start; ret = 0; out: return ret; } -static inline gf_boolean_t -afr_is_transaction_running (afr_local_t *local) -{ - GF_ASSERT (local->fop == GF_FOP_LOOKUP); - return ((local->inodelk_count > 0) || (local->entrylk_count > 0)); -} - -void -afr_launch_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode, - gf_boolean_t background, ia_type_t ia_type, char *reason, - void (*gfid_sh_success_cbk) (call_frame_t *sh_frame, - xlator_t *this), - int (*unwind) (call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno, - int32_t sh_failed)) -{ - afr_local_t *local = NULL; - char sh_type_str[256] = {0,}; - char *bg = ""; - - GF_ASSERT (frame); - GF_ASSERT (this); - GF_ASSERT (inode); - GF_ASSERT (ia_type != IA_INVAL); - - local = frame->local; - local->self_heal.background = background; - local->self_heal.type = ia_type; - local->self_heal.unwind = unwind; - local->self_heal.gfid_sh_success_cbk = gfid_sh_success_cbk; - - afr_self_heal_type_str_get (&local->self_heal, - sh_type_str, - sizeof (sh_type_str)); - - if (background) - bg = "background"; - gf_log (this->name, GF_LOG_DEBUG, - "%s %s self-heal triggered. path: %s, reason: %s", bg, - sh_type_str, local->loc.path, reason); - - afr_self_heal (frame, this, inode); -} - -unsigned int -afr_gfid_missing_count (const char *xlator_name, int32_t *success_children, - struct iatt *bufs, unsigned int child_count, - const char *path) +int +afr_local_pathinfo (char *pathinfo, gf_boolean_t *local) { - unsigned int gfid_miss_count = 0; - int i = 0; - struct iatt *child1 = NULL; + int ret = 0; + char pathinfohost[1024] = {0}; + char localhost[1024] = {0}; + xlator_t *this = THIS; - for (i = 0; i < child_count; i++) { - if (success_children[i] == -1) - break; - child1 = &bufs[success_children[i]]; - if (uuid_is_null (child1->ia_gfid)) { - gf_log (xlator_name, GF_LOG_DEBUG, "%s: gfid is null" - " on subvolume %d", path, success_children[i]); - gfid_miss_count++; - } + *local = _gf_false; + ret = get_pathinfo_host (pathinfo, pathinfohost, sizeof (pathinfohost)); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Invalid pathinfo: %s", + pathinfo); + goto out; } - return gfid_miss_count; -} - -static int -afr_lookup_gfid_missing_count (afr_local_t *local, xlator_t *this) -{ - int32_t *success_children = NULL; - afr_private_t *priv = NULL; - struct iatt *bufs = NULL; - int miss_count = 0; - - priv = this->private; - bufs = local->cont.lookup.bufs; - success_children = local->cont.lookup.success_children; - - miss_count = afr_gfid_missing_count (this->name, success_children, - bufs, priv->child_count, - local->loc.path); - return miss_count; -} - -gf_boolean_t -afr_conflicting_iattrs (struct iatt *bufs, int32_t *success_children, - unsigned int child_count, const char *path, - const char *xlator_name) -{ - gf_boolean_t conflicting = _gf_false; - int i = 0; - struct iatt *child1 = NULL; - struct iatt *child2 = NULL; - uuid_t *gfid = NULL; - - for (i = 0; i < child_count; i++) { - if (success_children[i] == -1) - break; - child1 = &bufs[success_children[i]]; - if ((!gfid) && (!uuid_is_null (child1->ia_gfid))) - gfid = &child1->ia_gfid; - - if (i == 0) - continue; - - child2 = &bufs[success_children[i-1]]; - if (FILETYPE_DIFFERS (child1, child2)) { - gf_log (xlator_name, GF_LOG_DEBUG, "%s: filetype " - "differs on subvolumes (%d, %d)", path, - success_children[i-1], success_children[i]); - conflicting = _gf_true; - goto out; - } - if (!gfid || uuid_is_null (child1->ia_gfid)) - continue; - if (uuid_compare (*gfid, child1->ia_gfid)) { - gf_log (xlator_name, GF_LOG_DEBUG, "%s: gfid differs" - " on subvolume %d", path, success_children[i]); - conflicting = _gf_true; - goto out; - } + ret = gethostname (localhost, sizeof (localhost)); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "gethostname() failed, " + "reason: %s", strerror (errno)); + goto out; } -out: - return conflicting; -} -/* afr_update_gfid_from_iatts: This function should be called only if the - * iatts are not conflicting. - */ -void -afr_update_gfid_from_iatts (uuid_t uuid, struct iatt *bufs, - int32_t *success_children, unsigned int child_count) -{ - uuid_t *gfid = NULL; - int i = 0; - int child = 0; - - for (i = 0; i < child_count; i++) { - child = success_children[i]; - if (child == -1) - break; - if ((!gfid) && (!uuid_is_null (bufs[child].ia_gfid))) { - gfid = &bufs[child].ia_gfid; - } else if (gfid && (!uuid_is_null (bufs[child].ia_gfid))) { - if (uuid_compare (*gfid, bufs[child].ia_gfid)) { - GF_ASSERT (0); - goto out; - } - } - } - if (gfid && (!uuid_is_null (*gfid))) - uuid_copy (uuid, *gfid); + if (!strcmp (localhost, pathinfohost)) + *local = _gf_true; out: - return; -} - -static gf_boolean_t -afr_lookup_conflicting_entries (afr_local_t *local, xlator_t *this) -{ - afr_private_t *priv = NULL; - gf_boolean_t conflict = _gf_false; - - priv = this->private; - conflict = afr_conflicting_iattrs (local->cont.lookup.bufs, - local->cont.lookup.success_children, - priv->child_count, local->loc.path, - this->name); - return conflict; -} - -gf_boolean_t -afr_open_only_data_self_heal (char *data_self_heal) -{ - return !strcmp (data_self_heal, "open"); + return ret; } -gf_boolean_t -afr_data_self_heal_enabled (char *data_self_heal) +static int32_t +afr_local_discovery_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) { - gf_boolean_t enabled = _gf_false; + int ret = 0; + char *pathinfo = NULL; + gf_boolean_t is_local = _gf_false; + afr_private_t *priv = NULL; + int32_t child_index = -1; - if (gf_string2boolean (data_self_heal, &enabled) == -1) { - enabled = !strcmp (data_self_heal, "open"); - GF_ASSERT (enabled); + if (op_ret != 0) { + goto out; } - return enabled; -} - -static void -afr_lookup_set_self_heal_params (afr_local_t *local, xlator_t *this) -{ - int i = 0; - struct iatt *bufs = NULL; - dict_t **xattr = NULL; - afr_private_t *priv = NULL; - int32_t child1 = -1; - int32_t child2 = -1; - afr_self_heal_t *sh = NULL; - - priv = this->private; - sh = &local->self_heal; - - afr_detect_self_heal_by_lookup_status (local, this); - - if (afr_lookup_gfid_missing_count (local, this)) - local->self_heal.do_gfid_self_heal = _gf_true; - - if (_gf_true == afr_lookup_conflicting_entries (local, this)) - local->self_heal.do_missing_entry_self_heal = _gf_true; - else - afr_update_gfid_from_iatts (local->self_heal.sh_gfid_req, - local->cont.lookup.bufs, - local->cont.lookup.success_children, - priv->child_count); - - bufs = local->cont.lookup.bufs; - for (i = 1; i < local->success_count; i++) { - child1 = local->cont.lookup.success_children[i-1]; - child2 = local->cont.lookup.success_children[i]; - afr_detect_self_heal_by_iatt (local, this, - &bufs[child1], &bufs[child2]); - } + priv = this->private; + child_index = (int32_t)(long)cookie; - xattr = local->cont.lookup.xattrs; - for (i = 0; i < local->success_count; i++) { - child1 = local->cont.lookup.success_children[i]; - afr_lookup_set_self_heal_params_by_xattr (local, this, - xattr[child1]); + ret = dict_get_str (dict, GF_XATTR_PATHINFO_KEY, &pathinfo); + if (ret != 0) { + goto out; } - if (afr_open_only_data_self_heal (priv->data_self_heal)) - sh->do_data_self_heal = _gf_false; - if (sh->do_metadata_self_heal) - afr_lookup_check_set_metadata_split_brain (local, this); - afr_detect_self_heal_by_split_brain_status (local, this); -} - -int -afr_self_heal_lookup_unwind (call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno, - int32_t sh_failed) -{ - afr_local_t *local = NULL; - int ret = -1; - dict_t *xattr = NULL; - - local = frame->local; - - if (op_ret == -1) { - local->op_ret = -1; - local->op_errno = afr_most_important_error(local->op_errno, - op_errno, _gf_true); + ret = afr_local_pathinfo (pathinfo, &is_local); + if (ret) { goto out; - } else { - local->op_ret = 0; } - afr_lookup_done_success_action (frame, this, _gf_true); - xattr = local->cont.lookup.xattr; - if (xattr) { - ret = dict_set_int32 (xattr, "sh-failed", sh_failed); - if (ret) - gf_log (this->name, GF_LOG_ERROR, "%s: Failed to set " - "sh-failed to %d", local->loc.path, sh_failed); - - if (local->self_heal.actual_sh_started == _gf_true && - sh_failed == 0) { - ret = dict_set_int32 (xattr, "actual-sh-done", 1); - if (ret) - gf_log(this->name, GF_LOG_ERROR, "%s: Failed to" - " set actual-sh-done to %d", - local->loc.path, - local->self_heal.actual_sh_started); - } + /* + * Note that one local subvolume will override another here. The only + * way to avoid that would be to retain extra information about whether + * the previous read_child is local, and it's just not worth it. Even + * the slowest local subvolume is far preferable to a remote one. + */ + if (is_local) { + gf_log (this->name, GF_LOG_INFO, + "selecting local read_child %s", + priv->children[child_index]->name); + priv->read_child = child_index; } out: - AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, - local->cont.lookup.inode, &local->cont.lookup.buf, - local->cont.lookup.xattr, - &local->cont.lookup.postparent); - + STACK_DESTROY(frame->root); return 0; } -//TODO: At the moment only lookup needs this, so not doing any checks, in the -// future we will have to do fop specific operations -void -afr_post_gfid_sh_success (call_frame_t *sh_frame, xlator_t *this) +static void +afr_attempt_local_discovery (xlator_t *this, int32_t child_index) { - afr_local_t *local = NULL; - afr_local_t *sh_local = NULL; - afr_private_t *priv = NULL; - afr_self_heal_t *sh = NULL; - int i = 0; - struct iatt *lookup_bufs = NULL; - struct iatt *lookup_parentbufs = NULL; - - sh_local = sh_frame->local; - sh = &sh_local->self_heal; - local = sh->orig_frame->local; - lookup_bufs = local->cont.lookup.bufs; - lookup_parentbufs = local->cont.lookup.postparents; - priv = this->private; - - memcpy (lookup_bufs, sh->buf, priv->child_count * sizeof (*sh->buf)); - memcpy (lookup_parentbufs, sh->parentbufs, - priv->child_count * sizeof (*sh->parentbufs)); - - afr_reset_xattr (local->cont.lookup.xattrs, priv->child_count); - if (local->cont.lookup.xattr) { - dict_unref (local->cont.lookup.xattr); - local->cont.lookup.xattr = NULL; - } + call_frame_t *newframe = NULL; + loc_t tmploc = {0,}; + afr_private_t *priv = this->private; - for (i = 0; i < priv->child_count; i++) { - if (sh->xattr[i]) - local->cont.lookup.xattrs[i] = dict_ref (sh->xattr[i]); + newframe = create_frame(this,this->ctx->pool); + if (!newframe) { + return; } - afr_reset_children (local->cont.lookup.success_children, - priv->child_count); - afr_children_copy (local->cont.lookup.success_children, - sh->fresh_children, priv->child_count); + tmploc.gfid[sizeof(tmploc.gfid)-1] = 1; + STACK_WIND_COOKIE (newframe, afr_local_discovery_cbk, + (void *)(long)child_index, + priv->children[child_index], + priv->children[child_index]->fops->getxattr, + &tmploc, GF_XATTR_PATHINFO_KEY, NULL); } -static void -afr_lookup_perform_self_heal (call_frame_t *frame, xlator_t *this, - gf_boolean_t *sh_launched) -{ - unsigned int up_count = 0; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - char *reason = NULL; - - GF_ASSERT (sh_launched); - *sh_launched = _gf_false; - priv = this->private; - local = frame->local; - - up_count = afr_up_children_count (local->child_up, priv->child_count); - if (up_count == 1) { - gf_log (this->name, GF_LOG_DEBUG, - "Only 1 child up - do not attempt to detect self heal"); - goto out; - } - - afr_lookup_set_self_heal_params (local, this); - if (afr_can_self_heal_proceed (&local->self_heal, priv)) { - if (afr_is_transaction_running (local) && - /*Forcefully call afr_launch_self_heal (which will go on to - fail) for SB files.This prevents stale data being served - due to race in afr_is_transaction_running() when - multiple clients access the same SB file*/ - !local->cont.lookup.possible_spb && - (!local->attempt_self_heal)) - goto out; - reason = "lookup detected pending operations"; - afr_launch_self_heal (frame, this, local->cont.lookup.inode, - !local->foreground_self_heal, - local->cont.lookup.buf.ia_type, - reason, afr_post_gfid_sh_success, - afr_self_heal_lookup_unwind); - *sh_launched = _gf_true; - } -out: - return; -} - -void -afr_get_fresh_children (int32_t *success_children, int32_t *sources, - int32_t *fresh_children, unsigned int child_count) +int +afr_lookup_selfheal_wrap (void *opaque) { - unsigned int i = 0; - unsigned int j = 0; - - GF_ASSERT (success_children); - GF_ASSERT (sources); - GF_ASSERT (fresh_children); + call_frame_t *frame = opaque; + afr_local_t *local = NULL; + xlator_t *this = NULL; + inode_t *inode = NULL; - afr_reset_children (fresh_children, child_count); - for (i = 0; i < child_count; i++) { - if (success_children[i] == -1) - break; - if (afr_is_read_child (success_children, sources, child_count, - success_children[i])) { - fresh_children[j] = success_children[i]; - j++; - } - } -} + local = frame->local; + this = frame->this; -static int -afr_lookup_set_read_ctx (afr_local_t *local, xlator_t *this, int32_t read_child) -{ - afr_private_t *priv = NULL; + afr_selfheal_name (frame->this, local->loc.pargfid, local->loc.name); - GF_ASSERT (read_child >= 0); + afr_replies_wipe (local, this->private); - priv = this->private; - afr_get_fresh_children (local->cont.lookup.success_children, - local->cont.lookup.sources, - local->fresh_children, priv->child_count); - afr_inode_set_read_ctx (this, local->cont.lookup.inode, read_child, - local->fresh_children); + inode = afr_selfheal_unlocked_lookup_on (frame, local->loc.parent, + local->loc.name, local->replies, + local->child_up); + if (inode) + inode_unref (inode); + afr_lookup_done (frame, this); - return 0; + return 0; } + int -afr_lookup_done_success_action (call_frame_t *frame, xlator_t *this, - gf_boolean_t fail_conflict) +afr_lookup_entry_heal (call_frame_t *frame, xlator_t *this) { - int32_t read_child = -1; - int32_t ret = -1; - afr_local_t *local = NULL; - gf_boolean_t fresh_lookup = _gf_false; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + call_frame_t *heal = NULL; + int i = 0, first = -1; + gf_boolean_t need_heal = _gf_false; + struct afr_reply *replies = NULL; + int ret = 0; - local = frame->local; - fresh_lookup = local->cont.lookup.fresh_lookup; + local = frame->local; + replies = local->replies; + priv = this->private; - if (local->loc.parent == NULL) - fail_conflict = _gf_true; + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; - if (afr_lookup_conflicting_entries (local, this)) { - if (fail_conflict == _gf_false) - ret = 0; - goto out; - } + if (first == -1) { + first = i; + continue; + } - ret = afr_lookup_select_read_child (local, this, &read_child); - if (!afr_is_transaction_running (local) || fresh_lookup) { - if (read_child < 0) - goto out; + if (replies[i].op_ret != replies[first].op_ret) { + need_heal = _gf_true; + break; + } - ret = afr_lookup_set_read_ctx (local, this, read_child); - if (ret) - goto out; - } + if (uuid_compare (replies[i].poststat.ia_gfid, + replies[first].poststat.ia_gfid)) { + need_heal = _gf_true; + break; + } + } - ret = afr_lookup_build_response_params (local, this); - if (ret) - goto out; - afr_update_loc_gfids (&local->loc, - &local->cont.lookup.buf, - &local->cont.lookup.postparent); + if (need_heal) { + heal = copy_frame (frame); + if (heal) + heal->root->pid = -1; + ret = synctask_new (this->ctx->env, afr_lookup_selfheal_wrap, + afr_refresh_selfheal_done, heal, frame); + if (ret) + goto lookup_done; + } else { + lookup_done: + afr_lookup_done (frame, this); + } - ret = 0; -out: - if (ret) { - local->op_ret = -1; - local->op_errno = EIO; - } - return ret; + return ret; } + int -afr_lookup_get_latest_subvol (afr_local_t *local, xlator_t *this) +afr_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, struct iatt *buf, + dict_t *xdata, struct iatt *postparent) { - afr_private_t *priv = NULL; - int32_t *success_children = NULL; - struct iatt *bufs = NULL; - int i = 0; - int child = 0; - int lsubvol = -1; - - priv = this->private; - success_children = local->cont.lookup.success_children; - bufs = local->cont.lookup.bufs; - for (i = 0; i < priv->child_count; i++) { - child = success_children[i]; - if (child == -1) - break; - if (uuid_is_null (bufs[child].ia_gfid)) - continue; - if (lsubvol < 0) { - lsubvol = child; - } else if (bufs[lsubvol].ia_ctime < bufs[child].ia_ctime) { - lsubvol = child; - } else if ((bufs[lsubvol].ia_ctime == bufs[child].ia_ctime) && - (bufs[lsubvol].ia_ctime_nsec < bufs[child].ia_ctime_nsec)) { - lsubvol = child; - } - } - return lsubvol; -} + afr_local_t * local = NULL; + int call_count = -1; + int child_index = -1; -void -afr_lookup_mark_other_entries_stale (afr_local_t *local, xlator_t *this, - int subvol) -{ - afr_private_t *priv = NULL; - int32_t *success_children = NULL; - struct iatt *bufs = NULL; - int i = 0; - int child = 0; + child_index = (long) cookie; - priv = this->private; - success_children = local->cont.lookup.success_children; - bufs = local->cont.lookup.bufs; - memcpy (local->fresh_children, success_children, - sizeof (*success_children) * priv->child_count); - for (i = 0; i < priv->child_count; i++) { - child = local->fresh_children[i]; - if (child == -1) - break; - if (child == subvol) - continue; - if (uuid_is_null (bufs[child].ia_gfid) && - (bufs[child].ia_type == bufs[subvol].ia_type)) - continue; - afr_children_rm_child (success_children, child, - priv->child_count); - local->success_count--; - } - afr_reset_children (local->fresh_children, priv->child_count); -} + local = frame->local; -void -afr_succeed_lookup_on_latest_iatt (afr_local_t *local, xlator_t *this) -{ - int lsubvol = 0; + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + if (op_ret != -1) { + local->replies[child_index].poststat = *buf; + local->replies[child_index].postparent = *postparent; + if (xdata) + local->replies[child_index].xdata = dict_ref (xdata); + } - if (!afr_lookup_conflicting_entries (local, this)) - goto out; + call_count = afr_frame_return (frame); + if (call_count == 0) { + afr_lookup_entry_heal (frame, this); + } - lsubvol = afr_lookup_get_latest_subvol (local, this); - if (lsubvol < 0) - goto out; - afr_lookup_mark_other_entries_stale (local, this, lsubvol); -out: - return; + return 0; } -gf_boolean_t -afr_is_entry_possibly_under_creation (afr_local_t *local, xlator_t *this) -{ - /* - * We need to perform this test in lookup done and treat on going - * create/DELETE as ENOENT. - * Reason: - Multiple clients A, B and C are attempting 'mkdir -p /mnt/a/b/c' - - 1 Client A is in the middle of mkdir(/a). It has acquired lock. - It has performed mkdir(/a) on one subvol, and second one is still - in progress - 2 Client B performs a lookup, sees directory /a on one, - ENOENT on the other, succeeds lookup. - 3 Client B performs lookup on /a/b on both subvols, both return ENOENT - (one subvol because /a/b does not exist, another because /a - itself does not exist) - 4 Client B proceeds to mkdir /a/b. It obtains entrylk on inode=/a with - basename=b on one subvol, but fails on other subvol as /a is yet to - be created by Client A. - 5 Client A finishes mkdir of /a on other subvol - 6 Client C also attempts to create /a/b, lookup returns ENOENT on - both subvols. - 7 Client C tries to obtain entrylk on on inode=/a with basename=b, - obtains on one subvol (where B had failed), and waits for B to unlock - on other subvol. - 8 Client B finishes mkdir() on one subvol with GFID-1 and completes - transaction and unlocks - 9 Client C gets the lock on the second subvol, At this stage second - subvol already has /a/b created from Client B, but Client C does not - check that in the middle of mkdir transaction - 10 Client C attempts mkdir /a/b on both subvols. It succeeds on - ONLY ONE (where Client B could not get lock because of - missing parent /a dir) with GFID-2, and gets EEXIST from ONE subvol. - This way we have /a/b in GFID mismatch. One subvol got GFID-1 because - Client B performed transaction on only one subvol (because entrylk() - could not be obtained on second subvol because of missing parent dir -- - caused by premature/speculative succeeding of lookup() on /a when locks - are detected). Other subvol gets GFID-2 from Client C because while - it was waiting for entrylk() on both subvols, Client B was in the - middle of creating mkdir() on only one subvol, and Client C does not - "expect" this when it is between lock() and pre-op()/op() phase of the - transaction. - */ - if (local->cont.lookup.parent_entrylk && local->enoent_count) - return _gf_true; - - return _gf_false; -} static void -afr_lookup_done (call_frame_t *frame, xlator_t *this) +afr_discover_done (call_frame_t *frame, xlator_t *this) { - int unwind = 1; afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; - gf_boolean_t sh_launched = _gf_false; - gf_boolean_t fail_conflict = _gf_false; - int gfid_miss_count = 0; - int enotconn_count = 0; - int up_children_count = 0; + int i = -1; + int op_errno = 0; + int read_subvol = 0; priv = this->private; local = frame->local; - if (afr_is_entry_possibly_under_creation (local, this)) { - local->op_ret = -1; - local->op_errno = ENOENT; - goto unwind; + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret == 0) + local->op_ret = 0; } - if (local->op_ret < 0) - goto unwind; + op_errno = afr_final_errno (frame->local, this->private); - if (local->cont.lookup.parent_entrylk && local->success_count > 1) - afr_succeed_lookup_on_latest_iatt (local, this); - - gfid_miss_count = afr_lookup_gfid_missing_count (local, this); - up_children_count = afr_up_children_count (local->child_up, - priv->child_count); - enotconn_count = priv->child_count - up_children_count; - if ((gfid_miss_count == local->success_count) && - (enotconn_count > 0)) { - local->op_ret = -1; - local->op_errno = EIO; - gf_log (this->name, GF_LOG_ERROR, "Failing lookup for %s, " - "LOOKUP on a file without gfid is not allowed when " - "some of the children are down", local->loc.path); - goto unwind; - } - - if ((gfid_miss_count == local->success_count) && - uuid_is_null (local->cont.lookup.gfid_req)) { - local->op_ret = -1; - local->op_errno = ENODATA; - gf_log (this->name, GF_LOG_ERROR, "%s: No gfid present", - local->loc.path); + if (local->op_ret < 0) { + local->op_errno = op_errno; + local->op_ret = -1; goto unwind; - } + } - if (gfid_miss_count && uuid_is_null (local->cont.lookup.gfid_req)) - fail_conflict = _gf_true; - ret = afr_lookup_done_success_action (frame, this, fail_conflict); - if (ret) - goto unwind; - uuid_copy (local->self_heal.sh_gfid_req, local->cont.lookup.gfid_req); + afr_replies_interpret (frame, this, local->inode); - afr_lookup_perform_self_heal (frame, this, &sh_launched); - if (sh_launched) { - unwind = 0; - goto unwind; - } + read_subvol = afr_data_subvol_get (local->inode, this, 0, 0); + if (read_subvol == -1) { + gf_log (this->name, GF_LOG_WARNING, "no read subvols for %s", + local->loc.path); - unwind: - if (unwind) { - AFR_STACK_UNWIND (lookup, frame, local->op_ret, - local->op_errno, local->cont.lookup.inode, - &local->cont.lookup.buf, - local->cont.lookup.xattr, - &local->cont.lookup.postparent); - } -} + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid || + local->replies[i].op_ret == -1) + continue; + read_subvol = i; + break; + } + } -/* - * During a lookup, some errors are more "important" than - * others in that they must be given higher priority while - * returning to the user. - * - * The hierarchy is ESTALE > EIO > ENOENT > others - */ -int32_t -afr_most_important_error(int32_t old_errno, int32_t new_errno, - gf_boolean_t eio) -{ - if (old_errno == ESTALE || new_errno == ESTALE) - return ESTALE; - if (eio && (old_errno == EIO || new_errno == EIO)) - return EIO; - if (old_errno == ENOENT || new_errno == ENOENT) - return ENOENT; +unwind: + if (read_subvol == -1) + read_subvol = 0; - return new_errno; + AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, + local->inode, &local->replies[read_subvol].poststat, + local->replies[read_subvol].xdata, + &local->replies[read_subvol].postparent); } -int32_t -afr_resultant_errno_get (int32_t *children, - int *child_errno, unsigned int child_count) -{ - int i = 0; - int32_t op_errno = 0; - int child = 0; - - for (i = 0; i < child_count; i++) { - if (children) { - child = children[i]; - if (child == -1) - break; - } else { - child = i; - } - op_errno = afr_most_important_error(op_errno, - child_errno[child], - _gf_false); - } - return op_errno; -} -static void -afr_lookup_handle_error (afr_local_t *local, int32_t op_ret, int32_t op_errno) +int +afr_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, struct iatt *buf, + dict_t *xdata, struct iatt *postparent) { - GF_ASSERT (local); - if (op_errno == ENOENT) - local->enoent_count++; + afr_local_t * local = NULL; + int call_count = -1; + int child_index = -1; - local->op_errno = afr_most_important_error(local->op_errno, op_errno, - _gf_false); + child_index = (long) cookie; - if (local->op_errno == ESTALE) { - local->op_ret = -1; - } -} + local = frame->local; -static void -afr_set_root_inode_on_first_lookup (afr_local_t *local, xlator_t *this, - inode_t *inode) -{ - afr_private_t *priv = NULL; - GF_ASSERT (inode); + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + if (op_ret != -1) { + local->replies[child_index].poststat = *buf; + local->replies[child_index].postparent = *postparent; + if (xdata) + local->replies[child_index].xdata = dict_ref (xdata); + } - if (!__is_root_gfid (inode->gfid)) - goto out; - if (!afr_is_fresh_lookup (&local->loc, this)) - goto out; - priv = this->private; - if ((priv->first_lookup)) { - gf_log (this->name, GF_LOG_INFO, "added root inode"); - priv->root_inode = inode_ref (inode); - priv->first_lookup = 0; + if (local->do_discovery && (op_ret == 0)) + afr_attempt_local_discovery (this, child_index); + + call_count = afr_frame_return (frame); + if (call_count == 0) { + afr_discover_done (frame, this); } -out: - return; -} -static void -afr_lookup_cache_args (afr_local_t *local, int child_index, dict_t *xattr, - struct iatt *buf, struct iatt *postparent) -{ - GF_ASSERT (child_index >= 0); - local->cont.lookup.xattrs[child_index] = dict_ref (xattr); - local->cont.lookup.postparents[child_index] = *postparent; - local->cont.lookup.bufs[child_index] = *buf; + return 0; } -static void -afr_lookup_handle_first_success (afr_local_t *local, xlator_t *this, - inode_t *inode, struct iatt *buf) -{ - local->cont.lookup.inode = inode_ref (inode); - local->cont.lookup.buf = *buf; - afr_set_root_inode_on_first_lookup (local, this, inode); -} -static int32_t -afr_discovery_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *dict, - dict_t *xdata) +int +afr_discover_do (call_frame_t *frame, xlator_t *this, int err) { - int ret = 0; - char *pathinfo = NULL; - gf_boolean_t is_local = _gf_false; - afr_private_t *priv = NULL; - int32_t child_index = -1; + int ret = 0; + int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = 0; - if (op_ret != 0) { - goto out; - } + local = frame->local; + priv = this->private; - ret = dict_get_str (dict, GF_XATTR_PATHINFO_KEY, &pathinfo); - if (ret != 0) { - goto out; - } + if (err) { + local->op_errno = -err; + ret = -1; + goto out; + } - ret = afr_local_pathinfo (pathinfo, &is_local); + call_count = local->call_count = AFR_COUNT (local->child_up, + priv->child_count); + + ret = afr_lookup_xattr_req_prepare (local, this, local->xattr_req, + &local->loc); if (ret) { + local->op_errno = -ret; + ret = -1; goto out; } - priv = this->private; - /* - * Note that one local subvolume will override another here. The only - * way to avoid that would be to retain extra information about whether - * the previous read_child is local, and it's just not worth it. Even - * the slowest local subvolume is far preferable to a remote one. - */ - if (is_local) { - child_index = (int32_t)(long)cookie; - gf_log (this->name, GF_LOG_INFO, - "selecting local read_child %s", - priv->children[child_index]->name); - priv->read_child = child_index; + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_discover_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->lookup, + &local->loc, local->xattr_req); + if (!--call_count) + break; + } } + return 0; out: - STACK_DESTROY(frame->root); - return 0; + AFR_STACK_UNWIND (lookup, frame, -1, local->op_errno, 0, 0, 0, 0); + return 0; } -static void -afr_attempt_local_discovery (xlator_t *this, int32_t child_index) + +int +afr_discover (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) { - call_frame_t *newframe = NULL; - loc_t tmploc = {0,}; - afr_private_t *priv = this->private; + int op_errno = ENOMEM; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int event = 0; - newframe = create_frame(this,this->ctx->pool); - if (!newframe) { - return; - } + priv = this->private; - tmploc.gfid[sizeof(tmploc.gfid)-1] = 1; - STACK_WIND_COOKIE (newframe, afr_discovery_cbk, - (void *)(long)child_index, - priv->children[child_index], - priv->children[child_index]->fops->getxattr, - &tmploc, GF_XATTR_PATHINFO_KEY, NULL); -} - -static void -afr_lookup_handle_success (afr_local_t *local, xlator_t *this, int32_t child_index, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, dict_t *xattr, - struct iatt *postparent) -{ - afr_private_t *priv = this->private; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - if (local->success_count == 0) { - if (local->op_errno != ESTALE) { - local->op_ret = op_ret; - local->op_errno = 0; - } - afr_lookup_handle_first_success (local, this, inode, buf); + if (!local->call_count) { + op_errno = ENOTCONN; + goto out; } - afr_lookup_update_lk_counts (local, this, - child_index, xattr); - afr_lookup_cache_args (local, child_index, xattr, - buf, postparent); + if (__is_root_gfid (loc->inode->gfid)) { + if (!this->itable) + this->itable = loc->inode->table; + if (!priv->root_inode) + priv->root_inode = inode_ref (loc->inode); - if (local->do_discovery && (priv->read_child == (-1))) { - afr_attempt_local_discovery(this,child_index); - } + if (priv->choose_local && !priv->did_discovery) { + /* Logic to detect which subvolumes of AFR are + local, in order to prefer them for reads + */ + local->do_discovery = _gf_true; + priv->did_discovery = _gf_true; + } + } - local->cont.lookup.success_children[local->success_count] = child_index; - local->success_count++; -} + local->op = GF_FOP_LOOKUP; -int -afr_lookup_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, dict_t *xattr, - struct iatt *postparent) -{ - afr_local_t * local = NULL; - int call_count = -1; - int child_index = -1; + loc_copy (&local->loc, loc); - child_index = (long) cookie; + local->inode = inode_ref (loc->inode); - LOCK (&frame->lock); - { - local = frame->local; + if (xattr_req) + /* If xattr_req was null, afr_lookup_xattr_req_prepare() will + allocate one for us */ + local->xattr_req = dict_ref (xattr_req); - if (op_ret == -1) { - afr_lookup_handle_error (local, op_ret, op_errno); - goto unlock; - } - afr_lookup_handle_success (local, this, child_index, op_ret, - op_errno, inode, buf, xattr, - postparent); + if (uuid_is_null (loc->inode->gfid)) { + afr_discover_do (frame, this, 0); + return 0; + } - } -unlock: - UNLOCK (&frame->lock); + afr_read_subvol_get (loc->inode, this, NULL, &event, + AFR_DATA_TRANSACTION); - call_count = afr_frame_return (frame); - if (call_count == 0) { - afr_lookup_done (frame, this); - } + if (event != local->event_generation) + afr_inode_refresh (frame, this, loc->inode, afr_discover_do); + else + afr_discover_do (frame, this, 0); - return 0; + return 0; +out: + AFR_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); + return 0; } + int -afr_lookup_cont_init (afr_local_t *local, unsigned int child_count) +afr_lookup_do (call_frame_t *frame, xlator_t *this, int err) { - int ret = -ENOMEM; - struct iatt *iatts = NULL; - int32_t *success_children = NULL; - int32_t *sources = NULL; - int32_t **pending_matrix = NULL; - - GF_ASSERT (local); - local->cont.lookup.xattrs = GF_CALLOC (child_count, - sizeof (*local->cont.lookup.xattr), - gf_afr_mt_dict_t); - if (NULL == local->cont.lookup.xattrs) - goto out; - - iatts = GF_CALLOC (child_count, sizeof (*iatts), gf_afr_mt_iatt); - if (NULL == iatts) - goto out; - local->cont.lookup.postparents = iatts; + int ret = 0; + int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = 0; - iatts = GF_CALLOC (child_count, sizeof (*iatts), gf_afr_mt_iatt); - if (NULL == iatts) - goto out; - local->cont.lookup.bufs = iatts; + local = frame->local; + priv = this->private; - success_children = afr_children_create (child_count); - if (NULL == success_children) - goto out; - local->cont.lookup.success_children = success_children; + if (err < 0) { + local->op_errno = -err; + ret = -1; + goto out; + } - local->fresh_children = afr_children_create (child_count); - if (NULL == local->fresh_children) - goto out; + call_count = local->call_count = AFR_COUNT (local->child_up, + priv->child_count); - sources = GF_CALLOC (sizeof (*sources), child_count, gf_afr_mt_int32_t); - if (NULL == sources) - goto out; - local->cont.lookup.sources = sources; - - pending_matrix = afr_matrix_create (child_count, child_count); - if (NULL == pending_matrix) + ret = afr_lookup_xattr_req_prepare (local, this, local->xattr_req, + &local->loc); + if (ret) { + local->op_errno = -ret; + ret = -1; goto out; - local->cont.lookup.pending_matrix = pending_matrix; + } - ret = 0; + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_lookup_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->lookup, + &local->loc, local->xattr_req); + if (!--call_count) + break; + } + } + return 0; out: - return ret; + AFR_STACK_UNWIND (lookup, frame, -1, local->op_errno, 0, 0, 0, 0); + return 0; } +/* + * afr_lookup() + * + * The goal here is to figure out what the element getting looked up is. + * i.e what is the GFID, inode type and a conservative estimate of the + * inode attributes are. + * + * As we lookup, operations may be underway on the entry name and the + * inode. In lookup() we are primarily concerned only with the entry + * operations. If the entry is getting unlinked or renamed, we detect + * what operation is underway by querying for on-going transactions and + * pending self-healing on the entry through xdata. + * + * If the entry is a file/dir, it may need self-heal and/or in a + * split-brain condition. Lookup is not the place to worry about these + * conditions. Outcast marking will naturally handle them in the read + * paths. + * + * Here is a brief goal of what we are trying to achieve: + * + * - LOOKUP on all subvolumes concurrently, querying on-going transaction + * and pending self-heal info from the servers. + * + * - If all servers reply the same inode type and GFID, the overall call + * MUST be a success. + * + * - If inode types or GFIDs mismatch, and there IS either an on-going + * transaction or pending self-heal, inspect what the nature of the + * transaction or pending heal is, and select the appropriate subvolume's + * reply as the winner. + * + * - If inode types or GFIDs mismatch, and there are no on-going transactions + * or pending self-heal on the entry name on any of the servers, fail the + * lookup with EIO. Something has gone wrong beyond reasonable action. + */ + int -afr_lookup (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *xattr_req) +afr_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - void *gfid_req = NULL; - int ret = -1; - int i = 0; - int call_count = 0; - uint64_t ctx = 0; - int32_t op_errno = 0; - priv = this->private; - - AFR_LOCAL_ALLOC_OR_GOTO (local, out); + afr_local_t *local = NULL; + int32_t op_errno = 0; + int event = 0; - local->op_ret = -1; + if (!loc->parent) { + afr_discover (frame, this, loc, xattr_req); + return 0; + } - frame->local = local; - local->fop = GF_FOP_LOOKUP; + if (__is_root_gfid (loc->parent->gfid)) { + if (!strcmp (loc->name, GF_REPLICATE_TRASH_DIR)) { + op_errno = EPERM; + goto out; + } + } - loc_copy (&local->loc, loc); - ret = loc_path (&local->loc, NULL); - if (ret < 0) { - op_errno = EINVAL; - goto out; - } + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - if (local->loc.path && - (strcmp (local->loc.path, "/" GF_REPLICATE_TRASH_DIR) == 0)) { - op_errno = EPERM; - ret = -1; + if (!local->call_count) { + op_errno = ENOTCONN; goto out; } - ret = inode_ctx_get (local->loc.inode, this, &ctx); - if (ret == 0) { - /* lookup is a revalidate */ + local->op = GF_FOP_LOOKUP; - local->read_child_index = afr_inode_get_read_ctx (this, - local->loc.inode, - NULL); - } else { - LOCK (&priv->read_child_lock); - { - if (priv->hash_mode) { - local->read_child_index = -1; - } - else { - local->read_child_index = - (++priv->read_child_rr) % - (priv->child_count); - } - } - UNLOCK (&priv->read_child_lock); - local->cont.lookup.fresh_lookup = _gf_true; - } + loc_copy (&local->loc, loc); - local->child_up = memdup (priv->child_up, - sizeof (*local->child_up) * priv->child_count); - if (NULL == local->child_up) { - op_errno = ENOMEM; - goto out; - } + local->inode = inode_ref (loc->inode); - ret = afr_lookup_cont_init (local, priv->child_count); - if (ret < 0) { - op_errno = -ret; - goto out; - } + if (xattr_req) + /* If xattr_req was null, afr_lookup_xattr_req_prepare() will + allocate one for us */ + local->xattr_req = dict_ref (xattr_req); - local->call_count = afr_up_children_count (local->child_up, - priv->child_count); - call_count = local->call_count; - if (local->call_count == 0) { - ret = -1; - op_errno = ENOTCONN; - goto out; - } + afr_read_subvol_get (loc->parent, this, NULL, &event, + AFR_DATA_TRANSACTION); - /* By default assume ENOTCONN. On success it will be set to 0. */ - local->op_errno = ENOTCONN; + if (event != local->event_generation) + afr_inode_refresh (frame, this, loc->parent, afr_lookup_do); + else + afr_lookup_do (frame, this, 0); - ret = dict_get_int32 (xattr_req, "attempt-self-heal", - &local->attempt_self_heal); - dict_del (xattr_req, "attempt-self-heal"); + return 0; +out: + AFR_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); - ret = dict_get_int32 (xattr_req, "foreground-self-heal", - &local->foreground_self_heal); - dict_del (xattr_req, "foreground-self-heal"); + return 0; +} - ret = afr_lookup_xattr_req_prepare (local, this, xattr_req, &local->loc, - &gfid_req); - if (ret) { - local->op_errno = -ret; - goto out; - } - afr_lookup_save_gfid (local->cont.lookup.gfid_req, gfid_req, - &local->loc); - local->fop = GF_FOP_LOOKUP; - if (priv->choose_local && !priv->did_discovery) { - if (gfid_req && __is_root_gfid(gfid_req)) { - local->do_discovery = _gf_true; - priv->did_discovery = _gf_true; - } - } - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_lookup_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->lookup, - &local->loc, local->xattr_req); - if (!--call_count) - break; - } + +/* {{{ open */ + +afr_fd_ctx_t * +__afr_fd_ctx_get (fd_t *fd, xlator_t *this) +{ + uint64_t ctx = 0; + int ret = 0; + afr_fd_ctx_t *fd_ctx = NULL; + + ret = __fd_ctx_get (fd, this, &ctx); + + if (ret < 0) { + ret = __afr_fd_ctx_set (this, fd); + if (ret < 0) + goto out; + + ret = __fd_ctx_get (fd, this, &ctx); + if (ret < 0) + goto out; } - ret = 0; + fd_ctx = (afr_fd_ctx_t *)(long) ctx; out: - if (ret) - AFR_STACK_UNWIND (lookup, frame, -1, op_errno, - NULL, NULL, NULL, NULL); - - return 0; + return fd_ctx; } -/* {{{ open */ +afr_fd_ctx_t * +afr_fd_ctx_get (fd_t *fd, xlator_t *this) +{ + afr_fd_ctx_t *fd_ctx = NULL; + + LOCK(&fd->lock); + { + fd_ctx = __afr_fd_ctx_get (fd, this); + } + UNLOCK(&fd->lock); + + return fd_ctx; +} + int __afr_fd_ctx_set (xlator_t *this, fd_t *fd) @@ -2559,6 +1911,7 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd) int ret = -1; uint64_t ctx = 0; afr_fd_ctx_t * fd_ctx = NULL; + int i = 0; VALIDATE_OR_GOTO (this->private, out); VALIDATE_OR_GOTO (fd, out); @@ -2577,21 +1930,15 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd) goto out; } - fd_ctx->pre_op_done = GF_CALLOC (sizeof (*fd_ctx->pre_op_done), - priv->child_count, - gf_afr_mt_char); - if (!fd_ctx->pre_op_done) { - ret = -ENOMEM; - goto out; - } - - fd_ctx->pre_op_piggyback = GF_CALLOC (sizeof (*fd_ctx->pre_op_piggyback), - priv->child_count, - gf_afr_mt_char); - if (!fd_ctx->pre_op_piggyback) { - ret = -ENOMEM; - goto out; - } + for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) { + fd_ctx->pre_op_done[i] = GF_CALLOC (sizeof (*fd_ctx->pre_op_done[i]), + priv->child_count, + gf_afr_mt_int32_t); + if (!fd_ctx->pre_op_done[i]) { + ret = -ENOMEM; + goto out; + } + } fd_ctx->opened_on = GF_CALLOC (sizeof (*fd_ctx->opened_on), priv->child_count, @@ -2601,6 +1948,13 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd) goto out; } + for (i = 0; i < priv->child_count; i++) { + if (fd_is_anonymous (fd)) + fd_ctx->opened_on[i] = AFR_FD_OPENED; + else + fd_ctx->opened_on[i] = AFR_FD_NOT_OPENED; + } + fd_ctx->lock_piggyback = GF_CALLOC (sizeof (*fd_ctx->lock_piggyback), priv->child_count, gf_afr_mt_char); @@ -2617,20 +1971,7 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd) goto out; } - fd_ctx->up_count = priv->up_count; - fd_ctx->down_count = priv->down_count; - - fd_ctx->locked_on = GF_CALLOC (sizeof (*fd_ctx->locked_on), - priv->child_count, - gf_afr_mt_char); - if (!fd_ctx->locked_on) { - ret = -ENOMEM; - goto out; - } - pthread_mutex_init (&fd_ctx->delay_lock, NULL); - INIT_LIST_HEAD (&fd_ctx->entries); - fd_ctx->call_child = -1; INIT_LIST_HEAD (&fd_ctx->eager_locked); @@ -2660,32 +2001,31 @@ afr_fd_ctx_set (xlator_t *this, fd_t *fd) /* {{{ flush */ int -afr_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +afr_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t * local = NULL; - int call_count = -1; + afr_local_t *local = NULL; + int call_count = -1; local = frame->local; LOCK (&frame->lock); { if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - } - local->success_count++; - } - - local->op_errno = op_errno; + local->op_ret = op_ret; + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref (xdata); + } else { + local->op_errno = op_errno; + } } UNLOCK (&frame->lock); call_count = afr_frame_return (frame); if (call_count == 0) - AFR_STACK_UNWIND(flush, frame, local->op_ret, - local->op_errno, NULL); + AFR_STACK_UNWIND (flush, frame, local->op_ret, + local->op_errno, local->xdata_rsp); return 0; } @@ -2708,7 +2048,7 @@ afr_flush_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) (void *) (long) i, priv->children[i], priv->children[i]->fops->flush, - local->fd, NULL); + local->fd, xdata); if (!--call_count) break; @@ -2721,40 +2061,30 @@ afr_flush_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) int afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - afr_private_t *priv = NULL; afr_local_t *local = NULL; call_stub_t *stub = NULL; - int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int op_errno = ENOMEM; - priv = this->private; - - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - ret = afr_local_init(local, priv, &op_errno); - if (ret < 0) + if (!local->call_count) { + op_errno = ENOTCONN; goto out; + } local->fd = fd_ref(fd); + stub = fop_flush_stub (frame, afr_flush_wrapper, fd, xdata); - if (!stub) { - ret = -1; - op_errno = ENOMEM; + if (!stub) goto out; - } afr_delayed_changelog_wake_resume (this, fd, stub); - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND(flush, frame, -1, op_errno, NULL); - + AFR_STACK_UNWIND (flush, frame, -1, op_errno, NULL); return 0; } @@ -2767,6 +2097,7 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd) uint64_t ctx = 0; afr_fd_ctx_t *fd_ctx = NULL; int ret = 0; + int i = 0; ret = fd_ctx_get (fd, this, &ctx); if (ret < 0) @@ -2775,13 +2106,11 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd) fd_ctx = (afr_fd_ctx_t *)(long) ctx; if (fd_ctx) { - GF_FREE (fd_ctx->pre_op_done); + for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) + GF_FREE (fd_ctx->pre_op_done[i]); GF_FREE (fd_ctx->opened_on); - GF_FREE (fd_ctx->locked_on); - - GF_FREE (fd_ctx->pre_op_piggyback); GF_FREE (fd_ctx->lock_piggyback); GF_FREE (fd_ctx->lock_acquired); @@ -2799,24 +2128,8 @@ out: int afr_release (xlator_t *this, fd_t *fd) { - afr_locked_fd_t *locked_fd = NULL; - afr_locked_fd_t *tmp = NULL; - afr_private_t *priv = NULL; - - priv = this->private; - afr_cleanup_fd_ctx (this, fd); - list_for_each_entry_safe (locked_fd, tmp, &priv->saved_fds, - list) { - - if (locked_fd->fd == fd) { - list_del_init (&locked_fd->list); - GF_FREE (locked_fd); - } - - } - return 0; } @@ -2841,36 +2154,38 @@ afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, afr_local_t *local = NULL; int call_count = -1; int child_index = (long) cookie; - int read_child = 0; + int read_subvol = 0; call_stub_t *stub = NULL; local = frame->local; - read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); + read_subvol = afr_data_subvol_get (local->inode, this, 0, 0); LOCK (&frame->lock); { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } - if (op_ret == 0) { - local->op_ret = 0; + if (local->op_ret == -1) { + local->op_ret = 0; - if (local->success_count == 0) { local->cont.inode_wfop.prebuf = *prebuf; local->cont.inode_wfop.postbuf = *postbuf; + + if (xdata) + local->xdata_rsp = dict_ref (xdata); } - if (child_index == read_child) { + if (child_index == read_subvol) { local->cont.inode_wfop.prebuf = *prebuf; local->cont.inode_wfop.postbuf = *postbuf; + if (xdata) { + if (local->xdata_rsp) + dict_unref (local->xdata_rsp); + local->xdata_rsp = dict_ref (xdata); + } } - - local->success_count++; - } - - local->op_errno = op_errno; + } else { + local->op_errno = op_errno; + } } UNLOCK (&frame->lock); @@ -2890,7 +2205,7 @@ afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->op_ret, local->op_errno, &local->cont.inode_wfop.prebuf, &local->cont.inode_wfop.postbuf, - xdata); + local->xdata_rsp); if (!stub) { AFR_STACK_UNWIND (fsync, frame, -1, ENOMEM, 0, 0, 0); return 0; @@ -2910,37 +2225,35 @@ afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int -afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t datasync, dict_t *xdata) +afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata) { - afr_private_t *priv = NULL; + afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; + int32_t op_errno = ENOMEM; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; + priv = this->private; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; call_count = local->call_count; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } - local->fd = fd_ref (fd); + local->fd = fd_ref (fd); if (afr_fd_has_witnessed_unstable_write (this, fd)) { /* don't care. we only wanted to CLEAR the bit */ } + local->inode = inode_ref (fd->inode); + for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND_COOKIE (frame, afr_fsync_cbk, @@ -2953,10 +2266,10 @@ afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, } } - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL, NULL); + AFR_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } @@ -2964,10 +2277,9 @@ out: /* {{{ fsync */ -int32_t -afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *xdata) +int +afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t *local = NULL; int call_count = -1; @@ -2976,10 +2288,13 @@ afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, LOCK (&frame->lock); { - if (op_ret == 0) + if (op_ret == 0) { local->op_ret = 0; - - local->op_errno = op_errno; + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref (xdata); + } else { + local->op_errno = op_errno; + } } UNLOCK (&frame->lock); @@ -2987,37 +2302,33 @@ afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, if (call_count == 0) AFR_STACK_UNWIND (fsyncdir, frame, local->op_ret, - local->op_errno, xdata); + local->op_errno, local->xdata_rsp); return 0; } -int32_t -afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t datasync, dict_t *xdata) +int +afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata) { - afr_private_t *priv = NULL; + afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_errno = 0; + int32_t op_errno = ENOMEM; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + priv = this->private; - priv = this->private; - - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; call_count = local->call_count; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { @@ -3030,10 +2341,10 @@ afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, } } - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (fsyncdir, frame, -1, op_errno, NULL); + AFR_STACK_UNWIND (fsyncdir, frame, -1, op_errno, NULL); + return 0; } @@ -3056,6 +2367,10 @@ afr_xattrop_cbk (call_frame_t *frame, void *cookie, if (op_ret == 0) { if (!local->cont.xattrop.xattr) local->cont.xattrop.xattr = dict_ref (xattr); + + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref (xdata); + local->op_ret = 0; } @@ -3067,7 +2382,7 @@ afr_xattrop_cbk (call_frame_t *frame, void *cookie, if (call_count == 0) AFR_STACK_UNWIND (xattrop, frame, local->op_ret, local->op_errno, - local->cont.xattrop.xattr, xdata); + local->cont.xattrop.xattr, local->xdata_rsp); return 0; } @@ -3079,25 +2394,21 @@ afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int32_t op_errno = ENOMEM; priv = this->private; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; call_count = local->call_count; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { @@ -3110,10 +2421,10 @@ afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, } } - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (xattrop, frame, -1, op_errno, NULL, NULL); + AFR_STACK_UNWIND (xattrop, frame, -1, op_errno, NULL, NULL); + return 0; } @@ -3138,6 +2449,8 @@ afr_fxattrop_cbk (call_frame_t *frame, void *cookie, if (!local->cont.fxattrop.xattr) local->cont.fxattrop.xattr = dict_ref (xattr); + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref (xdata); local->op_ret = 0; } @@ -3149,7 +2462,7 @@ afr_fxattrop_cbk (call_frame_t *frame, void *cookie, if (call_count == 0) AFR_STACK_UNWIND (fxattrop, frame, local->op_ret, local->op_errno, - local->cont.fxattrop.xattr, xdata); + local->cont.fxattrop.xattr, local->xdata_rsp); return 0; } @@ -3161,25 +2474,21 @@ afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; int i = 0; int32_t call_count = 0; int32_t op_errno = 0; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - priv = this->private; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) goto out; call_count = local->call_count; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { @@ -3192,10 +2501,10 @@ afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, } } - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (fxattrop, frame, -1, op_errno, NULL, NULL); + AFR_STACK_UNWIND (fxattrop, frame, -1, op_errno, NULL, NULL); + return 0; } @@ -3203,8 +2512,8 @@ out: int32_t -afr_inodelk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) +afr_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t *local = NULL; @@ -3238,25 +2547,21 @@ afr_inodelk (call_frame_t *frame, xlator_t *this, { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int32_t op_errno = ENOMEM; priv = this->private; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) goto out; call_count = local->call_count; + if (!call_count) { + op_errno = ENOMEM; + goto out; + } for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { @@ -3270,18 +2575,17 @@ afr_inodelk (call_frame_t *frame, xlator_t *this, } } - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (inodelk, frame, -1, op_errno, NULL); + AFR_STACK_UNWIND (inodelk, frame, -1, op_errno, NULL); + return 0; } int32_t -afr_finodelk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *xdata) +afr_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t *local = NULL; @@ -3309,31 +2613,26 @@ afr_finodelk_cbk (call_frame_t *frame, void *cookie, int32_t -afr_finodelk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *flock, - dict_t *xdata) +afr_finodelk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + int32_t cmd, struct gf_flock *flock, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int32_t op_errno = ENOMEM; priv = this->private; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; call_count = local->call_count; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { @@ -3347,10 +2646,10 @@ afr_finodelk (call_frame_t *frame, xlator_t *this, } } - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (finodelk, frame, -1, op_errno, NULL); + AFR_STACK_UNWIND (finodelk, frame, -1, op_errno, NULL); + return 0; } @@ -3383,33 +2682,28 @@ afr_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } -int32_t -afr_entrylk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, - const char *basename, entrylk_cmd cmd, entrylk_type type, - dict_t *xdata) +int +afr_entrylk (call_frame_t *frame, xlator_t *this, const char *volume, + loc_t *loc, const char *basename, entrylk_cmd cmd, + entrylk_type type, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; int i = 0; int32_t call_count = 0; int32_t op_errno = 0; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - priv = this->private; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; call_count = local->call_count; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { @@ -3423,18 +2717,18 @@ afr_entrylk (call_frame_t *frame, xlator_t *this, } } - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (entrylk, frame, -1, op_errno, NULL); + AFR_STACK_UNWIND (entrylk, frame, -1, op_errno, NULL); + return 0; } -int32_t -afr_fentrylk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) +int +afr_fentrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t *local = NULL; @@ -3461,33 +2755,28 @@ afr_fentrylk_cbk (call_frame_t *frame, void *cookie, } -int32_t -afr_fentrylk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, - const char *basename, entrylk_cmd cmd, - entrylk_type type, dict_t *xdata) +int +afr_fentrylk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int32_t op_errno = ENOMEM; priv = this->private; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; call_count = local->call_count; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { @@ -3501,82 +2790,85 @@ afr_fentrylk (call_frame_t *frame, xlator_t *this, } } - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (fentrylk, frame, -1, op_errno, NULL); + AFR_STACK_UNWIND (fentrylk, frame, -1, op_errno, NULL); + return 0; } -int32_t -afr_statfs_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - struct statvfs *statvfs, dict_t *xdata) + +int +afr_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct statvfs *statvfs, dict_t *xdata) { afr_local_t *local = NULL; int call_count = 0; + struct statvfs *buf = NULL; LOCK (&frame->lock); { local = frame->local; - if (op_ret == 0) { - local->op_ret = op_ret; - - if (local->cont.statfs.buf_set) { - if (statvfs->f_bavail < local->cont.statfs.buf.f_bavail) - local->cont.statfs.buf = *statvfs; - } else { - local->cont.statfs.buf = *statvfs; - local->cont.statfs.buf_set = 1; - } - } - - if (op_ret == -1) + if (op_ret != 0) { local->op_errno = op_errno; + goto unlock; + } + local->op_ret = op_ret; + + buf = &local->cont.statfs.buf; + if (local->cont.statfs.buf_set) { + if (statvfs->f_bavail < buf->f_bavail) { + *buf = *statvfs; + if (xdata) { + if (local->xdata_rsp) + dict_unref (local->xdata_rsp); + local->xdata_rsp = dict_ref (xdata); + } + } + } else { + *buf = *statvfs; + local->cont.statfs.buf_set = 1; + if (xdata) + local->xdata_rsp = dict_ref (xdata); + } } +unlock: UNLOCK (&frame->lock); call_count = afr_frame_return (frame); if (call_count == 0) AFR_STACK_UNWIND (statfs, frame, local->op_ret, local->op_errno, - &local->cont.statfs.buf, xdata); + &local->cont.statfs.buf, local->xdata_rsp); return 0; } -int32_t -afr_statfs (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *xdata) +int +afr_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { - afr_private_t * priv = NULL; - int child_count = 0; afr_local_t * local = NULL; + afr_private_t *priv = NULL; int i = 0; - int ret = -1; int call_count = 0; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (loc, out); - - priv = this->private; - child_count = priv->child_count; + int32_t op_errno = ENOMEM; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; + priv = this->private; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; call_count = local->call_count; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } - for (i = 0; i < child_count; i++) { + for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND (frame, afr_statfs_cbk, priv->children[i], @@ -3587,10 +2879,10 @@ afr_statfs (call_frame_t *frame, xlator_t *this, } } - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (statfs, frame, -1, op_errno, NULL, NULL); + AFR_STACK_UNWIND (statfs, frame, -1, op_errno, NULL, NULL); + return 0; } @@ -3699,21 +2991,6 @@ afr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, AFR_STACK_UNWIND (lk, frame, -1, ENOTCONN, &local->cont.lk.ret_flock, NULL); } else { - /* locking has succeeded on all nodes that are up */ - - /* temporarily - ret = afr_mark_locked_nodes (this, local->fd, - local->cont.lk.locked_nodes); - if (ret) - gf_log (this->name, GF_LOG_DEBUG, - "Could not save locked nodes info in fdctx"); - - ret = afr_save_locked_fd (this, local->fd); - if (ret) - gf_log (this->name, GF_LOG_DEBUG, - "Could not save locked fd"); - - */ AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno, &local->cont.lk.ret_flock, NULL); } @@ -3729,20 +3006,12 @@ afr_lk (call_frame_t *frame, xlator_t *this, afr_private_t *priv = NULL; afr_local_t *local = NULL; int i = 0; - int32_t op_errno = 0; - int ret = -1; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int32_t op_errno = ENOMEM; priv = this->private; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) goto out; local->cont.lk.locked_nodes = GF_CALLOC (priv->child_count, @@ -3764,28 +3033,16 @@ afr_lk (call_frame_t *frame, xlator_t *this, priv->children[i]->fops->lk, fd, cmd, flock, xdata); - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (lk, frame, -1, op_errno, NULL, NULL); + AFR_STACK_UNWIND (lk, frame, -1, op_errno, NULL, NULL); + return 0; } int afr_forget (xlator_t *this, inode_t *inode) { - uint64_t ctx_addr = 0; - afr_inode_ctx_t *ctx = NULL; - - inode_ctx_get (inode, this, &ctx_addr); - - if (!ctx_addr) - goto out; - - ctx = (afr_inode_ctx_t *)(long)ctx_addr; - GF_FREE (ctx->fresh_children); - GF_FREE (ctx); -out: return 0; } @@ -3805,7 +3062,6 @@ afr_priv_dump (xlator_t *this) snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, this->name); gf_proc_dump_add_section(key_prefix); gf_proc_dump_write("child_count", "%u", priv->child_count); - gf_proc_dump_write("read_child_rr", "%u", priv->read_child_rr); for (i = 0; i < priv->child_count; i++) { sprintf (key, "child_up[%d]", i); gf_proc_dump_write(key, "%d", priv->child_up[i]); @@ -3862,7 +3118,7 @@ afr_notify (xlator_t *this, int32_t event, int idx = -1; int ret = -1; int call_psh = 0; - int up_child = AFR_ALL_CHILDREN; + int up_child = -1; dict_t *input = NULL; dict_t *output = NULL; @@ -3914,6 +3170,7 @@ afr_notify (xlator_t *this, int32_t event, */ if (priv->child_up[idx] != 1) { priv->up_count++; + priv->event_generation++; } priv->child_up[idx] = 1; @@ -3953,6 +3210,7 @@ afr_notify (xlator_t *this, int32_t event, */ if (priv->child_up[idx] == 1) { priv->down_count++; + priv->event_generation++; } priv->child_up[idx] = 0; @@ -4019,8 +3277,7 @@ afr_notify (xlator_t *this, int32_t event, LOCK (&priv->lock); { - up_children = afr_up_children_count (priv->child_up, - priv->child_count); + up_children = AFR_COUNT (priv->child_up, priv->child_count); for (i = 0; i < priv->child_count; i++) { if (priv->last_event[i] == GF_EVENT_CHILD_UP) { event = GF_EVENT_CHILD_UP; @@ -4040,39 +3297,23 @@ afr_notify (xlator_t *this, int32_t event, ret = 0; if (propagate) ret = default_notify (this, event, data); - if (call_psh && priv->shd.iamshd) - afr_proactive_self_heal ((void*) (long) up_child); + if (call_psh && priv->shd.iamshd) { + afr_selfheal_childup (this, up_child); + } out: return ret; } -int -afr_first_up_child (unsigned char *child_up, size_t child_count) -{ - int ret = -1; - int i = 0; - - GF_ASSERT (child_up); - - for (i = 0; i < child_count; i++) { - if (child_up[i]) { - ret = i; - break; - } - } - - return ret; -} int afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno) { - int ret = -1; - local->op_ret = -1; local->op_errno = EUCLEAN; + syncbarrier_init (&local->barrier); + local->child_up = GF_CALLOC (priv->child_count, sizeof (*local->child_up), gf_afr_mt_char); @@ -4084,38 +3325,42 @@ afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno) memcpy (local->child_up, priv->child_up, sizeof (*local->child_up) * priv->child_count); - local->call_count = afr_up_children_count (local->child_up, - priv->child_count); + local->call_count = AFR_COUNT (local->child_up, priv->child_count); if (local->call_count == 0) { gf_log (THIS->name, GF_LOG_INFO, "no subvolumes up"); if (op_errno) *op_errno = ENOTCONN; goto out; } + local->event_generation = priv->event_generation; - local->child_errno = GF_CALLOC (priv->child_count, - sizeof (*local->child_errno), - gf_afr_mt_int32_t); - if (!local->child_errno) { - if (op_errno) - *op_errno = ENOMEM; - goto out; - } + local->read_attempted = GF_CALLOC (priv->child_count, sizeof (char), + gf_afr_mt_char); + if (!local->read_attempted) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } - local->transaction.postop_piggybacked = GF_CALLOC (priv->child_count, - sizeof (int), - gf_afr_mt_int32_t); - if (!local->transaction.postop_piggybacked) { - if (op_errno) - *op_errno = ENOMEM; - goto out; - } + local->readable = GF_CALLOC (priv->child_count, sizeof (char), + gf_afr_mt_char); + if (!local->readable) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } - local->append_write = _gf_false; + local->replies = GF_CALLOC(priv->child_count, sizeof(*local->replies), + gf_afr_mt_reply_t); + if (!local->replies) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } - ret = 0; + return 0; out: - return ret; + return -1; } int @@ -4218,13 +3463,11 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this) } ret = -ENOMEM; - child_up_count = afr_up_children_count (local->child_up, - priv->child_count); + child_up_count = AFR_COUNT (local->child_up, priv->child_count); if (priv->optimistic_change_log && child_up_count == priv->child_count) local->optimistic_change_log = 1; - local->first_up_child = afr_first_up_child (local->child_up, - priv->child_count); + local->pre_op_compat = priv->pre_op_compat; local->transaction.eager_lock = GF_CALLOC (sizeof (*local->transaction.eager_lock), @@ -4234,26 +3477,29 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this) if (!local->transaction.eager_lock) goto out; - local->fresh_children = afr_children_create (priv->child_count); - if (!local->fresh_children) - goto out; - local->transaction.pre_op = GF_CALLOC (sizeof (*local->transaction.pre_op), priv->child_count, gf_afr_mt_char); if (!local->transaction.pre_op) goto out; + local->transaction.fop_subvols = GF_CALLOC (sizeof (*local->transaction.fop_subvols), + priv->child_count, + gf_afr_mt_char); + if (!local->transaction.fop_subvols) + goto out; + + local->transaction.failed_subvols = GF_CALLOC (sizeof (*local->transaction.failed_subvols), + priv->child_count, + gf_afr_mt_char); + if (!local->transaction.failed_subvols) + goto out; + local->pending = afr_matrix_create (priv->child_count, AFR_NUM_CHANGE_LOGS); if (!local->pending) goto out; - local->transaction.txn_changelog = afr_matrix_create (priv->child_count, - AFR_NUM_CHANGE_LOGS); - if (!local->transaction.txn_changelog) - goto out; - INIT_LIST_HEAD (&local->transaction.eager_locked); ret = 0; @@ -4261,86 +3507,6 @@ out: return ret; } -void -afr_reset_children (int32_t *fresh_children, int32_t child_count) -{ - unsigned int i = 0; - for (i = 0; i < child_count; i++) - fresh_children[i] = -1; -} - -int32_t* -afr_children_create (int32_t child_count) -{ - int32_t *children = NULL; - int i = 0; - - GF_ASSERT (child_count > 0); - - children = GF_CALLOC (child_count, sizeof (*children), - gf_afr_mt_int32_t); - if (NULL == children) - goto out; - for (i = 0; i < child_count; i++) - children[i] = -1; -out: - return children; -} - -void -afr_children_add_child (int32_t *children, int32_t child, - int32_t child_count) -{ - gf_boolean_t child_found = _gf_false; - int i = 0; - - for (i = 0; i < child_count; i++) { - if (children[i] == -1) - break; - if (children[i] == child) { - child_found = _gf_true; - break; - } - } - - if (!child_found) { - GF_ASSERT (i < child_count); - children[i] = child; - } -} - -void -afr_children_rm_child (int32_t *children, int32_t child, int32_t child_count) -{ - int i = 0; - - GF_ASSERT ((child >= 0) && (child < child_count)); - for (i = 0; i < child_count; i++) { - if (children[i] == -1) - break; - if (children[i] == child) { - if (i != (child_count - 1)) - memmove (children + i, children + i + 1, - sizeof (*children)*(child_count - i - 1)); - children[child_count - 1] = -1; - break; - } - } -} - -int -afr_get_children_count (int32_t *children, unsigned int child_count) -{ - int count = 0; - int i = 0; - - for (i = 0; i < child_count; i++) { - if (children[i] == -1) - break; - count++; - } - return count; -} void afr_set_low_priority (call_frame_t *frame) @@ -4348,38 +3514,6 @@ afr_set_low_priority (call_frame_t *frame) frame->root->pid = LOW_PRIO_PROC_PID; } -int -afr_child_fd_ctx_set (xlator_t *this, fd_t *fd, int32_t child, - int flags) -{ - int ret = 0; - uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; - - GF_ASSERT (fd && fd->inode); - ret = afr_fd_ctx_set (this, fd); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "could not set fd ctx for fd=%p", fd); - goto out; - } - - ret = fd_ctx_get (fd, this, &ctx); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "could not get fd ctx for fd=%p", fd); - goto out; - } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - fd_ctx->opened_on[child] = AFR_FD_OPENED; - if (!IA_ISDIR (fd->inode->ia_type)) { - fd_ctx->flags = flags; - } - ret = 0; -out: - return ret; -} gf_boolean_t afr_have_quorum (char *logname, afr_private_t *priv) @@ -4426,33 +3560,6 @@ afr_priv_destroy (afr_private_t *priv) if (!priv) goto out; inode_unref (priv->root_inode); - GF_FREE (priv->shd.pos); - GF_FREE (priv->shd.pending); - GF_FREE (priv->shd.inprogress); -// for (i = 0; i < priv->child_count; i++) -// if (priv->shd.timer && priv->shd.timer[i]) -// gf_timer_call_cancel (this->ctx, priv->shd.timer[i]); - GF_FREE (priv->shd.timer); - - if (priv->shd.healed) - eh_destroy (priv->shd.healed); - - if (priv->shd.heal_failed) - eh_destroy (priv->shd.heal_failed); - - if (priv->shd.split_brain) - eh_destroy (priv->shd.split_brain); - - for (i = 0; i < priv->child_count; i++) - { - if (priv->shd.statistics[i]) - eh_destroy (priv->shd.statistics[i]); - } - - GF_FREE (priv->shd.statistics); - - GF_FREE (priv->shd.crawl_events); - GF_FREE (priv->last_event); if (priv->pending_key) { for (i = 0; i < priv->child_count; i++) @@ -4462,8 +3569,7 @@ afr_priv_destroy (afr_private_t *priv) GF_FREE (priv->children); GF_FREE (priv->child_up); LOCK_DESTROY (&priv->lock); - LOCK_DESTROY (&priv->read_child_lock); - pthread_mutex_destroy (&priv->mutex); + GF_FREE (priv); out: return; @@ -4480,124 +3586,21 @@ xlator_subvolume_count (xlator_t *this) return i; } -inline gf_boolean_t -afr_is_errno_set (int *child_errno, int child) -{ - return child_errno[child]; -} - -inline gf_boolean_t -afr_is_errno_unset (int *child_errno, int child) -{ - return !afr_is_errno_set (child_errno, child); -} - -void -afr_prepare_new_entry_pending_matrix (int32_t **pending, - gf_boolean_t (*is_pending) (int *, int), - int *ctx, struct iatt *buf, - unsigned int child_count) -{ - int midx = 0; - int idx = 0; - int i = 0; - - midx = afr_index_for_transaction_type (AFR_METADATA_TRANSACTION); - if (IA_ISDIR (buf->ia_type)) - idx = afr_index_for_transaction_type (AFR_ENTRY_TRANSACTION); - else if (IA_ISREG (buf->ia_type)) - idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION); - else - idx = -1; - for (i = 0; i < child_count; i++) { - if (is_pending (ctx, i)) { - pending[i][midx] = hton32 (1); - if (idx == -1) - continue; - pending[i][idx] = hton32 (1); - } - } -} - -gf_boolean_t -afr_is_fd_fixable (fd_t *fd) -{ - if (!fd || !fd->inode) - return _gf_false; - else if (fd_is_anonymous (fd)) - return _gf_false; - else if (uuid_is_null (fd->inode->gfid)) - return _gf_false; - - return _gf_true; -} void afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; - inode_t *inode = NULL; - afr_inode_ctx_t *ctx = NULL; + afr_fd_ctx_t *fd_ctx = NULL; local = frame->local; - if (local->fd) - inode = local->fd->inode; - else - inode = local->loc.inode; - - if (!inode) - return; - - LOCK (&inode->lock); - { - ctx = __afr_inode_ctx_get (inode, this); - ctx->open_fd_count = local->open_fd_count; - } - UNLOCK (&inode->lock); -} - -int -afr_initialise_statistics (xlator_t *this) -{ - afr_private_t *priv = NULL; - int ret = -1; - int i = 0; - int child_count = 0; - eh_t *stats_per_brick = NULL; - shd_crawl_event_t ***shd_crawl_events = NULL; - priv = this->private; - - priv->shd.statistics = GF_CALLOC (sizeof(eh_t *), priv->child_count, - gf_common_mt_eh_t); - if (!priv->shd.statistics) { - ret = -1; - goto out; - } - child_count = priv->child_count; - for (i=0; i < child_count ; i++) { - stats_per_brick = eh_new (AFR_STATISTICS_HISTORY_SIZE, - _gf_false, - _destroy_crawl_event_data); - if (!stats_per_brick) { - ret = -1; - goto out; - } - priv->shd.statistics[i] = stats_per_brick; - - } - - shd_crawl_events = (shd_crawl_event_t***)(&priv->shd.crawl_events); - *shd_crawl_events = GF_CALLOC (sizeof(shd_crawl_event_t*), - priv->child_count, - gf_afr_mt_shd_crawl_event_t); + if (!local->fd) + return; - if (!priv->shd.crawl_events) { - ret = -1; - goto out; - } - ret = 0; -out: - return ret; + fd_ctx = afr_fd_ctx_get (local->fd, this); + if (!fd_ctx) + return; + fd_ctx->open_fd_count = local->open_fd_count; } diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c index 689dd84e646..fa1da3958df 100644 --- a/xlators/cluster/afr/src/afr-dir-read.c +++ b/xlators/cluster/afr/src/afr-dir-read.c @@ -37,177 +37,7 @@ #include "checksum.h" #include "afr.h" -#include "afr-self-heal.h" -#include "afr-self-heal-common.h" - -int -afr_examine_dir_sh_unwind (call_frame_t *frame, xlator_t *this, int32_t op_ret, - int32_t op_errno, int32_t sh_failed) -{ - afr_local_t *local = NULL; - - local = frame->local; - - afr_set_opendir_done (this, local->fd->inode); - - AFR_STACK_UNWIND (opendir, frame, local->op_ret, - local->op_errno, local->fd, NULL); - - return 0; -} - - -gf_boolean_t -__checksums_differ (uint32_t *checksum, int child_count, - unsigned char *child_up) -{ - int ret = _gf_false; - int i = 0; - uint32_t cksum = 0; - gf_boolean_t activate_check = _gf_false; - - for (i = 0; i < child_count; i++) { - if (!child_up[i]) - continue; - if (_gf_false == activate_check) { - cksum = checksum[i]; - activate_check = _gf_true; - continue; - } - - if (cksum != checksum[i]) { - ret = _gf_true; - break; - } - - cksum = checksum[i]; - } - - return ret; -} - - -int32_t -afr_examine_dir_readdir_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - gf_dirent_t *entries, dict_t *xdata) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_self_heal_t * sh = NULL; - gf_dirent_t * entry = NULL; - gf_dirent_t * tmp = NULL; - char *reason = NULL; - int child_index = 0; - uint32_t entry_cksum = 0; - int call_count = 0; - off_t last_offset = 0; - inode_t *inode = NULL; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - inode = local->fd->inode; - - child_index = (long) cookie; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_INFO, - "%s: failed to do opendir on %s", - local->loc.path, priv->children[child_index]->name); - local->op_ret = -1; - local->op_ret = op_errno; - goto out; - } - - if (op_ret == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "%s: no entries found in %s", - local->loc.path, priv->children[child_index]->name); - goto out; - } - - list_for_each_entry_safe (entry, tmp, &entries->list, list) { - entry_cksum = gf_rsync_weak_checksum ((unsigned char *)entry->d_name, - strlen (entry->d_name)); - local->cont.opendir.checksum[child_index] ^= entry_cksum; - } - - list_for_each_entry (entry, &entries->list, list) { - last_offset = entry->d_off; - } - - /* read more entries */ - - STACK_WIND_COOKIE (frame, afr_examine_dir_readdir_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->readdir, - local->fd, 131072, last_offset, NULL); - - return 0; - -out: - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if (__checksums_differ (local->cont.opendir.checksum, - priv->child_count, - local->child_up)) { - - sh->do_entry_self_heal = _gf_true; - sh->forced_merge = _gf_true; - - reason = "checksums of directory differ"; - afr_launch_self_heal (frame, this, inode, _gf_false, - inode->ia_type, reason, NULL, - afr_examine_dir_sh_unwind); - } else { - afr_set_opendir_done (this, inode); - - AFR_STACK_UNWIND (opendir, frame, local->op_ret, - local->op_errno, local->fd, NULL); - } - } - - return 0; -} - - -int -afr_examine_dir (call_frame_t *frame, xlator_t *this) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - int i = 0; - int call_count = 0; - - local = frame->local; - priv = this->private; - - local->cont.opendir.checksum = GF_CALLOC (priv->child_count, - sizeof (*local->cont.opendir.checksum), - gf_afr_mt_int32_t); - - call_count = afr_up_children_count (local->child_up, priv->child_count); - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_examine_dir_readdir_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->readdir, - local->fd, 131072, 0, NULL); - - if (!--call_count) - break; - } - } - - return 0; -} +#include "afr-transaction.h" int32_t @@ -215,112 +45,66 @@ afr_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) { - afr_private_t *priv = NULL; afr_local_t *local = NULL; - int32_t up_children_count = 0; - int ret = -1; int call_count = -1; int32_t child_index = 0; + afr_fd_ctx_t *fd_ctx = NULL; - priv = this->private; local = frame->local; + fd_ctx = local->fd_ctx; child_index = (long) cookie; - up_children_count = afr_up_children_count (local->child_up, - priv->child_count); - LOCK (&frame->lock); { - if (op_ret >= 0) { + if (op_ret == -1) { + local->op_errno = op_errno; + fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED; + } else { local->op_ret = op_ret; - ret = afr_child_fd_ctx_set (this, fd, child_index, 0); - if (ret) { - local->op_ret = -1; - local->op_errno = -ret; - goto unlock; - } + fd_ctx->opened_on[child_index] = AFR_FD_OPENED; + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref (xdata); } - - local->op_errno = op_errno; } -unlock: UNLOCK (&frame->lock); call_count = afr_frame_return (frame); - if (call_count == 0) { - if (local->op_ret != 0) - goto out; - - if (!afr_is_opendir_done (this, local->fd->inode) && - up_children_count > 1 && priv->entry_self_heal) { - - /* - * This is the first opendir on this inode. We need - * to check if the directory's entries are the same - * on all subvolumes. This is needed in addition - * to regular entry self-heal because the readdir - * call is sent only to the first subvolume, and - * thus files that exist only there will never be healed - * otherwise (assuming changelog shows no anomalies). - */ - - gf_log (this->name, GF_LOG_TRACE, - "reading contents of directory %s looking for mismatch", - local->loc.path); - - afr_examine_dir (frame, this); - - } else { - /* do the unwind */ - goto out; - } - } - - return 0; - -out: - AFR_STACK_UNWIND (opendir, frame, local->op_ret, - local->op_errno, local->fd, NULL); - + if (call_count == 0) + AFR_STACK_UNWIND (opendir, frame, local->op_ret, + local->op_errno, local->fd, NULL); return 0; } -int32_t -afr_opendir (call_frame_t *frame, xlator_t *this, - loc_t *loc, fd_t *fd) +int +afr_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd) { afr_private_t * priv = NULL; afr_local_t * local = NULL; - int child_count = 0; int i = 0; - int ret = -1; int call_count = -1; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int32_t op_errno = ENOMEM; + afr_fd_ctx_t *fd_ctx = NULL; priv = this->private; - child_count = priv->child_count; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) + goto out; loc_copy (&local->loc, loc); local->fd = fd_ref (fd); + local->fd_ctx = fd_ctx; call_count = local->call_count; - for (i = 0; i < child_count; i++) { + for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND_COOKIE (frame, afr_opendir_cbk, (void*) (long) i, @@ -333,182 +117,280 @@ afr_opendir (call_frame_t *frame, xlator_t *this, } } - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (opendir, frame, -1, op_errno, fd, NULL); - + AFR_STACK_UNWIND (opendir, frame, -1, op_errno, fd, NULL); return 0; } -/** - * Common algorithm for directory read calls: - * - * - Try the fop on the first child that is up - * - if we have failed due to ENOTCONN: - * try the next child - * - * Applicable to: readdir - */ +#define BACKEND_D_OFF_BITS 63 +#define PRESENT_D_OFF_BITS 63 +#define ONE 1ULL +#define MASK (~0ULL) +#define PRESENT_MASK (MASK >> (64 - PRESENT_D_OFF_BITS)) +#define BACKEND_MASK (MASK >> (64 - BACKEND_D_OFF_BITS)) -struct entry_name { - char *name; - struct list_head list; -}; +#define TOP_BIT (ONE << (PRESENT_D_OFF_BITS - 1)) +#define SHIFT_BITS (max (0, (BACKEND_D_OFF_BITS - PRESENT_D_OFF_BITS + 1))) -static void -afr_forget_entries (fd_t *fd) +static uint64_t +afr_bits_for (uint64_t num) { - struct entry_name *entry = NULL; - struct entry_name *tmp = NULL; - int ret = 0; - uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; - - ret = fd_ctx_get (fd, THIS, &ctx); - if (ret < 0) { - gf_log (THIS->name, GF_LOG_INFO, - "could not get fd ctx for fd=%p", fd); - return; - } + uint64_t bits = 0, ctrl = 1; - fd_ctx = (afr_fd_ctx_t *)(long) ctx; + while (ctrl < num) { + ctrl *= 2; + bits ++; + } - list_for_each_entry_safe (entry, tmp, &fd_ctx->entries, list) { - GF_FREE (entry->name); - list_del (&entry->list); - GF_FREE (entry); - } + return bits; } -static void -afr_readdir_filter_trash_dir (gf_dirent_t *entries, fd_t *fd) +int +afr_itransform (xlator_t *this, int subvol, uint64_t x, uint64_t *y_p) { - gf_dirent_t * entry = NULL; - gf_dirent_t * tmp = NULL; + afr_private_t *conf = NULL; + int cnt = 0; + int max = 0; + uint64_t y = 0; + uint64_t hi_mask = 0; + uint64_t off_mask = 0; + int max_bits = 0; + + if (x == ((uint64_t) -1)) { + y = (uint64_t) -1; + goto out; + } - list_for_each_entry_safe (entry, tmp, &entries->list, list) { - if (__is_root_gfid (fd->inode->gfid) && - !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) { - list_del_init (&entry->list); - GF_FREE (entry); - } + conf = this->private; + if (!conf) + goto out; + + max = conf->child_count; + cnt = subvol; + + if (max == 1) { + y = x; + goto out; + } + + max_bits = afr_bits_for (max); + + hi_mask = ~(PRESENT_MASK >> (max_bits + 1)); + + if (x & hi_mask) { + /* HUGE d_off */ + off_mask = MASK << max_bits; + y = TOP_BIT | ((x >> SHIFT_BITS) & off_mask) | cnt; + } else { + /* small d_off */ + y = ((x * max) + cnt); } + +out: + if (y_p) + *y_p = y; + + return 0; } -int32_t -afr_readdir_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - gf_dirent_t *entries, dict_t *xdata) + +int +afr_deitransform (xlator_t *this, uint64_t y, int *subvol_p, + uint64_t *x_p) { - afr_local_t *local = NULL; + afr_private_t *conf = NULL; + int cnt = 0; + int max = 0; + uint64_t x = 0; + int subvol = 0; + int max_bits = 0; + uint64_t off_mask = 0; + uint64_t host_mask = 0; + + if (!this->private) + return -1; + + conf = this->private; + max = conf->child_count; + + if (max == 1) { + x = y; + cnt = 0; + goto out; + } + + if (y & TOP_BIT) { + /* HUGE d_off */ + max_bits = afr_bits_for (max); + off_mask = (MASK << max_bits); + host_mask = ~(off_mask); + + x = ((y & ~TOP_BIT) & off_mask) << SHIFT_BITS; + + cnt = y & host_mask; + } else { + /* small d_off */ + cnt = y % max; + x = y / max; + } - if (op_ret == -1) - goto out; +out: + subvol = cnt; - local = frame->local; - afr_readdir_filter_trash_dir (entries, local->fd); + if (subvol_p) + *subvol_p = subvol; + + if (x_p) + *x_p = x; -out: - AFR_STACK_UNWIND (readdir, frame, op_ret, op_errno, entries, NULL); return 0; } -int32_t -afr_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, - dict_t *xdata) +static void +afr_readdir_transform_entries (gf_dirent_t *subvol_entries, int subvol, + gf_dirent_t *entries, fd_t *fd) { - afr_local_t *local = NULL; + afr_private_t *priv = NULL; + gf_dirent_t *entry = NULL; + gf_dirent_t *tmp = NULL; + unsigned char *data_readable = NULL; + unsigned char *metadata_readable = NULL; + int gen = 0; - if (op_ret == -1) - goto out; + priv = THIS->private; - local = frame->local; - afr_readdir_filter_trash_dir (entries, local->fd); + data_readable = alloca0 (priv->child_count); + metadata_readable = alloca0 (priv->child_count); -out: - AFR_STACK_UNWIND (readdirp, frame, op_ret, op_errno, entries, NULL); - return 0; + list_for_each_entry_safe (entry, tmp, &subvol_entries->list, list) { + if (__is_root_gfid (fd->inode->gfid) && + !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) { + continue; + } + + list_del_init (&entry->list); + afr_itransform (THIS, subvol, entry->d_off, &entry->d_off); + list_add_tail (&entry->list, &entries->list); + + if (entry->inode) { + gen = 0; + afr_inode_read_subvol_get (entry->inode, THIS, + data_readable, + metadata_readable, &gen); + + if (gen != priv->event_generation || + !data_readable[subvol] || + !metadata_readable[subvol]) { + + inode_unref (entry->inode); + entry->inode = NULL; + } + } + } } + int32_t -afr_do_readdir (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset, int whichop, dict_t *dict) +afr_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, gf_dirent_t *subvol_entries, + dict_t *xdata) { - afr_private_t *priv = NULL; - xlator_t **children = NULL; - int call_child = 0; - afr_local_t *local = NULL; - afr_fd_ctx_t *fd_ctx = NULL; - int ret = -1; - int32_t op_errno = 0; - uint64_t read_child = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + afr_local_t *local = NULL; + gf_dirent_t entries; - priv = this->private; - children = priv->children; + INIT_LIST_HEAD (&entries.list); - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); local = frame->local; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + if (op_ret < 0 && !local->cont.readdir.offset) { + /* failover only if this was first readdir, detected + by offset == 0 */ + local->op_ret = op_ret; + local->op_errno = op_errno; - local->fresh_children = afr_children_create (priv->child_count); - if (!local->fresh_children) { - op_errno = ENOMEM; - goto out; - } + afr_read_txn_continue (frame, this, (long) cookie); + return 0; + } - read_child = afr_inode_get_read_ctx (this, fd->inode, - local->fresh_children); - ret = afr_get_call_child (this, local->child_up, read_child, - local->fresh_children, - &call_child, - &local->cont.readdir.last_index); - if (ret < 0) { - op_errno = -ret; - goto out; - } + if (op_ret >= 0) + afr_readdir_transform_entries (subvol_entries, (long) cookie, + &entries, local->fd); - fd_ctx = afr_fd_ctx_get (fd, this); - if (!fd_ctx) { - op_errno = EBADF; - goto out; - } + AFR_STACK_UNWIND (readdir, frame, op_ret, op_errno, &entries, xdata); - if ((offset == 0) || (fd_ctx->call_child == -1)) { - fd_ctx->call_child = call_child; - } else if ((priv->readdir_failover == _gf_false) && - (call_child != fd_ctx->call_child)) { - op_errno = EBADF; - goto out; - } + return 0; +} + + +int +afr_readdir_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + priv = this->private; + local = frame->local; - local->fd = fd_ref (fd); - local->cont.readdir.size = size; - local->cont.readdir.dict = (dict)? dict_ref (dict) : NULL; + if (subvol == -1) { + AFR_STACK_UNWIND (readdir, frame, local->op_ret, + local->op_errno, 0, 0); + return 0; + } - if (whichop == GF_FOP_READDIR) + if (local->op == GF_FOP_READDIR) STACK_WIND_COOKIE (frame, afr_readdir_cbk, - (void *) (long) call_child, - children[call_child], - children[call_child]->fops->readdir, fd, - size, offset, dict); + (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->readdir, + local->fd, local->cont.readdir.size, + local->cont.readdir.offset, + local->xdata_req); else - STACK_WIND_COOKIE (frame, afr_readdirp_cbk, - (void *) (long) call_child, - children[call_child], - children[call_child]->fops->readdirp, fd, - size, offset, dict); + STACK_WIND_COOKIE (frame, afr_readdir_cbk, + (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->readdirp, + local->fd, local->cont.readdir.size, + local->cont.readdir.offset, + local->xdata_req); + return 0; +} + + +int +afr_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, int whichop, dict_t *dict) +{ + afr_local_t *local = NULL; + int32_t op_errno = 0; + int subvol = -1; + + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; + + local->op = whichop; + local->fd = fd_ref (fd); + local->cont.readdir.size = size; + local->cont.readdir.offset = offset; + local->xdata_req = (dict)? dict_ref (dict) : NULL; + + if (offset == 0) { + /* First readdir has option of failing over and selecting + an appropriate read subvolume */ + afr_read_txn (frame, this, fd->inode, afr_readdir_wind, + AFR_DATA_TRANSACTION); + } else { + /* But continued readdirs MUST stick to the same subvolume + without an option to failover */ + afr_deitransform (this, offset, &subvol, + (uint64_t *)&local->cont.readdir.offset); + afr_readdir_wind (frame, this, subvol); + } return 0; out: @@ -521,7 +403,8 @@ int32_t afr_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, off_t offset, dict_t *xdata) { - afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIR, xdata); + afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIR, xdata); + return 0; } @@ -531,6 +414,7 @@ afr_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, off_t offset, dict_t *dict) { afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIRP, dict); + return 0; } @@ -538,7 +422,6 @@ afr_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, int32_t afr_releasedir (xlator_t *this, fd_t *fd) { - afr_forget_entries (fd); afr_cleanup_fd_ctx (this, fd); return 0; diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c index 1943b719bb5..465dde54f9c 100644 --- a/xlators/cluster/afr/src/afr-dir-write.c +++ b/xlators/cluster/afr/src/afr-dir-write.c @@ -34,10 +34,14 @@ #include "common-utils.h" #include "compat-errno.h" #include "compat.h" +#include "byte-order.h" #include "afr.h" #include "afr-transaction.h" +void +afr_mark_entry_pending_changelog (call_frame_t *frame, xlator_t *this); + int afr_build_parent_loc (loc_t *parent, loc_t *child, int32_t *op_errno) { @@ -56,79 +60,214 @@ afr_build_parent_loc (loc_t *parent, loc_t *child, int32_t *op_errno) *op_errno = ENOMEM; goto out; } - parent->path = gf_strdup( dirname (child_path) ); - if (!parent->path) { + + parent->path = gf_strdup (dirname (child_path)); + if (!parent->path) { if (op_errno) *op_errno = ENOMEM; goto out; } - parent->inode = inode_ref (child->parent); - uuid_copy (parent->gfid, child->pargfid); + + parent->inode = inode_ref (child->parent); + uuid_copy (parent->gfid, child->pargfid); ret = 0; out: - GF_FREE(child_path); + GF_FREE (child_path); return ret; } -void -__dir_entry_fop_common_cbk (call_frame_t *frame, int child_index, - xlator_t *this, int32_t op_ret, - int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent, struct iatt *prenewparent, - struct iatt *postnewparent) + +static void +__afr_dir_write_finalize (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int inode_read_subvol = -1; + int parent_read_subvol = -1; + int parent2_read_subvol = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + if (local->inode) { + afr_replies_interpret (frame, this, local->inode); + inode_read_subvol = afr_data_subvol_get (local->inode, this, + NULL, NULL); + } + if (local->parent) + parent_read_subvol = afr_data_subvol_get (local->parent, this, + NULL, NULL); + if (local->parent2) + parent2_read_subvol = afr_data_subvol_get (local->parent2, this, + NULL, NULL); + + local->op_ret = -1; + local->op_errno = afr_final_errno (local, priv); + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret < 0) { + if (local->inode) + afr_inode_read_subvol_reset (local->inode, + this); + if (local->parent) + afr_inode_read_subvol_reset (local->parent, + this); + if (local->parent2) + afr_inode_read_subvol_reset (local->parent2, + this); + continue; + } + + if (local->op_ret == -1) { + local->op_ret = local->replies[i].op_ret; + local->op_errno = local->replies[i].op_errno; + + local->cont.dir_fop.buf = + local->replies[i].poststat; + local->cont.dir_fop.preparent = + local->replies[i].preparent; + local->cont.dir_fop.postparent = + local->replies[i].postparent; + local->cont.dir_fop.prenewparent = + local->replies[i].preparent2; + local->cont.dir_fop.postnewparent = + local->replies[i].postparent2; + if (local->replies[i].xdata) + local->xdata_rsp = + dict_ref (local->replies[i].xdata); + continue; + } + + if (i == inode_read_subvol) { + local->cont.dir_fop.buf = + local->replies[i].poststat; + if (local->replies[i].xdata) { + if (local->xdata_rsp) + dict_unref (local->xdata_rsp); + local->xdata_rsp = + dict_ref (local->replies[i].xdata); + } + } + + if (i == parent_read_subvol) { + local->cont.dir_fop.preparent = + local->replies[i].preparent; + local->cont.dir_fop.postparent = + local->replies[i].postparent; + } + + if (i == parent2_read_subvol) { + local->cont.dir_fop.prenewparent = + local->replies[i].preparent2; + local->cont.dir_fop.postnewparent = + local->replies[i].postparent2; + } + } +} + + +static void +__afr_dir_write_fill (call_frame_t *frame, xlator_t *this, int child_index, + int op_ret, int op_errno, struct iatt *poststat, + struct iatt *preparent, struct iatt *postparent, + struct iatt *preparent2, struct iatt *postparent2, + dict_t *xdata) +{ + afr_local_t *local = NULL; + afr_fd_ctx_t *fd_ctx = NULL; local = frame->local; + fd_ctx = local->fd_ctx; + + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + + if (op_ret >= 0) { + if (poststat) + local->replies[child_index].poststat = *poststat; + if (preparent) + local->replies[child_index].preparent = *preparent; + if (postparent) + local->replies[child_index].postparent = *postparent; + if (preparent2) + local->replies[child_index].preparent2 = *preparent2; + if (postparent2) + local->replies[child_index].postparent2 = *postparent2; + if (xdata) + local->replies[child_index].xdata = dict_ref (xdata); + + if (fd_ctx) + fd_ctx->opened_on[child_index] = AFR_FD_OPENED; + } else { + if (op_errno != ENOTEMPTY) + afr_transaction_fop_failed (frame, this, child_index); + if (fd_ctx) + fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED; + } + + return; +} + + +static int +__afr_dir_write_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, + struct iatt *preparent2, struct iatt *postparent2, + dict_t *xdata) +{ + afr_local_t *local = NULL; + int child_index = (long) cookie; + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + __afr_dir_write_fill (frame, this, child_index, op_ret, + op_errno, buf, preparent, postparent, + preparent2, postparent2, xdata); + } + UNLOCK (&frame->lock); + call_count = afr_frame_return (frame); + + if (call_count == 0) { + __afr_dir_write_finalize (frame, this); + + if (afr_txn_nothing_failed (frame, this)) + local->transaction.unwind (frame, this); - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret > -1) { - local->op_ret = op_ret; - - if ((local->success_count == 0) || - (child_index == local->read_child_index)) { - local->cont.dir_fop.preparent = *preparent; - local->cont.dir_fop.postparent = *postparent; - if (buf) - local->cont.dir_fop.buf = *buf; - if (prenewparent) - local->cont.dir_fop.prenewparent = *prenewparent; - if (postnewparent) - local->cont.dir_fop.postnewparent = *postnewparent; - } - - local->cont.dir_fop.inode = inode; - - local->fresh_children[local->success_count] = child_index; - local->success_count++; - local->child_errno[child_index] = 0; - } else { - local->child_errno[child_index] = op_errno; + afr_mark_entry_pending_changelog (frame, this); + + local->transaction.resume (frame, this); } - local->op_errno = op_errno; + return 0; } + int afr_mark_new_entry_changelog_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, + xlator_t *this, int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) { - int call_count = 0; + int call_count = 0; call_count = afr_frame_return (frame); - if (call_count == 0) { + + if (call_count == 0) AFR_STACK_DESTROY (frame); - } + return 0; } + void afr_mark_new_entry_changelog (call_frame_t *frame, xlator_t *this) { @@ -136,125 +275,109 @@ afr_mark_new_entry_changelog (call_frame_t *frame, xlator_t *this) afr_local_t *local = NULL; afr_local_t *new_local = NULL; afr_private_t *priv = NULL; - dict_t **xattr = NULL; + dict_t *xattr = NULL; int32_t **changelog = NULL; int i = 0; - GF_UNUSED int op_errno = 0; + int idx = 0; + int op_errno = ENOMEM; + unsigned char *pending = NULL; + int call_count = 0; local = frame->local; priv = this->private; new_frame = copy_frame (frame); - if (!new_frame) { + if (!new_frame) goto out; - } - AFR_LOCAL_ALLOC_OR_GOTO (new_frame->local, out); - new_local = new_frame->local; + new_local = AFR_FRAME_INIT (new_frame, op_errno); + if (!new_local) + goto out; + changelog = afr_matrix_create (priv->child_count, AFR_NUM_CHANGE_LOGS); if (!changelog) goto out; - xattr = GF_CALLOC (priv->child_count, sizeof (*xattr), - gf_afr_mt_dict_t); - if (!xattr) - goto out; - for (i = 0; i < priv->child_count; i++) { - if (local->child_errno[i]) - continue; - xattr[i] = dict_new (); - if (!xattr[i]) - goto out; - } + xattr = dict_new (); + if (!xattr) + goto out; + + idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION); - afr_prepare_new_entry_pending_matrix (changelog, - afr_is_errno_set, - local->child_errno, - &local->cont.dir_fop.buf, - priv->child_count); + pending = alloca0 (priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i] && + !local->transaction.failed_subvols[i]) { + call_count ++; + continue; + } + + changelog[i][idx] = hton32(1); + pending[i] = 1; + } new_local->pending = changelog; uuid_copy (new_local->loc.gfid, local->cont.dir_fop.buf.ia_gfid); - new_local->loc.inode = inode_ref (local->cont.dir_fop.inode); - new_local->call_count = local->success_count; + new_local->loc.inode = inode_ref (local->inode); + + + afr_set_pending_dict (priv, xattr, changelog); + + new_local->call_count = call_count; for (i = 0; i < priv->child_count; i++) { - if (local->child_errno[i]) + if (pending[i]) continue; - afr_set_pending_dict (priv, xattr[i], changelog, i, LOCAL_LAST); STACK_WIND_COOKIE (new_frame, afr_mark_new_entry_changelog_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->xattrop, &new_local->loc, GF_XATTROP_ADD_ARRAY, - xattr[i], NULL); + xattr, NULL); + if (!--call_count) + break; } + new_frame = NULL; out: if (new_frame) AFR_STACK_DESTROY (new_frame); - afr_xattr_array_destroy (xattr, priv->child_count); + if (xattr) + dict_unref (xattr); return; } -gf_boolean_t -afr_is_new_entry_changelog_needed (glusterfs_fop_t fop) -{ - glusterfs_fop_t fops[] = {GF_FOP_CREATE, GF_FOP_MKNOD, GF_FOP_NULL}; - int i = 0; - - for (i = 0; fops[i] != GF_FOP_NULL; i++) { - if (fop == fops[i]) - return _gf_true; - } - return _gf_false; -} void -afr_dir_fop_mark_entry_pending_changelog (call_frame_t *frame, xlator_t *this) +afr_mark_entry_pending_changelog (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int pre_op_count = 0; + int failed_count = 0; local = frame->local; priv = this->private; if (local->op_ret < 0) - goto out; + return; - if (local->success_count == priv->child_count) - goto out; + if (local->op != GF_FOP_CREATE && local->op != GF_FOP_MKNOD) + return; - if (!afr_is_new_entry_changelog_needed (local->op)) - goto out; + pre_op_count = AFR_COUNT (local->transaction.pre_op, priv->child_count); + failed_count = AFR_COUNT (local->transaction.failed_subvols, + priv->child_count); + + if (pre_op_count == priv->child_count && !failed_count) + return; afr_mark_new_entry_changelog (frame, this); -out: return; } -void -afr_dir_fop_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - local = frame->local; - priv = this->private; - - if (local->cont.dir_fop.inode == NULL) - goto done; - afr_set_read_ctx_from_policy (this, local->cont.dir_fop.inode, - local->fresh_children, - local->read_child_index, - priv->read_child, - local->cont.dir_fop.buf.ia_gfid); -done: - local->transaction.unwind (frame, this); - afr_dir_fop_mark_entry_pending_changelog (frame, this); - local->transaction.resume (frame, this); -} /* {{{ create */ @@ -266,26 +389,16 @@ afr_create_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (create, main_frame, - local->op_ret, local->op_errno, - local->cont.create.fd, - local->cont.dir_fop.inode, - &local->cont.dir_fop.buf, - &local->cont.dir_fop.preparent, - &local->cont.dir_fop.postparent, - local->xdata_rsp); - } + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; + + AFR_STACK_UNWIND (create, main_frame, local->op_ret, local->op_errno, + local->cont.create.fd, local->inode, + &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); return 0; } @@ -297,175 +410,79 @@ afr_create_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - afr_local_t *local = NULL; - uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; - int ret = 0; - int call_count = -1; - int child_index = -1; - - local = frame->local; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (op_ret > -1) { - ret = afr_fd_ctx_set (this, fd); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "could not set ctx on fd=%p", fd); - - local->op_ret = -1; - local->op_errno = -ret; - goto unlock; - } - - ret = fd_ctx_get (fd, this, &ctx); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "could not get fd ctx for fd=%p", fd); - local->op_ret = -1; - local->op_errno = -ret; - goto unlock; - } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - fd_ctx->opened_on[child_index] = AFR_FD_OPENED; - fd_ctx->flags = local->cont.create.flags; - - if (local->success_count == 0) { - if (xdata) - local->xdata_rsp = dict_ref(xdata); - } - } - __dir_entry_fop_common_cbk (frame, child_index, this, - op_ret, op_errno, inode, buf, - preparent, postparent, NULL, NULL); - } - -unlock: - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - afr_dir_fop_done (frame, this); - - return 0; + return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf, + preparent, postparent, NULL, NULL, xdata); } int -afr_create_wind (call_frame_t *frame, xlator_t *this) +afr_create_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; local = frame->local; priv = this->private; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_create_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->create, - &local->loc, - local->cont.create.flags, - local->cont.create.mode, - local->umask, - local->cont.create.fd, - local->xdata_req); - if (!--call_count) - break; - } - } - - return 0; -} - - -int -afr_create_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = NULL; - - local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - + STACK_WIND_COOKIE (frame, afr_create_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->create, + &local->loc, local->cont.create.flags, + local->cont.create.mode, local->umask, + local->cont.create.fd, local->xdata_req); return 0; } int -afr_create (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t flags, mode_t mode, - mode_t umask, fd_t *fd, dict_t *params) +afr_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; call_frame_t *transaction_frame = NULL; int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int op_errno = ENOMEM; priv = this->private; QUORUM_CHECK(create,out); transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; + if (!transaction_frame) goto out; - } - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; loc_copy (&local->loc, loc); - LOCK (&priv->read_child_lock); - { - local->read_child_index = (++priv->read_child_rr) - % (priv->child_count); - } - UNLOCK (&priv->read_child_lock); + local->fd_ctx = afr_fd_ctx_get (fd, this); + if (!local->fd_ctx) + goto out; + + local->inode = inode_ref (loc->inode); + local->parent = inode_ref (loc->parent); local->op = GF_FOP_CREATE; local->cont.create.flags = flags; local->cont.create.mode = mode; local->cont.create.fd = fd_ref (fd); local->umask = umask; - if (params) - local->xdata_req = dict_ref (params); - local->transaction.fop = afr_create_wind; - local->transaction.done = afr_create_done; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); + + if (!local->xdata_req) + goto out; + + local->transaction.wind = afr_create_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_create_unwind; ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, @@ -492,15 +509,13 @@ afr_create (call_frame_t *frame, xlator_t *this, goto out; } - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (create, frame, -1, op_errno, - NULL, NULL, NULL, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL, NULL); return 0; } @@ -516,25 +531,14 @@ afr_mknod_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (mknod, main_frame, - local->op_ret, local->op_errno, - local->cont.dir_fop.inode, - &local->cont.dir_fop.buf, - &local->cont.dir_fop.preparent, - &local->cont.dir_fop.postparent, - NULL); - } + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; + AFR_STACK_UNWIND (mknod, main_frame, local->op_ret, local->op_errno, + local->inode, &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); return 0; } @@ -545,131 +549,72 @@ afr_mknod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *buf, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - int call_count = -1; - int child_index = -1; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - __dir_entry_fop_common_cbk (frame, child_index, this, - op_ret, op_errno, inode, buf, - preparent, postparent, NULL, NULL); - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - afr_dir_fop_done (frame, this); - - return 0; + return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf, + preparent, postparent, NULL, NULL, xdata); } -int32_t -afr_mknod_wind (call_frame_t *frame, xlator_t *this) +int +afr_mknod_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; local = frame->local; priv = this->private; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_mknod_wind_cbk, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->mknod, - &local->loc, local->cont.mknod.mode, - local->cont.mknod.dev, - local->umask, - local->xdata_req); - if (!--call_count) - break; - } - } - - return 0; -} - - -int -afr_mknod_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = NULL; - - local = frame->local; - - local->transaction.unwind (frame, this); - AFR_STACK_DESTROY (frame); - + STACK_WIND_COOKIE (frame, afr_mknod_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->mknod, + &local->loc, local->cont.mknod.mode, + local->cont.mknod.dev, local->umask, + local->xdata_req); return 0; } - int afr_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, - dev_t dev, mode_t umask, dict_t *params) + dev_t dev, mode_t umask, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; call_frame_t *transaction_frame = NULL; int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int op_errno = ENOMEM; priv = this->private; QUORUM_CHECK(mknod,out); transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; + if (!transaction_frame) goto out; - } - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; loc_copy (&local->loc, loc); - - LOCK (&priv->read_child_lock); - { - local->read_child_index = (++priv->read_child_rr) - % (priv->child_count); - } - UNLOCK (&priv->read_child_lock); + local->inode = inode_ref (loc->inode); + local->parent = inode_ref (loc->parent); local->op = GF_FOP_MKNOD; local->cont.mknod.mode = mode; local->cont.mknod.dev = dev; local->umask = umask; - if (params) - local->xdata_req = dict_ref (params); - local->transaction.fop = afr_mknod_wind; - local->transaction.done = afr_mknod_done; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); + + if (!local->xdata_req) + goto out; + + local->transaction.wind = afr_mknod_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_mknod_unwind; ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, @@ -692,19 +637,17 @@ afr_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, int_lock->lockee_count++; ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); if (ret < 0) { - op_errno = -ret; - goto out; + op_errno = -ret; + goto out; } - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (mknod, frame, -1, op_errno, - NULL, NULL, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL); return 0; } @@ -721,25 +664,14 @@ afr_mkdir_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (mkdir, main_frame, - local->op_ret, local->op_errno, - local->cont.dir_fop.inode, - &local->cont.dir_fop.buf, - &local->cont.dir_fop.preparent, - &local->cont.dir_fop.postparent, - NULL); - } + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; + AFR_STACK_UNWIND (mkdir, main_frame, local->op_ret, local->op_errno, + local->inode, &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); return 0; } @@ -750,130 +682,71 @@ afr_mkdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *buf, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - int call_count = -1; - int child_index = -1; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - __dir_entry_fop_common_cbk (frame, child_index, this, - op_ret, op_errno, inode, buf, - preparent, postparent, NULL, NULL); - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - afr_dir_fop_done (frame, this); - - return 0; + return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf, + preparent, postparent, NULL, NULL, xdata); } int -afr_mkdir_wind (call_frame_t *frame, xlator_t *this) +afr_mkdir_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; local = frame->local; priv = this->private; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_mkdir_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->mkdir, - &local->loc, local->cont.mkdir.mode, - local->umask, - local->xdata_req); - if (!--call_count) - break; - } - } - + STACK_WIND_COOKIE (frame, afr_mkdir_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->mkdir, &local->loc, + local->cont.mkdir.mode, local->umask, + local->xdata_req); return 0; } int -afr_mkdir_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = NULL; - - local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - - return 0; -} - -int -afr_mkdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, mode_t umask, dict_t *params) +afr_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; call_frame_t *transaction_frame = NULL; int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int op_errno = ENOMEM; priv = this->private; QUORUM_CHECK(mkdir,out); transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; + if (!transaction_frame) goto out; - } - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; loc_copy (&local->loc, loc); - - LOCK (&priv->read_child_lock); - { - local->read_child_index = (++priv->read_child_rr) - % (priv->child_count); - } - UNLOCK (&priv->read_child_lock); + local->inode = inode_ref (loc->inode); + local->parent = inode_ref (loc->parent); local->cont.mkdir.mode = mode; local->umask = umask; - if (params) - local->xdata_req = dict_ref (params); + + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); + + if (!local->xdata_req) + goto out; local->op = GF_FOP_MKDIR; - local->transaction.fop = afr_mkdir_wind; - local->transaction.done = afr_mkdir_done; + local->transaction.wind = afr_mkdir_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_mkdir_unwind; ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, @@ -896,20 +769,17 @@ afr_mkdir (call_frame_t *frame, xlator_t *this, int_lock->lockee_count++; ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); if (ret < 0) { - op_errno = -ret; - goto out; + op_errno = -ret; + goto out; } - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - - AFR_STACK_UNWIND (mkdir, frame, -1, op_errno, - NULL, NULL, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL); return 0; } @@ -926,25 +796,14 @@ afr_link_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (link, main_frame, - local->op_ret, local->op_errno, - local->cont.dir_fop.inode, - &local->cont.dir_fop.buf, - &local->cont.dir_fop.preparent, - &local->cont.dir_fop.postparent, - NULL); - } + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; + AFR_STACK_UNWIND (link, main_frame, local->op_ret, local->op_errno, + local->inode, &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); return 0; } @@ -955,127 +814,70 @@ afr_link_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *buf, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - int call_count = -1; - int child_index = -1; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - __dir_entry_fop_common_cbk (frame, child_index, this, - op_ret, op_errno, inode, buf, - preparent, postparent, NULL, NULL); - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - afr_dir_fop_done (frame, this); - - return 0; + return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf, + preparent, postparent, NULL, NULL, xdata); } int -afr_link_wind (call_frame_t *frame, xlator_t *this) +afr_link_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; local = frame->local; priv = this->private; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_link_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->link, - &local->loc, - &local->newloc, local->xdata_req); - - if (!--call_count) - break; - } - } - + STACK_WIND_COOKIE (frame, afr_link_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->link, + &local->loc, &local->newloc, local->xdata_req); return 0; } int -afr_link_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - - return 0; -} - - -int -afr_link (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc, dict_t *xdata) +afr_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; call_frame_t *transaction_frame = NULL; int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int op_errno = ENOMEM; priv = this->private; QUORUM_CHECK(link,out); transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; + if (!transaction_frame) goto out; - } - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; loc_copy (&local->loc, oldloc); loc_copy (&local->newloc, newloc); + + local->inode = inode_ref (oldloc->inode); + local->parent = inode_ref (newloc->parent); + if (xdata) - local->xdata_req = dict_ref (xdata); + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - LOCK (&priv->read_child_lock); - { - local->read_child_index = (++priv->read_child_rr) - % (priv->child_count); - } - UNLOCK (&priv->read_child_lock); + if (!local->xdata_req) + goto out; local->op = GF_FOP_LINK; - local->transaction.fop = afr_link_wind; - local->transaction.done = afr_link_done; + + local->transaction.wind = afr_link_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_link_unwind; ret = afr_build_parent_loc (&local->transaction.parent_loc, newloc, @@ -1098,18 +900,17 @@ afr_link (call_frame_t *frame, xlator_t *this, int_lock->lockee_count++; ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); if (ret < 0) { - op_errno = -ret; - goto out; + op_errno = -ret; + goto out; } - ret = 0; + + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (link, frame, -1, op_errno, - NULL, NULL, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (link, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL); return 0; } @@ -1126,25 +927,14 @@ afr_symlink_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (symlink, main_frame, - local->op_ret, local->op_errno, - local->cont.dir_fop.inode, - &local->cont.dir_fop.buf, - &local->cont.dir_fop.preparent, - &local->cont.dir_fop.postparent, - NULL); - } + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; + AFR_STACK_UNWIND (symlink, main_frame, local->op_ret, local->op_errno, + local->inode, &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); return 0; } @@ -1155,132 +945,71 @@ afr_symlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *buf, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - int call_count = -1; - int child_index = -1; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - __dir_entry_fop_common_cbk (frame, child_index, this, - op_ret, op_errno, inode, buf, - preparent, postparent, NULL, NULL); - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - afr_dir_fop_done (frame, this); - - return 0; + return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf, + preparent, postparent, NULL, NULL, xdata); } int -afr_symlink_wind (call_frame_t *frame, xlator_t *this) +afr_symlink_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; local = frame->local; priv = this->private; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_symlink_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->symlink, - local->cont.symlink.linkpath, - &local->loc, - local->umask, - local->xdata_req); - - if (!--call_count) - break; - - } - } - - return 0; -} - - -int -afr_symlink_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - + STACK_WIND_COOKIE (frame, afr_symlink_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->symlink, + local->cont.symlink.linkpath, &local->loc, + local->umask, local->xdata_req); return 0; } int -afr_symlink (call_frame_t *frame, xlator_t *this, - const char *linkpath, loc_t *loc, mode_t umask, dict_t *params) +afr_symlink (call_frame_t *frame, xlator_t *this, const char *linkpath, + loc_t *loc, mode_t umask, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; call_frame_t *transaction_frame = NULL; int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int op_errno = ENOMEM; priv = this->private; QUORUM_CHECK(symlink,out); transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; + if (!transaction_frame) goto out; - } - - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; loc_copy (&local->loc, loc); - - LOCK (&priv->read_child_lock); - { - local->read_child_index = (++priv->read_child_rr) - % (priv->child_count); - } - UNLOCK (&priv->read_child_lock); + local->inode = inode_ref (loc->inode); + local->parent = inode_ref (loc->parent); local->cont.symlink.linkpath = gf_strdup (linkpath); local->umask = umask; - if (params) - local->xdata_req = dict_ref (params); + + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); + + if (!local->xdata_req) + goto out; local->op = GF_FOP_SYMLINK; - local->transaction.fop = afr_symlink_wind; - local->transaction.done = afr_symlink_done; + local->transaction.wind = afr_symlink_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_symlink_unwind; ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, @@ -1303,19 +1032,17 @@ afr_symlink (call_frame_t *frame, xlator_t *this, int_lock->lockee_count++; ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); if (ret < 0) { - op_errno = -ret; - goto out; + op_errno = -ret; + goto out; } - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (symlink, frame, -1, op_errno, - NULL, NULL, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (symlink, frame, -1, op_errno, NULL, NULL, NULL, + NULL, NULL); return 0; } @@ -1331,26 +1058,16 @@ afr_rename_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (rename, main_frame, - local->op_ret, local->op_errno, - &local->cont.dir_fop.buf, - &local->cont.dir_fop.preparent, - &local->cont.dir_fop.postparent, - &local->cont.dir_fop.prenewparent, - &local->cont.dir_fop.postnewparent, - NULL); - } + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; + AFR_STACK_UNWIND (rename, main_frame, local->op_ret, local->op_errno, + &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, + &local->cont.dir_fop.prenewparent, + &local->cont.dir_fop.postnewparent, local->xdata_rsp); return 0; } @@ -1362,131 +1079,72 @@ afr_rename_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *prenewparent, struct iatt *postnewparent, dict_t *xdata) { - afr_local_t * local = NULL; - int call_count = -1; - int child_index = -1; - - local = frame->local; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (afr_fop_failed (op_ret, op_errno) && op_errno != ENOTEMPTY) - afr_transaction_fop_failed (frame, this, child_index); - local->op_errno = op_errno; - local->child_errno[child_index] = op_errno; - - if (op_ret > -1) - __dir_entry_fop_common_cbk (frame, child_index, this, - op_ret, op_errno, NULL, buf, - preoldparent, postoldparent, - prenewparent, postnewparent); - - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - afr_dir_fop_done (frame, this); - - return 0; + return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf, + preoldparent, postoldparent, prenewparent, + postnewparent, xdata); } -int32_t -afr_rename_wind (call_frame_t *frame, xlator_t *this) +int +afr_rename_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; - local = frame->local; - priv = this->private; - - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_rename_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->rename, - &local->loc, - &local->newloc, NULL); - if (!--call_count) - break; - } - } + local = frame->local; + priv = this->private; + STACK_WIND_COOKIE (frame, afr_rename_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->rename, + &local->loc, &local->newloc, local->xdata_req); return 0; } int -afr_rename_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - - return 0; -} - - -int -afr_rename (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc, dict_t *xdata) +afr_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; call_frame_t *transaction_frame = NULL; int ret = -1; - int op_errno = 0; + int op_errno = ENOMEM; int nlockee = 0; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - priv = this->private; QUORUM_CHECK(rename,out); transaction_frame = copy_frame (frame); - if (!transaction_frame) { + if (!transaction_frame) op_errno = ENOMEM; - goto out; - } - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; loc_copy (&local->loc, oldloc); loc_copy (&local->newloc, newloc); - local->read_child_index = afr_inode_get_read_ctx (this, oldloc->inode, NULL); + local->inode = inode_ref (oldloc->inode); + local->parent = inode_ref (oldloc->parent); + local->parent2 = inode_ref (newloc->parent); + + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); + + if (!local->xdata_req) + goto out; local->op = GF_FOP_RENAME; - local->transaction.fop = afr_rename_wind; - local->transaction.done = afr_rename_done; + local->transaction.wind = afr_rename_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_rename_unwind; ret = afr_build_parent_loc (&local->transaction.parent_loc, oldloc, @@ -1536,20 +1194,17 @@ afr_rename (call_frame_t *frame, xlator_t *this, ret = afr_transaction (transaction_frame, this, AFR_ENTRY_RENAME_TRANSACTION); if (ret < 0) { - op_errno = -ret; - goto out; + op_errno = -ret; + goto out; } - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - - AFR_STACK_UNWIND (rename, frame, -1, op_errno, - NULL, NULL, NULL, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL, NULL); return 0; } @@ -1565,23 +1220,13 @@ afr_unlink_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (unlink, main_frame, - local->op_ret, local->op_errno, - &local->cont.dir_fop.preparent, - &local->cont.dir_fop.postparent, - NULL); - } + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; + AFR_STACK_UNWIND (unlink, main_frame, local->op_ret, local->op_errno, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); return 0; } @@ -1591,123 +1236,69 @@ afr_unlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - afr_local_t * local = NULL; - int call_count = -1; - int child_index = (long) cookie; - - local = frame->local; - - LOCK (&frame->lock); - { - if (child_index == local->read_child_index) { - local->read_child_returned = _gf_true; - } - __dir_entry_fop_common_cbk (frame, child_index, this, - op_ret, op_errno, NULL, NULL, - preparent, postparent, NULL, NULL); - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - if (call_count == 0) - afr_dir_fop_done (frame, this); - - return 0; + return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, NULL, + preparent, postparent, NULL, NULL, xdata); } -int32_t -afr_unlink_wind (call_frame_t *frame, xlator_t *this) +int +afr_unlink_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; local = frame->local; priv = this->private; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_unlink_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->unlink, - &local->loc, local->xflag, - local->xdata_req); - - if (!--call_count) - break; - } - } - - return 0; -} - - -int32_t -afr_unlink_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - + STACK_WIND_COOKIE (frame, afr_unlink_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->unlink, + &local->loc, local->xflag, local->xdata_req); return 0; } -int32_t -afr_unlink (call_frame_t *frame, xlator_t *this, - loc_t *loc, int xflag, dict_t *xdata) +int +afr_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, + dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; call_frame_t *transaction_frame = NULL; int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int op_errno = ENOMEM; priv = this->private; QUORUM_CHECK(unlink,out); transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; + if (!transaction_frame) goto out; - } - - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; loc_copy (&local->loc, loc); local->xflag = xflag; + + local->inode = inode_ref (loc->inode); + local->parent = inode_ref (loc->parent); + if (xdata) - local->xdata_req = dict_ref (xdata); + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); + + if (!local->xdata_req) + goto out; local->op = GF_FOP_UNLINK; - local->transaction.fop = afr_unlink_wind; - local->transaction.done = afr_unlink_done; + local->transaction.wind = afr_unlink_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_unlink_unwind; ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, @@ -1730,19 +1321,16 @@ afr_unlink (call_frame_t *frame, xlator_t *this, int_lock->lockee_count++; ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); if (ret < 0) { - op_errno = -ret; - goto out; + op_errno = -ret; + goto out; } - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (unlink, frame, -1, op_errno, - NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL, NULL); return 0; } @@ -1760,23 +1348,13 @@ afr_rmdir_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (rmdir, main_frame, - local->op_ret, local->op_errno, - &local->cont.dir_fop.preparent, - &local->cont.dir_fop.postparent, - NULL); - } + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; + AFR_STACK_UNWIND (rmdir, main_frame, local->op_ret, local->op_errno, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); return 0; } @@ -1786,130 +1364,71 @@ afr_rmdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - afr_local_t * local = NULL; - int call_count = -1; - int child_index = (long) cookie; - int read_child = 0; - - local = frame->local; - - LOCK (&frame->lock); - { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } - if (afr_fop_failed (op_ret, op_errno) && (op_errno != ENOTEMPTY)) - afr_transaction_fop_failed (frame, this, child_index); - local->op_errno = op_errno; - local->child_errno[child_index] = op_errno; - if (op_ret > -1) - __dir_entry_fop_common_cbk (frame, child_index, this, - op_ret, op_errno, NULL, NULL, - preparent, postparent, NULL, - NULL); - - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - if (call_count == 0) - afr_dir_fop_done (frame, this); - - return 0; + return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, NULL, + preparent, postparent, NULL, NULL, xdata); } int -afr_rmdir_wind (call_frame_t *frame, xlator_t *this) +afr_rmdir_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; local = frame->local; priv = this->private; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_rmdir_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->rmdir, - &local->loc, local->cont.rmdir.flags, - NULL); - - if (!--call_count) - break; - } - } - - return 0; -} - - -int -afr_rmdir_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - + STACK_WIND_COOKIE (frame, afr_rmdir_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->rmdir, + &local->loc, local->cont.rmdir.flags, local->xdata_req); return 0; } int -afr_rmdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, int flags, dict_t *xdata) +afr_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; call_frame_t *transaction_frame = NULL; int ret = -1; - int op_errno = 0; + int op_errno = ENOMEM; int nlockee = 0; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - priv = this->private; QUORUM_CHECK(rmdir,out); transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; + if (!transaction_frame) goto out; - } - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; - local->cont.rmdir.flags = flags; loc_copy (&local->loc, loc); + local->inode = inode_ref (loc->inode); + local->parent = inode_ref (loc->parent); + + local->cont.rmdir.flags = flags; + + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); + + if (!local->xdata_req) + goto out; local->op = GF_FOP_RMDIR; - local->transaction.fop = afr_rmdir_wind; - local->transaction.done = afr_rmdir_done; + local->transaction.wind = afr_rmdir_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_rmdir_unwind; ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, @@ -1944,18 +1463,16 @@ afr_rmdir (call_frame_t *frame, xlator_t *this, ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); if (ret < 0) { - op_errno = -ret; - goto out; + op_errno = -ret; + goto out; } - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (rmdir, frame, -1, op_errno, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (rmdir, frame, -1, op_errno, NULL, NULL, NULL); return 0; } diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c index 0cfebcb9d55..01e078c13e6 100644 --- a/xlators/cluster/afr/src/afr-inode-read.c +++ b/xlators/cluster/afr/src/afr-inode-read.c @@ -35,241 +35,153 @@ #include "compat-errno.h" #include "compat.h" -/** - * Common algorithm for inode read calls: - * - * - Try the fop on the first child that is up - * - if we have failed due to ENOTCONN: - * try the next child - * - * Applicable to: access, stat, fstat, readlink, getxattr - */ +#include "afr-transaction.h" + /* {{{ access */ -int32_t -afr_access_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) +int +afr_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; - int unwind = 1; - int32_t *last_index = NULL; - int32_t next_call_child = -1; - int32_t read_child = -1; - int32_t *fresh_children = NULL; - - priv = this->private; - children = priv->children; + afr_local_t *local = NULL; local = frame->local; - read_child = (long) cookie; + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; - if (op_ret == -1) { - last_index = &local->cont.access.last_index; - fresh_children = local->fresh_children; - next_call_child = afr_next_call_child (fresh_children, - local->child_up, - priv->child_count, - last_index, read_child); - if (next_call_child < 0) - goto out; + afr_read_txn_continue (frame, this, (long) cookie); + return 0; + } - unwind = 0; - - STACK_WIND_COOKIE (frame, afr_access_cbk, - (void *) (long) read_child, - children[next_call_child], - children[next_call_child]->fops->access, - &local->loc, local->cont.access.mask, - NULL); - } - -out: - if (unwind) { - AFR_STACK_UNWIND (access, frame, op_ret, op_errno, xdata); - } + AFR_STACK_UNWIND (access, frame, op_ret, op_errno, xdata); return 0; } -int32_t -afr_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask, - dict_t *xdata) +int +afr_access_wind (call_frame_t *frame, xlator_t *this, int subvol) { - afr_private_t *priv = NULL; - xlator_t **children = NULL; - int call_child = 0; - afr_local_t *local = NULL; - int32_t op_errno = 0; - int32_t read_child = -1; - int ret = -1; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); - - children = priv->children; - - AFR_SBRAIN_CHECK_LOC (loc, out); - - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; - - local->fresh_children = afr_children_create (priv->child_count); - if (!local->fresh_children) { - op_errno = ENOMEM; - goto out; - } + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + if (subvol == -1) { + AFR_STACK_UNWIND (access, frame, local->op_ret, + local->op_errno, 0); + return 0; + } + + STACK_WIND_COOKIE (frame, afr_access_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->access, + &local->loc, local->cont.access.mask, + local->xdata_req); + return 0; +} +int +afr_access (call_frame_t *frame, xlator_t *this, loc_t *loc, + int mask, dict_t *xdata) +{ + afr_local_t *local = NULL; + int op_errno = 0; - read_child = afr_inode_get_read_ctx (this, loc->inode, - local->fresh_children); - ret = afr_get_call_child (this, local->child_up, read_child, - local->fresh_children, - &call_child, - &local->cont.access.last_index); - if (ret < 0) { - op_errno = -ret; - goto out; - } + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - loc_copy (&local->loc, loc); - local->cont.access.mask = mask; + local->op = GF_FOP_ACCESS; + loc_copy (&local->loc, loc); + local->cont.access.mask = mask; + if (xdata) + local->xdata_req = dict_ref (xdata); - STACK_WIND_COOKIE (frame, afr_access_cbk, - (void *) (long) call_child, - children[call_child], - children[call_child]->fops->access, - loc, mask, xdata); + afr_read_txn (frame, this, loc->inode, afr_access_wind, + AFR_METADATA_TRANSACTION); - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (access, frame, -1, op_errno, NULL); + AFR_STACK_UNWIND (access, frame, -1, op_errno, NULL); + return 0; } - /* }}} */ /* {{{ stat */ -int32_t +int afr_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *buf, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; - int unwind = 1; - int32_t *last_index = NULL; - int32_t next_call_child = -1; - int32_t read_child = -1; - int32_t *fresh_children = NULL; - - priv = this->private; - children = priv->children; - - read_child = (long) cookie; + afr_local_t *local = NULL; local = frame->local; - if (op_ret == -1) { - last_index = &local->cont.stat.last_index; - fresh_children = local->fresh_children; - next_call_child = afr_next_call_child (fresh_children, - local->child_up, - priv->child_count, - last_index, read_child); - if (next_call_child < 0) - goto out; + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; - unwind = 0; + afr_read_txn_continue (frame, this, (long) cookie); + return 0; + } - STACK_WIND_COOKIE (frame, afr_stat_cbk, - (void *) (long) read_child, - children[next_call_child], - children[next_call_child]->fops->stat, - &local->loc, NULL); - } - -out: - if (unwind) { - AFR_STACK_UNWIND (stat, frame, op_ret, op_errno, buf, xdata); - } + AFR_STACK_UNWIND (stat, frame, op_ret, op_errno, buf, xdata); return 0; } -int32_t -afr_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +int +afr_stat_wind (call_frame_t *frame, xlator_t *this, int subvol) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - xlator_t **children = NULL; - int call_child = 0; - int32_t op_errno = 0; - int32_t read_child = -1; - int ret = -1; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); - - children = priv->children; - - AFR_SBRAIN_CHECK_LOC (loc, out); - - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + if (subvol == -1) { + AFR_STACK_UNWIND (stat, frame, local->op_ret, local->op_errno, + 0, 0); + return 0; + } + + STACK_WIND_COOKIE (frame, afr_stat_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->stat, + &local->loc, local->xdata_req); + return 0; +} - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; +int +afr_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + afr_local_t *local = NULL; + int op_errno = 0; - local->fresh_children = afr_children_create (priv->child_count); - if (!local->fresh_children) { - op_errno = ENOMEM; - goto out; - } + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - read_child = afr_inode_get_read_ctx (this, loc->inode, - local->fresh_children); - ret = afr_get_call_child (this, local->child_up, read_child, - local->fresh_children, - &call_child, - &local->cont.stat.last_index); - if (ret < 0) { - op_errno = -ret; - goto out; - } - loc_copy (&local->loc, loc); + local->op = GF_FOP_STAT; + loc_copy (&local->loc, loc); + if (xdata) + local->xdata_req = dict_ref (xdata); - STACK_WIND_COOKIE (frame, afr_stat_cbk, (void *) (long) call_child, - children[call_child], - children[call_child]->fops->stat, - loc, xdata); + afr_read_txn (frame, this, loc->inode, afr_stat_wind, + AFR_DATA_TRANSACTION); - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (stat, frame, -1, op_errno, NULL, NULL); + AFR_STACK_UNWIND (stat, frame, -1, op_errno, NULL, NULL); return 0; } @@ -279,52 +191,49 @@ out: /* {{{ fstat */ -int32_t +int afr_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *buf, dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - xlator_t **children = NULL; - int unwind = 1; - int32_t *last_index = NULL; - int32_t next_call_child = -1; - int32_t read_child = -1; - int32_t *fresh_children = NULL; - - priv = this->private; - children = priv->children; + afr_local_t *local = NULL; local = frame->local; - read_child = (long) cookie; + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; - if (op_ret == -1) { - last_index = &local->cont.fstat.last_index; - fresh_children = local->fresh_children; - next_call_child = afr_next_call_child (fresh_children, - local->child_up, - priv->child_count, - last_index, read_child); - if (next_call_child < 0) - goto out; + afr_read_txn_continue (frame, this, (long) cookie); + return 0; + } - unwind = 0; + AFR_STACK_UNWIND (fstat, frame, op_ret, op_errno, buf, xdata); - STACK_WIND_COOKIE (frame, afr_fstat_cbk, - (void *) (long) read_child, - children[next_call_child], - children[next_call_child]->fops->fstat, - local->fd, NULL); - } + return 0; +} -out: - if (unwind) { - AFR_STACK_UNWIND (fstat, frame, op_ret, op_errno, buf, xdata); - } - return 0; +int +afr_fstat_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + if (subvol == -1) { + AFR_STACK_UNWIND (fstat, frame, local->op_ret, local->op_errno, + 0, 0); + return 0; + } + + STACK_WIND_COOKIE (frame, afr_fstat_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->fstat, + local->fd, local->xdata_req); + return 0; } @@ -332,68 +241,26 @@ int32_t afr_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - xlator_t **children = NULL; - int call_child = 0; - int32_t op_errno = 0; - int32_t read_child = 0; - int ret = -1; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (fd, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); - - children = priv->children; - - VALIDATE_OR_GOTO (fd->inode, out); - - AFR_SBRAIN_CHECK_FD (fd, out); - - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; - - local->fresh_children = afr_children_create (priv->child_count); - if (!local->fresh_children) { - op_errno = ENOMEM; - goto out; - } + afr_local_t *local = NULL; + int op_errno = 0; - read_child = afr_inode_get_read_ctx (this, fd->inode, - local->fresh_children); + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; + local->op = GF_FOP_FSTAT; + local->fd = fd_ref (fd); + if (xdata) + local->xdata_req = dict_ref (xdata); + afr_fix_open (fd, this); - ret = afr_get_call_child (this, local->child_up, read_child, - local->fresh_children, - &call_child, - &local->cont.fstat.last_index); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - local->fd = fd_ref (fd); - - afr_open_fd_fix (fd, this); + afr_read_txn (frame, this, fd->inode, afr_fstat_wind, + AFR_DATA_TRANSACTION); - STACK_WIND_COOKIE (frame, afr_fstat_cbk, (void *) (long) call_child, - children[call_child], - children[call_child]->fops->fstat, - fd, xdata); - - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (fstat, frame, -1, op_errno, NULL, NULL); + AFR_STACK_UNWIND (fstat, frame, -1, op_errno, NULL, NULL); return 0; } @@ -402,117 +269,77 @@ out: /* {{{ readlink */ -int32_t +int afr_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, const char *buf, struct iatt *sbuf, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; - int unwind = 1; - int32_t *last_index = NULL; - int32_t next_call_child = -1; - int32_t read_child = -1; - int32_t *fresh_children = NULL; + afr_local_t *local = NULL; - priv = this->private; - children = priv->children; + local = frame->local; - local = frame->local; + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = op_errno; - read_child = (long) cookie; + afr_read_txn_continue (frame, this, (long) cookie); + return 0; + } - if (op_ret == -1) { - last_index = &local->cont.readlink.last_index; - fresh_children = local->fresh_children; - next_call_child = afr_next_call_child (fresh_children, - local->child_up, - priv->child_count, - last_index, read_child); - if (next_call_child < 0) - goto out; - - unwind = 0; - STACK_WIND_COOKIE (frame, afr_readlink_cbk, - (void *) (long) read_child, - children[next_call_child], - children[next_call_child]->fops->readlink, - &local->loc, - local->cont.readlink.size, NULL); - } - -out: - if (unwind) { - AFR_STACK_UNWIND (readlink, frame, op_ret, op_errno, buf, sbuf, - xdata); - } + AFR_STACK_UNWIND (readlink, frame, op_ret, op_errno, + buf, sbuf, xdata); + return 0; +} - return 0; +int +afr_readlink_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + if (subvol == -1) { + AFR_STACK_UNWIND (readlink, frame, local->op_ret, + local->op_errno, 0, 0, 0); + return 0; + } + + STACK_WIND_COOKIE (frame, afr_readlink_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->readlink, + &local->loc, local->cont.readlink.size, + local->xdata_req); + return 0; } -int32_t +int afr_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size, dict_t *xdata) { - afr_private_t *priv = NULL; - xlator_t **children = NULL; - int call_child = 0; - afr_local_t *local = NULL; + afr_local_t * local = NULL; int32_t op_errno = 0; - int32_t read_child = -1; - int ret = -1; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); - children = priv->children; - - AFR_SBRAIN_CHECK_LOC (loc, out); - - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; - - local->fresh_children = afr_children_create (priv->child_count); - if (!local->fresh_children) { - op_errno = ENOMEM; - goto out; - } - read_child = afr_inode_get_read_ctx (this, loc->inode, - local->fresh_children); - ret = afr_get_call_child (this, local->child_up, read_child, - local->fresh_children, - &call_child, - &local->cont.readlink.last_index); - if (ret < 0) { - op_errno = -ret; - goto out; - } + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; + local->op = GF_FOP_READLINK; loc_copy (&local->loc, loc); + local->cont.readlink.size = size; + if (xdata) + local->xdata_req = dict_ref (xdata); - local->cont.readlink.size = size; - - STACK_WIND_COOKIE (frame, afr_readlink_cbk, - (void *) (long) call_child, - children[call_child], - children[call_child]->fops->readlink, - loc, size, xdata); + afr_read_txn (frame, this, loc->inode, afr_readlink_wind, + AFR_DATA_TRANSACTION); - ret = 0; -out: - if (ret < 0) - AFR_STACK_UNWIND (readlink, frame, -1, op_errno, NULL, NULL, NULL); return 0; +out: + AFR_STACK_UNWIND(readlink, frame, -1, op_errno, 0, 0, 0); + + return 0; } @@ -550,7 +377,7 @@ __gather_xattr_keys (dict_t *dict, char *key, data_t *value, void -__filter_xattrs (dict_t *dict) +afr_filter_xattrs (dict_t *dict) { struct list_head keys = {0,}; struct _xattr_key *key = NULL; @@ -571,59 +398,56 @@ __filter_xattrs (dict_t *dict) } - -int32_t +int afr_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; - int unwind = 1; - int32_t *last_index = NULL; - int32_t next_call_child = -1; - int32_t read_child = -1; - int32_t *fresh_children = NULL; - - priv = this->private; - children = priv->children; + afr_local_t *local = NULL; local = frame->local; - read_child = (long) cookie; - - if (op_ret == -1) { - last_index = &local->cont.getxattr.last_index; - fresh_children = local->fresh_children; - next_call_child = afr_next_call_child (fresh_children, - local->child_up, - priv->child_count, - last_index, read_child); - if (next_call_child < 0) - goto out; + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; - unwind = 0; - STACK_WIND_COOKIE (frame, afr_getxattr_cbk, - (void *) (long) read_child, - children[next_call_child], - children[next_call_child]->fops->getxattr, - &local->loc, - local->cont.getxattr.name, - NULL); - } + afr_read_txn_continue (frame, this, (long) cookie); + return 0; + } -out: - if (unwind) { - if (op_ret >= 0 && dict) - __filter_xattrs (dict); + if (dict) + afr_filter_xattrs (dict); - AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, xdata); - } + AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, xdata); return 0; } + +int +afr_getxattr_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + if (subvol == -1) { + AFR_STACK_UNWIND (getxattr, frame, local->op_ret, + local->op_errno, NULL, NULL); + return 0; + } + + STACK_WIND_COOKIE (frame, afr_getxattr_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->getxattr, + &local->loc, local->cont.getxattr.name, + local->xdata_req); + return 0; +} + + int32_t afr_getxattr_unwind (call_frame_t *frame, int op_ret, int op_errno, dict_t *dict, dict_t *xdata) @@ -659,7 +483,7 @@ afr_fgetxattr_clrlk_cbk (call_frame_t *frame, void *cookie, { callcnt = --local->call_count; if (op_ret == -1) - local->child_errno[cky] = op_errno; + local->replies[cky].op_errno = op_errno; if (!local->dict) local->dict = dict_new (); @@ -710,12 +534,10 @@ unlock: unwind: // Updating child_errno with more recent 'events' - local->child_errno[cky] = op_errno; - op_errno = afr_resultant_errno_get (NULL, local->child_errno, - priv->child_count); + op_errno = afr_final_errno (local, priv); + AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, xattr, xdata); - if (xattr) dict_unref (xattr); } @@ -749,7 +571,7 @@ afr_getxattr_clrlk_cbk (call_frame_t *frame, void *cookie, { callcnt = --local->call_count; if (op_ret == -1) - local->child_errno[cky] = op_errno; + local->replies[cky].op_errno = op_errno; if (!local->dict) local->dict = dict_new (); @@ -800,9 +622,8 @@ unlock: unwind: // Updating child_errno with more recent 'events' - local->child_errno[cky] = op_errno; - op_errno = afr_resultant_errno_get (NULL, local->child_errno, - priv->child_count); + op_errno = afr_final_errno (local, priv); + AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, xattr, xdata); if (xattr) @@ -1411,7 +1232,7 @@ afr_is_special_xattr (const char *name, fop_getxattr_cbk_t *cbk, } if (!strcmp (name, GF_XATTR_PATHINFO_KEY) || - !strcmp (name, GF_XATTR_USER_PATHINFO_KEY)) { + !strcmp (name, GF_XATTR_USER_PATHINFO_KEY)) { if (is_fgetxattr) { *cbk = afr_fgetxattr_pathinfo_cbk; } else { @@ -1442,18 +1263,16 @@ out: } static void -afr_getxattr_frm_all_children (xlator_t *this, call_frame_t *frame, - const char *name, loc_t *loc, - fop_getxattr_cbk_t cbk) +afr_getxattr_all_subvols (xlator_t *this, call_frame_t *frame, + const char *name, loc_t *loc, + fop_getxattr_cbk_t cbk) { afr_private_t *priv = NULL; afr_local_t *local = NULL; - xlator_t **children = NULL; int i = 0; int call_count = 0; priv = this->private; - children = priv->children; local = frame->local; //local->call_count set in afr_local_init @@ -1465,8 +1284,8 @@ afr_getxattr_frm_all_children (xlator_t *this, call_frame_t *frame, for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND_COOKIE (frame, cbk, - (void *) (long) i, children[i], - children[i]->fops->getxattr, + (void *) (long) i, priv->children[i], + priv->children[i]->fops->getxattr, loc, name, NULL); if (!--call_count) break; @@ -1481,41 +1300,41 @@ afr_getxattr (call_frame_t *frame, xlator_t *this, { afr_private_t *priv = NULL; xlator_t **children = NULL; - int call_child = 0; afr_local_t *local = NULL; xlator_list_t *trav = NULL; xlator_t **sub_volumes = NULL; int i = 0; int32_t op_errno = 0; - int32_t read_child = -1; int ret = -1; fop_getxattr_cbk_t cbk = NULL; int afr_xtime_gauge[MCNT_MAX] = {0,}; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); children = priv->children; - AFR_SBRAIN_CHECK_LOC (loc, out); + loc_copy (&local->loc, loc); - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; + local->op = GF_FOP_GETXATTR; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + if (xdata) + local->xdata_req = dict_ref (xdata); - loc_copy (&local->loc, loc); if (!name) goto no_name; local->cont.getxattr.name = gf_strdup (name); + if (!local->cont.getxattr.name) { + op_errno = ENOMEM; + goto out; + } + if (!strncmp (name, AFR_XATTR_PREFIX, strlen (AFR_XATTR_PREFIX))) { gf_log (this->name, GF_LOG_INFO, @@ -1559,8 +1378,7 @@ afr_getxattr (call_frame_t *frame, xlator_t *this, * collect information from all childs */ if (afr_is_special_xattr (name, &cbk, 0)) { - afr_getxattr_frm_all_children (this, frame, name, - loc, cbk); + afr_getxattr_all_subvols (this, frame, name, loc, cbk); return 0; } @@ -1615,28 +1433,9 @@ afr_getxattr (call_frame_t *frame, xlator_t *this, } no_name: - local->fresh_children = afr_children_create (priv->child_count); - if (!local->fresh_children) { - op_errno = ENOMEM; - goto out; - } - read_child = afr_inode_get_read_ctx (this, loc->inode, - local->fresh_children); - ret = afr_get_call_child (this, local->child_up, read_child, - local->fresh_children, - &call_child, - &local->cont.getxattr.last_index); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - STACK_WIND_COOKIE (frame, afr_getxattr_cbk, - (void *) (long) call_child, - children[call_child], - children[call_child]->fops->getxattr, - loc, name, xdata); + afr_read_txn (frame, this, local->loc.inode, afr_getxattr_wind, + AFR_METADATA_TRANSACTION); ret = 0; out: @@ -1653,76 +1452,60 @@ afr_fgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; - int unwind = 1; - int32_t *last_index = NULL; - int32_t next_call_child = -1; - int32_t read_child = -1; - int32_t *fresh_children = NULL; - - priv = this->private; - children = priv->children; - - local = frame->local; + afr_local_t *local = NULL; - read_child = (long) cookie; + local = frame->local; - if (op_ret == -1) { - last_index = &local->cont.getxattr.last_index; - fresh_children = local->fresh_children; - next_call_child = afr_next_call_child (fresh_children, - local->child_up, - priv->child_count, - last_index, read_child); - if (next_call_child < 0) - goto out; + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = op_errno; - unwind = 0; - STACK_WIND_COOKIE (frame, afr_fgetxattr_cbk, - (void *) (long) read_child, - children[next_call_child], - children[next_call_child]->fops->fgetxattr, - local->fd, - local->cont.getxattr.name, - NULL); - } + afr_read_txn_continue (frame, this, (long) cookie); + return 0; + } -out: - if (unwind) { - if (op_ret >= 0 && dict) - __filter_xattrs (dict); + if (dict) + afr_filter_xattrs (dict); - AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, dict, - xdata); - } + AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, dict, xdata); - return 0; + return 0; } -int32_t -afr_fgetxattr_unwind (call_frame_t *frame, - int op_ret, int op_errno, dict_t *dict, dict_t *xdata) - +int +afr_fgetxattr_wind (call_frame_t *frame, xlator_t *this, int subvol) { - AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, dict, xdata); - return 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + if (subvol == -1) { + AFR_STACK_UNWIND (fgetxattr, frame, local->op_ret, + local->op_errno, NULL, NULL); + return 0; + } + + STACK_WIND_COOKIE (frame, (void *) (long) subvol, afr_fgetxattr_cbk, + priv->children[subvol], + priv->children[subvol]->fops->fgetxattr, + local->fd, local->cont.getxattr.name, + local->xdata_req); + return 0; } + static void -afr_fgetxattr_frm_all_children (xlator_t *this, call_frame_t *frame, - const char *name, fd_t *fd, - fop_fgetxattr_cbk_t cbk) +afr_fgetxattr_all_subvols (xlator_t *this, call_frame_t *frame, + fop_fgetxattr_cbk_t cbk) { afr_private_t *priv = NULL; afr_local_t *local = NULL; - xlator_t **children = NULL; int i = 0; int call_count = 0; priv = this->private; - children = priv->children; local = frame->local; //local->call_count set in afr_local_init @@ -1735,9 +1518,10 @@ afr_fgetxattr_frm_all_children (xlator_t *this, call_frame_t *frame, if (local->child_up[i]) { STACK_WIND_COOKIE (frame, cbk, (void *) (long) i, - children[i], - children[i]->fops->fgetxattr, - fd, name, NULL); + priv->children[i], + priv->children[i]->fops->fgetxattr, + local->fd, local->cont.getxattr.name, + NULL); if (!--call_count) break; } @@ -1746,42 +1530,30 @@ afr_fgetxattr_frm_all_children (xlator_t *this, call_frame_t *frame, return; } -int32_t + +int afr_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, dict_t *xdata) { - afr_private_t *priv = NULL; - xlator_t **children = NULL; - int call_child = 0; afr_local_t *local = NULL; - int32_t op_ret = -1; int32_t op_errno = 0; - int32_t read_child = -1; fop_fgetxattr_cbk_t cbk = NULL; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); - - children = priv->children; - - AFR_SBRAIN_CHECK_FD (fd, out); - - AFR_LOCAL_ALLOC_OR_GOTO (local, out); - frame->local = local; - - op_ret = afr_local_init (local, priv, &op_errno); - if (op_ret < 0) { - op_errno = -op_ret; - goto out; - } + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; + local->op = GF_FOP_FGETXATTR; local->fd = fd_ref (fd); - if (name) + if (name) { local->cont.getxattr.name = gf_strdup (name); + if (!local->cont.getxattr.name) { + op_errno = ENOMEM; + goto out; + } + } + if (xdata) + local->xdata_req = dict_ref (xdata); /* pathinfo gets handled only in getxattr(), but we need to handle * lockinfo. @@ -1789,42 +1561,19 @@ afr_fgetxattr (call_frame_t *frame, xlator_t *this, * collect information from all children. */ if (afr_is_special_xattr (name, &cbk, 1)) { - afr_fgetxattr_frm_all_children (this, frame, name, - fd, cbk); + afr_fgetxattr_all_subvols (this, frame, cbk); return 0; } + afr_fix_open (fd, this); - local->fresh_children = afr_children_create (priv->child_count); - if (!local->fresh_children) { - op_errno = ENOMEM; - goto out; - } - - read_child = afr_inode_get_read_ctx (this, fd->inode, - local->fresh_children); - op_ret = afr_get_call_child (this, local->child_up, read_child, - local->fresh_children, - &call_child, - &local->cont.getxattr.last_index); - if (op_ret < 0) { - op_errno = -op_ret; - op_ret = -1; - goto out; - } - - STACK_WIND_COOKIE (frame, afr_fgetxattr_cbk, - (void *) (long) call_child, - children[call_child], - children[call_child]->fops->fgetxattr, - fd, name, xdata); + afr_read_txn (frame, this, fd->inode, afr_fgetxattr_wind, + AFR_METADATA_TRANSACTION); - op_ret = 0; + return 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, NULL, - NULL); - } + AFR_STACK_UNWIND (fgetxattr, frame, -1, op_errno, NULL, NULL); + return 0; } @@ -1833,144 +1582,84 @@ out: /* {{{ readv */ -/** - * read algorithm: - * - * if the user has specified a read subvolume, use it - * otherwise - - * use the inode number to hash it to one of the subvolumes, and - * read from there (to balance read load) - * - * if any of the above read's fail, try the children in sequence - * beginning at the beginning - */ - -int32_t +int afr_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iovec *vector, int32_t count, struct iatt *buf, struct iobref *iobref, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; - int unwind = 1; - int32_t *last_index = NULL; - int32_t next_call_child = -1; - int32_t *fresh_children = NULL; - int32_t read_child = -1; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); - - children = priv->children; + afr_local_t *local = NULL; - local = frame->local; + local = frame->local; - read_child = (long) cookie; + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = op_errno; - if (op_ret == -1) { - last_index = &local->cont.readv.last_index; - fresh_children = local->fresh_children; - next_call_child = afr_next_call_child (fresh_children, - local->child_up, - priv->child_count, - last_index, read_child); - if (next_call_child < 0) - goto out; - - unwind = 0; + afr_read_txn_continue (frame, this, (long) cookie); + return 0; + } - STACK_WIND_COOKIE (frame, afr_readv_cbk, - (void *) (long) read_child, - children[next_call_child], - children[next_call_child]->fops->readv, - local->fd, local->cont.readv.size, - local->cont.readv.offset, - local->cont.readv.flags, - NULL); - } + AFR_STACK_UNWIND (readv, frame, op_ret, op_errno, + vector, count, buf, iobref, xdata); + return 0; +} -out: - if (unwind) { - AFR_STACK_UNWIND (readv, frame, op_ret, op_errno, - vector, count, buf, iobref, xdata); - } - return 0; +int +afr_readv_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + if (subvol == -1) { + AFR_STACK_UNWIND (readv, frame, local->op_ret, local->op_errno, + 0, 0, 0, 0, 0); + return 0; + } + + STACK_WIND_COOKIE (frame, afr_readv_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->readv, + local->fd, local->cont.readv.size, + local->cont.readv.offset, local->cont.readv.flags, + local->xdata_req); + return 0; } -int32_t -afr_readv (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset, uint32_t flags, dict_t *xdata) +int +afr_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) { - afr_private_t * priv = NULL; afr_local_t * local = NULL; - xlator_t ** children = NULL; - int call_child = 0; int32_t op_errno = 0; - int32_t read_child = -1; - int ret = -1; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (fd, out); - - priv = this->private; - children = priv->children; - AFR_SBRAIN_CHECK_FD (fd, out); + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; - - local->fresh_children = afr_children_create (priv->child_count); - if (!local->fresh_children) { - op_errno = ENOMEM; - goto out; - } - - read_child = afr_inode_get_read_ctx (this, fd->inode, local->fresh_children); - ret = afr_get_call_child (this, local->child_up, read_child, - local->fresh_children, - &call_child, - &local->cont.readv.last_index); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - local->fd = fd_ref (fd); - - local->cont.readv.size = size; - local->cont.readv.offset = offset; - local->cont.readv.flags = flags; + local->op = GF_FOP_READ; + local->fd = fd_ref (fd); + local->cont.readv.size = size; + local->cont.readv.offset = offset; + local->cont.readv.flags = flags; + if (xdata) + local->xdata_req = dict_ref (xdata); - afr_open_fd_fix (fd, this); + afr_fix_open (fd, this); - STACK_WIND_COOKIE (frame, afr_readv_cbk, - (void *) (long) call_child, - children[call_child], - children[call_child]->fops->readv, - fd, size, offset, flags, xdata); + afr_read_txn (frame, this, fd->inode, afr_readv_wind, + AFR_DATA_TRANSACTION); - ret = 0; -out: - if (ret < 0) { - AFR_STACK_UNWIND (readv, frame, -1, op_errno, NULL, 0, NULL, - NULL, NULL); - } return 0; +out: + AFR_STACK_UNWIND(readv, frame, -1, op_errno, 0, 0, 0, 0, 0); + + return 0; } /* }}} */ diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index d62847defa3..3dacfc8dd5d 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -37,46 +37,128 @@ #include "afr.h" #include "afr-transaction.h" -#include "afr-self-heal-common.h" +//#include "afr-self-heal-common.h" -void -__inode_write_fop_cbk (call_frame_t *frame, int child_index, int read_child, - xlator_t *this, int32_t *op_ret, int32_t *op_errno, - struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) + +static void +__afr_inode_write_finalize (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int read_subvol = 0; + int i = 0; + + local = frame->local; + priv = this->private; + + if (local->inode) { + if (local->transaction.type == AFR_METADATA_TRANSACTION) + read_subvol = afr_metadata_subvol_get (local->inode, this, + NULL, NULL); + else + read_subvol = afr_data_subvol_get (local->inode, this, + NULL, NULL); + } + + local->op_ret = -1; + local->op_errno = afr_final_errno (local, priv); + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret < 0) { + afr_inode_read_subvol_reset (local->inode, this); + continue; + } + + /* Order of checks in the compound conditional + below is important. + + - Highest precedence: largest op_ret + - Next precendence: if all op_rets are equal, read subvol + - Least precedence: any succeeded subvol + */ + if ((local->op_ret < local->replies[i].op_ret) || + ((local->op_ret == local->replies[i].op_ret) && + (i == read_subvol))) { + + local->op_ret = local->replies[i].op_ret; + local->op_errno = local->replies[i].op_errno; + + local->cont.inode_wfop.prebuf = + local->replies[i].prestat; + local->cont.inode_wfop.postbuf = + local->replies[i].poststat; + + if (local->replies[i].xdata) { + if (local->xdata_rsp) + dict_unref (local->xdata_rsp); + local->xdata_rsp = + dict_ref (local->replies[i].xdata); + } + } + } +} + + +static void +__afr_inode_write_fill (call_frame_t *frame, xlator_t *this, int child_index, + int op_ret, int op_errno, + struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) { - afr_local_t *local = NULL; + afr_local_t *local = NULL; local = frame->local; - if (afr_fop_failed (*op_ret, *op_errno)) { - local->child_errno[child_index] = *op_errno; - - switch (local->op) { - case GF_FOP_TRUNCATE: - case GF_FOP_FTRUNCATE: - if (*op_errno != EFBIG) - afr_transaction_fop_failed (frame, this, - child_index); - break; - default: - afr_transaction_fop_failed (frame, this, child_index); - break; - } - local->op_errno = *op_errno; - goto out; + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + + if (op_ret >= 0) { + if (prebuf) + local->replies[child_index].prestat = *prebuf; + if (postbuf) + local->replies[child_index].poststat = *postbuf; + if (xdata) + local->replies[child_index].xdata = dict_ref (xdata); + } else { + afr_transaction_fop_failed (frame, this, child_index); + } + + return; +} + + +static int +__afr_inode_write_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + afr_local_t *local = NULL; + int child_index = (long) cookie; + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + __afr_inode_write_fill (frame, this, child_index, op_ret, + op_errno, prebuf, postbuf, xdata); } + UNLOCK (&frame->lock); - if ((local->success_count == 0) || (read_child == child_index)) { - local->op_ret = *op_ret; - if (prebuf) - local->cont.inode_wfop.prebuf = *prebuf; - if (postbuf) - local->cont.inode_wfop.postbuf = *postbuf; + call_count = afr_frame_return (frame); + + if (call_count == 0) { + __afr_inode_write_finalize (frame, this); + + if (afr_txn_nothing_failed (frame, this)) + local->transaction.unwind (frame, this); + + local->transaction.resume (frame, this); } - local->success_count++; -out: - return; + return 0; } /* {{{ writev */ @@ -94,6 +176,8 @@ afr_writev_copy_outvars (call_frame_t *src_frame, call_frame_t *dst_frame) dst_local->op_errno = src_local->op_errno; dst_local->cont.inode_wfop.prebuf = src_local->cont.inode_wfop.prebuf; dst_local->cont.inode_wfop.postbuf = src_local->cont.inode_wfop.postbuf; + if (src_local->xdata_rsp) + dst_local->xdata_rsp = dict_ref (src_local->xdata_rsp); } void @@ -106,26 +190,9 @@ afr_writev_unwind (call_frame_t *frame, xlator_t *this) local->op_ret, local->op_errno, &local->cont.inode_wfop.prebuf, &local->cont.inode_wfop.postbuf, - NULL); + local->xdata_rsp); } -call_frame_t* -afr_transaction_detach_fop_frame (call_frame_t *frame) -{ - afr_local_t * local = NULL; - call_frame_t *fop_frame = NULL; - - local = frame->local; - - LOCK (&frame->lock); - { - fop_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - return fop_frame; -} int afr_transaction_writev_unwind (call_frame_t *frame, xlator_t *this) @@ -173,82 +240,60 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *postbuf, dict_t *xdata) { afr_local_t * local = NULL; - afr_private_t *priv = NULL; call_frame_t *fop_frame = NULL; int child_index = (long) cookie; int call_count = -1; - int read_child = 0; - int ret = 0; + int ret = 0; uint32_t open_fd_count = 0; uint32_t write_is_append = 0; local = frame->local; - priv = this->private; - - read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); LOCK (&frame->lock); { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } - - __inode_write_fop_cbk (frame, child_index, read_child, this, - &op_ret, &op_errno, prebuf, postbuf, - xdata); - - local->replies[child_index].valid = 1; - local->replies[child_index].op_ret = op_ret; - local->replies[child_index].op_errno = op_errno; - - - /* stage the best case return value for unwind */ - if ((local->success_count == 0) || (op_ret > local->op_ret)) { - local->op_ret = op_ret; - local->op_errno = op_errno; - } - - if (op_ret != -1) { - if (xdata) { - ret = dict_get_uint32 (xdata, - GLUSTERFS_OPEN_FD_COUNT, - &open_fd_count); - if ((ret == 0) && - (open_fd_count > local->open_fd_count)) { - local->open_fd_count = open_fd_count; - local->update_open_fd_count = _gf_true; - } - - write_is_append = 0; - ret = dict_get_uint32 (xdata, - GLUSTERFS_WRITE_IS_APPEND, - &write_is_append); - if (ret || !write_is_append) - local->append_write = _gf_false; - } - + __afr_inode_write_fill (frame, this, child_index, op_ret, + op_errno, prebuf, postbuf, xdata); + if (op_ret == -1 || !xdata) + goto unlock; + + write_is_append = 0; + ret = dict_get_uint32 (xdata, GLUSTERFS_WRITE_IS_APPEND, + &write_is_append); + if (ret || !write_is_append) + local->append_write = _gf_false; + + ret = dict_get_uint32 (xdata, GLUSTERFS_OPEN_FD_COUNT, + &open_fd_count); + if (ret == -1) + goto unlock; + if ((open_fd_count > local->open_fd_count)) { + local->open_fd_count = open_fd_count; + local->update_open_fd_count = _gf_true; } } +unlock: UNLOCK (&frame->lock); call_count = afr_frame_return (frame); if (call_count == 0) { - - if (local->update_open_fd_count) - afr_handle_open_fd_count (frame, this); - - if (!local->stable_write && !local->append_write) + if (!local->stable_write && !local->append_write) /* An appended write removes the necessity to fsync() the file. This is because self-heal has the logic to check for larger file when the xattrs are not reliably pointing at a stale file. */ - afr_fd_report_unstable_write (this, local->fd); + afr_fd_report_unstable_write (this, local->fd); + + __afr_inode_write_finalize (frame, this); afr_writev_handle_short_writes (frame, this); - if (afr_any_fops_failed (local, priv)) { + + if (local->update_open_fd_count) + afr_handle_open_fd_count (frame, this); + + if (!afr_txn_nothing_failed (frame, this)) { //Don't unwind until post-op is complete local->transaction.resume (frame, this); } else { @@ -272,91 +317,23 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, return 0; } + int -afr_writev_wind (call_frame_t *frame, xlator_t *this) +afr_writev_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int i = 0; - int call_count = -1; - dict_t *xdata = NULL; - GF_UNUSED int ret = 0; local = frame->local; priv = this->private; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - local->replies = GF_CALLOC(priv->child_count, sizeof(*local->replies), - gf_afr_mt_reply_t); - if (!local->replies) { - local->op_ret = -1; - local->op_errno = ENOMEM; - local->transaction.unwind(frame, this); - local->transaction.resume(frame, this); - return 0; - } - - xdata = dict_new (); - if (xdata) { - ret = dict_set_uint32 (xdata, GLUSTERFS_OPEN_FD_COUNT, - sizeof (uint32_t)); - ret = dict_set_uint32 (xdata, GLUSTERFS_WRITE_IS_APPEND, - 0); - /* Set append_write to be true speculatively. If on any - server it turns not be true, we unset it in the - callback. - */ - local->append_write = _gf_true; - } - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_writev_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->writev, - local->fd, - local->cont.writev.vector, - local->cont.writev.count, - local->cont.writev.offset, - local->cont.writev.flags, - local->cont.writev.iobref, - xdata); - - if (!--call_count) - break; - } - } - - if (xdata) - dict_unref (xdata); - - return 0; -} - - -int -afr_writev_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - - local = frame->local; - - iobref_unref (local->cont.writev.iobref); - local->cont.writev.iobref = NULL; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - + STACK_WIND_COOKIE (frame, afr_writev_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->writev, + local->fd, local->cont.writev.vector, + local->cont.writev.count, local->cont.writev.offset, + local->cont.writev.flags, local->cont.writev.iobref, + local->xdata_req); return 0; } @@ -366,29 +343,29 @@ afr_do_writev (call_frame_t *frame, xlator_t *this) { call_frame_t *transaction_frame = NULL; afr_local_t *local = NULL; - int op_ret = -1; - int op_errno = 0; - - local = frame->local; + int ret = -1; + int op_errno = ENOMEM; transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; + if (!transaction_frame) goto out; - } + local = frame->local; transaction_frame->local = local; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + frame->local = NULL; - local->op = GF_FOP_WRITE; + if (!AFR_FRAME_INIT (frame, op_errno)) + goto out; - local->success_count = 0; + local->op = GF_FOP_WRITE; - local->transaction.fop = afr_writev_wind; - local->transaction.done = afr_writev_done; + local->transaction.wind = afr_writev_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_transaction_writev_unwind; local->transaction.main_frame = frame; + if (local->fd->flags & O_APPEND) { /* * Backend vfs ignores the 'offset' for append mode fd so @@ -405,179 +382,86 @@ afr_do_writev (call_frame_t *frame, xlator_t *this) local->cont.writev.count); } - op_ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); - if (op_ret < 0) { - op_errno = -op_ret; + ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; goto out; } - op_ret = 0; + return 0; out: - if (op_ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (writev, frame, op_ret, op_errno, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL); return 0; } -static void -afr_trigger_open_fd_self_heal (fd_t *fd, xlator_t *this) -{ - call_frame_t *frame = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - char *reason = NULL; - int32_t op_errno = 0; - int ret = 0; - - if (!fd || !fd->inode || uuid_is_null (fd->inode->gfid)) { - gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid args: " - "fd: %p, inode: %p", fd, - fd ? fd->inode : NULL); - goto out; - } - - frame = create_frame (this, this->ctx->pool); - if (!frame) - goto out; - - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - ret = afr_local_init (local, this->private, &op_errno); - if (ret < 0) - goto out; - - local->loc.inode = inode_ref (fd->inode); - ret = loc_path (&local->loc, NULL); - if (ret < 0) - goto out; - - sh = &local->self_heal; - sh->do_metadata_self_heal = _gf_true; - if (fd->inode->ia_type == IA_IFREG) - sh->do_data_self_heal = _gf_true; - else if (fd->inode->ia_type == IA_IFDIR) - sh->do_entry_self_heal = _gf_true; - - reason = "subvolume came online"; - afr_launch_self_heal (frame, this, fd->inode, _gf_true, - fd->inode->ia_type, reason, NULL, NULL); - return; -out: - AFR_STACK_DESTROY (frame); -} - -void -afr_open_fd_fix (fd_t *fd, xlator_t *this) -{ - int ret = 0; - int i = 0; - afr_fd_ctx_t *fd_ctx = NULL; - gf_boolean_t need_self_heal = _gf_false; - int *need_open = NULL; - size_t need_open_count = 0; - afr_private_t *priv = NULL; - - priv = this->private; - - if (!afr_is_fd_fixable (fd)) - goto out; - - fd_ctx = afr_fd_ctx_get (fd, this); - if (!fd_ctx) - goto out; - - LOCK (&fd->lock); - { - if (fd_ctx->up_count < priv->up_count) { - need_self_heal = _gf_true; - fd_ctx->up_count = priv->up_count; - fd_ctx->down_count = priv->down_count; - } - - need_open = alloca (priv->child_count * sizeof (*need_open)); - for (i = 0; i < priv->child_count; i++) { - need_open[i] = 0; - if (fd_ctx->opened_on[i] != AFR_FD_NOT_OPENED) - continue; - - if (!priv->child_up[i]) - continue; - - fd_ctx->opened_on[i] = AFR_FD_OPENING; - - need_open[i] = 1; - need_open_count++; - } - } - UNLOCK (&fd->lock); - if (ret) - goto out; - - if (need_self_heal) - afr_trigger_open_fd_self_heal (fd, this); - - if (!need_open_count) - goto out; - - afr_fix_open (this, fd, need_open_count, need_open); -out: - return; -} int afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, int32_t count, off_t offset, uint32_t flags, struct iobref *iobref, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int op_errno = ENOMEM; priv = this->private; - if (afr_is_split_brain (this, fd->inode)) { - op_errno = EIO; - goto out; - } - QUORUM_CHECK(writev,out); - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - local->cont.writev.vector = iov_dup (vector, count); + local->cont.writev.vector = iov_dup (vector, count); + if (!local->cont.writev.vector) + goto out; local->cont.writev.count = count; local->cont.writev.offset = offset; local->cont.writev.flags = flags; local->cont.writev.iobref = iobref_ref (iobref); - local->fd = fd_ref (fd); + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); + + if (!local->xdata_req) + goto out; + + local->fd = fd_ref (fd); + local->inode = inode_ref (fd->inode); + + if (dict_set_uint32 (local->xdata_req, GLUSTERFS_OPEN_FD_COUNT, 4)) { + op_errno = ENOMEM; + goto out; + } + + if (dict_set_uint32 (local->xdata_req, GLUSTERFS_WRITE_IS_APPEND, 4)) { + op_errno = ENOMEM; + goto out; + } + + /* Set append_write to be true speculatively. If on any + server it turns not be true, we unset it in the + callback. + */ + local->append_write = _gf_true; /* detect here, but set it in writev_wind_cbk *after* the unstable write is performed */ local->stable_write = !!((fd->flags|flags)&(O_SYNC|O_DSYNC)); - afr_open_fd_fix (fd, this); + afr_fix_open (fd, this); afr_do_writev (frame, this); - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL); + AFR_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL); return 0; } @@ -595,22 +479,13 @@ afr_truncate_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (truncate, main_frame, local->op_ret, - local->op_errno, - &local->cont.inode_wfop.prebuf, - &local->cont.inode_wfop.postbuf, - NULL); - } + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; + AFR_STACK_UNWIND (truncate, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); return 0; } @@ -620,96 +495,32 @@ afr_truncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) { - afr_local_t * local = NULL; - int child_index = (long) cookie; - int read_child = 0; - int call_count = -1; - - local = frame->local; - - read_child = afr_inode_get_read_ctx (this, local->loc.inode, NULL); + afr_local_t *local = NULL; - LOCK (&frame->lock); - { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } + local = frame->local; - if (op_ret != -1) { - if (prebuf->ia_size != postbuf->ia_size) - local->stable_write = _gf_false; - } - __inode_write_fop_cbk (frame, child_index, read_child, this, - &op_ret, &op_errno, prebuf, postbuf, - xdata); - } - UNLOCK (&frame->lock); + if (op_ret == 0 && prebuf->ia_size != postbuf->ia_size) + local->stable_write = _gf_false; - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if (local->stable_write && afr_txn_nothing_failed (frame, this)) - local->transaction.unwind (frame, this); - - local->transaction.resume (frame, this); - } - - return 0; + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + prebuf, postbuf, xdata); } -int32_t -afr_truncate_wind (call_frame_t *frame, xlator_t *this) +int +afr_truncate_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; local = frame->local; priv = this->private; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - local->stable_write = _gf_true; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_truncate_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->truncate, - &local->loc, - local->cont.truncate.offset, - NULL); - - if (!--call_count) - break; - } - } - - return 0; -} - - -int -afr_truncate_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - - local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - + STACK_WIND_COOKIE (frame, afr_truncate_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->truncate, + &local->loc, local->cont.truncate.offset, + local->xdata_req); return 0; } @@ -721,56 +532,60 @@ afr_truncate (call_frame_t *frame, xlator_t *this, afr_private_t * priv = NULL; afr_local_t * local = NULL; call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int ret = -1; + int op_errno = ENOMEM; priv = this->private; QUORUM_CHECK(truncate,out); transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; + if (!transaction_frame) goto out; - } - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; local->cont.truncate.offset = offset; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); + + if (!local->xdata_req) + goto out; - local->transaction.fop = afr_truncate_wind; - local->transaction.done = afr_truncate_done; + local->transaction.wind = afr_truncate_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_truncate_unwind; loc_copy (&local->loc, loc); + local->inode = inode_ref (loc->inode); + + local->op = GF_FOP_TRUNCATE; local->transaction.main_frame = frame; local->transaction.start = offset; local->transaction.len = 0; + /* Set it true speculatively, will get reset in afr_truncate_wind_cbk + if truncate was not a NOP */ + local->stable_write = _gf_true; + ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); if (ret < 0) { - op_errno = -ret; - goto out; + op_errno = -ret; + goto out; } - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL, NULL); return 0; } @@ -788,21 +603,13 @@ afr_ftruncate_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; - if (main_frame) { - AFR_STACK_UNWIND (ftruncate, main_frame, local->op_ret, - local->op_errno, - &local->cont.inode_wfop.prebuf, - &local->cont.inode_wfop.postbuf, - NULL); - } + AFR_STACK_UNWIND (ftruncate, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); return 0; } @@ -812,122 +619,75 @@ afr_ftruncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) { - afr_local_t * local = NULL; - int child_index = (long) cookie; - int call_count = -1; - int read_child = 0; - - local = frame->local; - - read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); - - LOCK (&frame->lock); - { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } - - if (op_ret != -1) { - if (prebuf->ia_size != postbuf->ia_size) - local->stable_write = _gf_false; - } - __inode_write_fop_cbk (frame, child_index, read_child, this, - &op_ret, &op_errno, prebuf, postbuf, - xdata); - } - UNLOCK (&frame->lock); + afr_local_t *local = NULL; - call_count = afr_frame_return (frame); + local = frame->local; - if (call_count == 0) { - if (local->stable_write && afr_txn_nothing_failed (frame, this)) - local->transaction.unwind (frame, this); + if (op_ret == 0 && prebuf->ia_size != postbuf->ia_size) + local->stable_write = _gf_false; - local->transaction.resume (frame, this); - } - - return 0; + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + prebuf, postbuf, xdata); } int -afr_ftruncate_wind (call_frame_t *frame, xlator_t *this) +afr_ftruncate_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; - - local = frame->local; - priv = this->private; - - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - local->call_count = call_count; - local->stable_write = _gf_true; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_ftruncate_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->ftruncate, - local->fd, - local->cont.ftruncate.offset, - NULL); - - if (!--call_count) - break; - } - } + local = frame->local; + priv = this->private; + STACK_WIND_COOKIE (frame, afr_ftruncate_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->ftruncate, + local->fd, local->cont.ftruncate.offset, + local->xdata_req); return 0; } int -afr_ftruncate_done (call_frame_t *frame, xlator_t *this) +afr_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) { + afr_private_t *priv = NULL; afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - local = frame->local; - - local->transaction.unwind (frame, this); + priv = this->private; - AFR_STACK_DESTROY (frame); + QUORUM_CHECK(ftruncate,out); - return 0; -} + transaction_frame = copy_frame (frame); + if (!frame) + goto out; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; -int -afr_do_ftruncate (call_frame_t *frame, xlator_t *this) -{ - call_frame_t * transaction_frame = NULL; - afr_local_t * local = NULL; - int op_ret = -1; - int op_errno = 0; + local->cont.ftruncate.offset = offset; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - local = frame->local; + if (!local->xdata_req) + goto out; - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - goto out; - } - - transaction_frame->local = local; - frame->local = NULL; + local->fd = fd_ref (fd); + local->inode = inode_ref (fd->inode); local->op = GF_FOP_FTRUNCATE; - local->transaction.fop = afr_ftruncate_wind; - local->transaction.done = afr_ftruncate_done; + local->transaction.wind = afr_ftruncate_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_ftruncate_unwind; local->transaction.main_frame = frame; @@ -935,69 +695,21 @@ afr_do_ftruncate (call_frame_t *frame, xlator_t *this) local->transaction.start = local->cont.ftruncate.offset; local->transaction.len = 0; - op_ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); - if (op_ret < 0) { - op_errno = -op_ret; - goto out; - } + afr_fix_open (fd, this); - op_ret = 0; -out: - if (op_ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (ftruncate, frame, op_ret, op_errno, NULL, - NULL, NULL); - } - - return 0; -} - - -int -afr_ftruncate (call_frame_t *frame, xlator_t *this, - fd_t *fd, off_t offset, dict_t *xdata) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; + /* Set it true speculatively, will get reset in afr_ftruncate_wind_cbk + if truncate was not a NOP */ + local->stable_write = _gf_true; - if (afr_is_split_brain (this, fd->inode)) { - op_errno = EIO; - goto out; + ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; } - QUORUM_CHECK(ftruncate,out); - - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; - local->cont.ftruncate.offset = offset; - - local->fd = fd_ref (fd); - - afr_open_fd_fix (fd, this); - - afr_do_ftruncate (frame, this); - - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL, NULL); - } + AFR_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL, NULL); return 0; } @@ -1009,173 +721,92 @@ out: int afr_setattr_unwind (call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (setattr, main_frame, local->op_ret, - local->op_errno, - &local->cont.inode_wfop.prebuf, - &local->cont.inode_wfop.postbuf, - NULL); - } + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; + AFR_STACK_UNWIND (setattr, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, + local->xdata_rsp); return 0; } int afr_setattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, + int op_ret, int op_errno, struct iatt *preop, struct iatt *postop, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int child_index = (long) cookie; - int read_child = 0; - int call_count = -1; - int need_unwind = 0; - - local = frame->local; - priv = this->private; - - read_child = afr_inode_get_read_ctx (this, local->loc.inode, NULL); - - LOCK (&frame->lock); - { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } - - __inode_write_fop_cbk (frame, child_index, read_child, this, - &op_ret, &op_errno, preop, postop, - xdata); - - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } - } - UNLOCK (&frame->lock); - - if (need_unwind) - local->transaction.unwind (frame, this); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.resume (frame, this); - } - - return 0; + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + preop, postop, xdata); } -int32_t -afr_setattr_wind (call_frame_t *frame, xlator_t *this) +int +afr_setattr_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; local = frame->local; priv = this->private; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_setattr_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->setattr, - &local->loc, - &local->cont.setattr.in_buf, - local->cont.setattr.valid, - NULL); - - if (!--call_count) - break; - } - } - + STACK_WIND_COOKIE (frame, afr_setattr_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->setattr, + &local->loc, &local->cont.setattr.in_buf, + local->cont.setattr.valid, local->xdata_req); return 0; } int -afr_setattr_done (call_frame_t *frame, xlator_t *this) +afr_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *buf, + int32_t valid, dict_t *xdata) { + afr_private_t *priv = NULL; afr_local_t *local = NULL; - - local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - - return 0; -} - - -int -afr_setattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, struct iatt *buf, int32_t valid, dict_t *xdata) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t *transaction_frame = NULL; + call_frame_t *transaction_frame = NULL; int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int op_errno = ENOMEM; priv = this->private; QUORUM_CHECK(setattr,out); transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; + if (!transaction_frame) goto out; - } - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; local->cont.setattr.in_buf = *buf; local->cont.setattr.valid = valid; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - local->transaction.fop = afr_setattr_wind; - local->transaction.done = afr_setattr_done; + if (!local->xdata_req) + goto out; + + local->transaction.wind = afr_setattr_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_setattr_unwind; loc_copy (&local->loc, loc); + local->inode = inode_ref (loc->inode); + + local->op = GF_FOP_SETATTR; local->transaction.main_frame = frame; local->transaction.start = LLONG_MAX - 1; @@ -1183,18 +814,16 @@ afr_setattr (call_frame_t *frame, xlator_t *this, ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); if (ret < 0) { - op_errno = -ret; - goto out; + op_errno = -ret; + goto out; } - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (setattr, frame, -1, op_errno, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (setattr, frame, -1, op_errno, NULL, NULL, NULL); return 0; } @@ -1208,22 +837,13 @@ afr_fsetattr_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (fsetattr, main_frame, local->op_ret, - local->op_errno, - &local->cont.inode_wfop.prebuf, - &local->cont.inode_wfop.postbuf, - NULL); - } + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; + AFR_STACK_UNWIND (fsetattr, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); return 0; } @@ -1233,149 +853,72 @@ afr_fsetattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *preop, struct iatt *postop, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int child_index = (long) cookie; - int read_child = 0; - int call_count = -1; - int need_unwind = 0; - - local = frame->local; - priv = this->private; - - read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); - - LOCK (&frame->lock); - { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } - - __inode_write_fop_cbk (frame, child_index, read_child, this, - &op_ret, &op_errno, preop, postop, - xdata); - - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } - } - UNLOCK (&frame->lock); - - if (need_unwind) - local->transaction.unwind (frame, this); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.resume (frame, this); - } - - return 0; + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + preop, postop, xdata); } -int32_t -afr_fsetattr_wind (call_frame_t *frame, xlator_t *this) +int +afr_fsetattr_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; local = frame->local; priv = this->private; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_fsetattr_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fsetattr, - local->fd, - &local->cont.fsetattr.in_buf, - local->cont.fsetattr.valid, - NULL); - - if (!--call_count) - break; - } - } - + STACK_WIND_COOKIE (frame, afr_fsetattr_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->fsetattr, + local->fd, &local->cont.fsetattr.in_buf, + local->cont.fsetattr.valid, local->xdata_req); return 0; } int -afr_fsetattr_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - - local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - - return 0; -} - -int afr_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *buf, int32_t valid, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t *transaction_frame = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int op_errno = ENOMEM; priv = this->private; - if (afr_is_split_brain (this, fd->inode)) { - op_errno = EIO; - goto out; - } - QUORUM_CHECK(fsetattr,out); transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; + if (!transaction_frame) goto out; - } - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; local->cont.fsetattr.in_buf = *buf; local->cont.fsetattr.valid = valid; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - local->transaction.fop = afr_fsetattr_wind; - local->transaction.done = afr_fsetattr_done; + if (!local->xdata_req) + goto out; + + local->transaction.wind = afr_fsetattr_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_fsetattr_unwind; local->fd = fd_ref (fd); + local->inode = inode_ref (fd->inode); + + local->op = GF_FOP_FSETATTR; - afr_open_fd_fix (fd, this); + afr_fix_open (fd, this); local->transaction.main_frame = frame; local->transaction.start = LLONG_MAX - 1; @@ -1383,18 +926,16 @@ afr_fsetattr (call_frame_t *frame, xlator_t *this, ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); if (ret < 0) { - op_errno = -ret; - goto out; + op_errno = -ret; + goto out; } - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (fsetattr, frame, -1, op_errno, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (fsetattr, frame, -1, op_errno, NULL, NULL, NULL); return 0; } @@ -1410,19 +951,12 @@ afr_setxattr_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; - if (main_frame) { - AFR_STACK_UNWIND (setxattr, main_frame, - local->op_ret, local->op_errno, - NULL); - } + AFR_STACK_UNWIND (setxattr, main_frame, local->op_ret, local->op_errno, + local->xdata_rsp); return 0; } @@ -1431,95 +965,32 @@ int afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - int need_unwind = 0; - int child_index = (long) cookie; - - local = frame->local; - priv = this->private; - - LOCK (&frame->lock); - { - __inode_write_fop_cbk (frame, child_index, -1, this, - &op_ret, &op_errno, NULL, NULL, - xdata); - if (local->success_count == priv->child_count) { - need_unwind = 1; - } - } - UNLOCK (&frame->lock); - - if (need_unwind) - local->transaction.unwind (frame, this); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.resume (frame, this); - } - - return 0; + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + NULL, NULL, xdata); } int -afr_setxattr_wind (call_frame_t *frame, xlator_t *this) +afr_setxattr_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; local = frame->local; priv = this->private; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_setxattr_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->setxattr, - &local->loc, - local->cont.setxattr.dict, - local->cont.setxattr.flags, - NULL); - - if (!--call_count) - break; - } - } - + STACK_WIND_COOKIE (frame, afr_setxattr_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->setxattr, + &local->loc, local->cont.setxattr.dict, + local->cont.setxattr.flags, local->xdata_req); return 0; } int -afr_setxattr_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - - return 0; -} - -int -afr_setxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *dict, int32_t flags, dict_t *xdata) +afr_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int32_t flags, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; @@ -1527,59 +998,60 @@ afr_setxattr (call_frame_t *frame, xlator_t *this, int ret = -1; int op_errno = EINVAL; - VALIDATE_OR_GOTO (this, out); - GF_IF_INTERNAL_XATTR_GOTO ("trusted.afr.*", dict, op_errno, out); GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.afr.*", dict, op_errno, out); - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this->private, out); - priv = this->private; QUORUM_CHECK(setxattr,out); + transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; + if (!transaction_frame) goto out; - } - - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) goto out; local->cont.setxattr.dict = dict_ref (dict); local->cont.setxattr.flags = flags; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - local->transaction.fop = afr_setxattr_wind; - local->transaction.done = afr_setxattr_done; + if (!local->xdata_req) + goto out; + + local->transaction.wind = afr_setxattr_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_setxattr_unwind; loc_copy (&local->loc, loc); + local->inode = inode_ref (loc->inode); local->transaction.main_frame = frame; local->transaction.start = LLONG_MAX - 1; local->transaction.len = 0; + local->op = GF_FOP_SETXATTR; + ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); if (ret < 0) { - op_errno = -ret; - goto out; + op_errno = -ret; + goto out; } - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + + AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); return 0; } @@ -1595,19 +1067,12 @@ afr_fsetxattr_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; - if (main_frame) { - AFR_STACK_UNWIND (fsetxattr, main_frame, - local->op_ret, local->op_errno, - NULL); - } + AFR_STACK_UNWIND (fsetxattr, main_frame, local->op_ret, local->op_errno, + local->xdata_rsp); return 0; } @@ -1616,94 +1081,30 @@ int afr_fsetxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - int need_unwind = 0; - int child_index = (long) cookie; - - local = frame->local; - priv = this->private; - - LOCK (&frame->lock); - { - - __inode_write_fop_cbk (frame, child_index, -1, this, - &op_ret, &op_errno, NULL, NULL, - xdata); - if (local->success_count == priv->child_count) { - need_unwind = 1; - } - } - UNLOCK (&frame->lock); - - if (need_unwind) - local->transaction.unwind (frame, this); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.resume (frame, this); - } - - return 0; + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + NULL, NULL, xdata); } int -afr_fsetxattr_wind (call_frame_t *frame, xlator_t *this) +afr_fsetxattr_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; local = frame->local; priv = this->private; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_fsetxattr_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fsetxattr, - local->fd, - local->cont.fsetxattr.dict, - local->cont.fsetxattr.flags, - NULL); - - if (!--call_count) - break; - } - } - + STACK_WIND_COOKIE (frame, afr_fsetxattr_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->fsetxattr, + local->fd, local->cont.fsetxattr.dict, + local->cont.fsetxattr.flags, local->xdata_req); return 0; } int -afr_fsetxattr_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - - return 0; -} - -int afr_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, int32_t flags, dict_t *xdata) { @@ -1711,11 +1112,7 @@ afr_fsetxattr (call_frame_t *frame, xlator_t *this, afr_local_t *local = NULL; call_frame_t *transaction_frame = NULL; int ret = -1; - int op_errno = EINVAL; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int op_errno = ENOMEM; GF_IF_INTERNAL_XATTR_GOTO ("trusted.afr.*", dict, op_errno, out); @@ -1725,36 +1122,36 @@ afr_fsetxattr (call_frame_t *frame, xlator_t *this, priv = this->private; - if (afr_is_split_brain (this, fd->inode)) { - op_errno = EIO; - goto out; - } - QUORUM_CHECK(fsetxattr,out); - AFR_LOCAL_ALLOC_OR_GOTO (local, out); - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; - transaction_frame = copy_frame (frame); - if (!transaction_frame) { + if (!transaction_frame) goto out; - } - - transaction_frame->local = local; - local->op_ret = -1; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; local->cont.fsetxattr.dict = dict_ref (dict); local->cont.fsetxattr.flags = flags; - local->transaction.fop = afr_fsetxattr_wind; - local->transaction.done = afr_fsetxattr_done; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); + + if (!local->xdata_req) + goto out; + + local->transaction.wind = afr_fsetxattr_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_fsetxattr_unwind; local->fd = fd_ref (fd); + local->inode = inode_ref (fd->inode); + + local->op = GF_FOP_FSETXATTR; local->transaction.main_frame = frame; local->transaction.start = LLONG_MAX - 1; @@ -1762,18 +1159,16 @@ afr_fsetxattr (call_frame_t *frame, xlator_t *this, ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); if (ret < 0) { - op_errno = -ret; - goto out; + op_errno = -ret; + goto out; } - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL); return 0; } @@ -1791,19 +1186,12 @@ afr_removexattr_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; - if (main_frame) { - AFR_STACK_UNWIND (removexattr, main_frame, - local->op_ret, local->op_errno, - NULL); - } + AFR_STACK_UNWIND (removexattr, main_frame, local->op_ret, local->op_errno, + local->xdata_rsp); return 0; } @@ -1812,88 +1200,25 @@ int afr_removexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - int need_unwind = 0; - int child_index = (long) cookie; - - local = frame->local; - priv = this->private; - - LOCK (&frame->lock); - { - __inode_write_fop_cbk (frame, child_index, -1, this, - &op_ret, &op_errno, NULL, NULL, - xdata); - if (local->success_count == priv->wait_count) { - need_unwind = 1; - } - } - UNLOCK (&frame->lock); - - if (need_unwind) - local->transaction.unwind (frame, this); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.resume (frame, this); - } - - return 0; + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + NULL, NULL, xdata); } -int32_t -afr_removexattr_wind (call_frame_t *frame, xlator_t *this) +int +afr_removexattr_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; - - local = frame->local; - priv = this->private; - - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_removexattr_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->removexattr, - &local->loc, - local->cont.removexattr.name, - NULL); - - if (!--call_count) - break; - } - } - - return 0; -} - - -int -afr_removexattr_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); + local = frame->local; + priv = this->private; + STACK_WIND_COOKIE (frame, afr_removexattr_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->removexattr, + &local->loc, local->cont.removexattr.name, + local->xdata_req); return 0; } @@ -1906,9 +1231,7 @@ afr_removexattr (call_frame_t *frame, xlator_t *this, afr_local_t *local = NULL; call_frame_t *transaction_frame = NULL; int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (this, out); + int op_errno = ENOMEM; GF_IF_NATIVE_XATTR_GOTO ("trusted.afr.*", name, op_errno, out); @@ -1916,34 +1239,37 @@ afr_removexattr (call_frame_t *frame, xlator_t *this, GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.afr.*", name, op_errno, out); - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (loc, out); - priv = this->private; QUORUM_CHECK(removexattr,out); transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; + if (!transaction_frame) goto out; - } - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; local->cont.removexattr.name = gf_strdup (name); - local->transaction.fop = afr_removexattr_wind; - local->transaction.done = afr_removexattr_done; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); + + if (!local->xdata_req) + goto out; + + local->transaction.wind = afr_removexattr_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_removexattr_unwind; loc_copy (&local->loc, loc); + local->inode = inode_ref (loc->inode); + + local->op = GF_FOP_REMOVEXATTR; local->transaction.main_frame = frame; local->transaction.start = LLONG_MAX - 1; @@ -1951,18 +1277,16 @@ afr_removexattr (call_frame_t *frame, xlator_t *this, ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); if (ret < 0) { - op_errno = -ret; - goto out; + op_errno = -ret; + goto out; } - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (removexattr, frame, -1, op_errno, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (removexattr, frame, -1, op_errno, NULL); return 0; } @@ -1975,19 +1299,12 @@ afr_fremovexattr_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; - if (main_frame) { - AFR_STACK_UNWIND (fremovexattr, main_frame, - local->op_ret, local->op_errno, - NULL); - } + AFR_STACK_UNWIND (fremovexattr, main_frame, local->op_ret, local->op_errno, + local->xdata_rsp); return 0; } @@ -1996,105 +1313,38 @@ int afr_fremovexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int call_count = -1; - int need_unwind = 0; - int child_index = (long) cookie; - - local = frame->local; - priv = this->private; - - LOCK (&frame->lock); - { - __inode_write_fop_cbk (frame, child_index, -1, this, - &op_ret, &op_errno, NULL, NULL, - xdata); - - if (local->success_count == priv->wait_count) { - need_unwind = 1; - } - } - UNLOCK (&frame->lock); - - if (need_unwind) - local->transaction.unwind (frame, this); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.resume (frame, this); - } - - return 0; + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + NULL, NULL, xdata); } -int32_t -afr_fremovexattr_wind (call_frame_t *frame, xlator_t *this) +int +afr_fremovexattr_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; local = frame->local; priv = this->private; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_fremovexattr_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fremovexattr, - local->fd, - local->cont.removexattr.name, - NULL); - - if (!--call_count) - break; - } - } - + STACK_WIND_COOKIE (frame, afr_fremovexattr_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->fremovexattr, + local->fd, local->cont.removexattr.name, + local->xdata_req); return 0; } int -afr_fremovexattr_done (call_frame_t *frame, xlator_t *this) +afr_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata) { - afr_local_t * local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - - return 0; -} - - -int -afr_fremovexattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, const char *name, dict_t *xdata) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t *transaction_frame = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; int ret = -1; - int op_ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (this, out); + int op_errno = ENOMEM; GF_IF_NATIVE_XATTR_GOTO ("trusted.afr.*", name, op_errno, out); @@ -2102,64 +1352,59 @@ afr_fremovexattr (call_frame_t *frame, xlator_t *this, GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.afr.*", name, op_errno, out); - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - if (afr_is_split_brain (this, fd->inode)) { - op_errno = EIO; - goto out; - } + priv = this->private; QUORUM_CHECK(fremovexattr, out); transaction_frame = copy_frame (frame); - if (!transaction_frame) { + if (!transaction_frame) goto out; - } - AFR_LOCAL_ALLOC_OR_GOTO (local, out); - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) { - op_errno = -ret; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) goto out; - } - - transaction_frame->local = local; - - local->op_ret = -1; local->cont.removexattr.name = gf_strdup (name); + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - local->transaction.fop = afr_fremovexattr_wind; - local->transaction.done = afr_fremovexattr_done; + if (!local->xdata_req) + goto out; + + local->transaction.wind = afr_fremovexattr_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_fremovexattr_unwind; local->fd = fd_ref (fd); + local->inode = inode_ref (fd->inode); + + local->op = GF_FOP_FREMOVEXATTR; local->transaction.main_frame = frame; local->transaction.start = LLONG_MAX - 1; local->transaction.len = 0; - op_ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); - if (op_ret < 0) { - op_errno = -op_ret; - goto out; + ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; } - op_ret = 0; + return 0; out: - if (op_ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (fremovexattr, frame, op_ret, op_errno, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + + AFR_STACK_UNWIND (fremovexattr, frame, -1, op_errno, NULL); return 0; } -static int + +int afr_fallocate_unwind (call_frame_t *frame, xlator_t *this) { afr_local_t * local = NULL; @@ -2167,147 +1412,88 @@ afr_fallocate_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; - if (main_frame) { - AFR_STACK_UNWIND (fallocate, main_frame, local->op_ret, - local->op_errno, - &local->cont.inode_wfop.prebuf, - &local->cont.inode_wfop.postbuf, - NULL); - } + AFR_STACK_UNWIND (fallocate, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); return 0; } -static int + +int afr_fallocate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int child_index = (long) cookie; - int call_count = -1; - int need_unwind = 0; - int read_child = 0; - - local = frame->local; - priv = this->private; - - read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); - - LOCK (&frame->lock); - { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } - - __inode_write_fop_cbk (frame, child_index, read_child, this, - &op_ret, &op_errno, prebuf, postbuf, - xdata); - - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } - } - UNLOCK (&frame->lock); - - if (need_unwind) - local->transaction.unwind (frame, this); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.resume (frame, this); - } - - return 0; + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + prebuf, postbuf, xdata); } -static int -afr_fallocate_wind (call_frame_t *frame, xlator_t *this) + +int +afr_fallocate_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; local = frame->local; priv = this->private; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_fallocate_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fallocate, - local->fd, - local->cont.fallocate.mode, - local->cont.fallocate.offset, - local->cont.fallocate.len, - NULL); - - if (!--call_count) - break; - } - } - + STACK_WIND_COOKIE (frame, afr_fallocate_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->fallocate, + local->fd, local->cont.fallocate.mode, + local->cont.fallocate.offset, + local->cont.fallocate.len, local->xdata_req); return 0; } -static int -afr_fallocate_done (call_frame_t *frame, xlator_t *this) + +int +afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata) { + afr_private_t *priv = NULL; + call_frame_t *transaction_frame = NULL; afr_local_t *local = NULL; + int ret = -1; + int op_errno = ENOMEM; - local = frame->local; + priv = this->private; - local->transaction.unwind (frame, this); + QUORUM_CHECK(fallocate,out); - AFR_STACK_DESTROY (frame); + transaction_frame = copy_frame (frame); + if (!transaction_frame) + goto out; - return 0; -} + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; -static int -afr_do_fallocate (call_frame_t *frame, xlator_t *this) -{ - call_frame_t * transaction_frame = NULL; - afr_local_t * local = NULL; - int op_ret = -1; - int op_errno = 0; + local->cont.fallocate.mode = mode; + local->cont.fallocate.offset = offset; + local->cont.fallocate.len = len; - local = frame->local; + local->fd = fd_ref (fd); + local->inode = inode_ref (fd->inode); - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - goto out; - } + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - transaction_frame->local = local; - frame->local = NULL; + if (!local->xdata_req) + goto out; local->op = GF_FOP_FALLOCATE; - local->transaction.fop = afr_fallocate_wind; - local->transaction.done = afr_fallocate_done; + local->transaction.wind = afr_fallocate_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_fallocate_unwind; local->transaction.main_frame = frame; @@ -2315,80 +1501,29 @@ afr_do_fallocate (call_frame_t *frame, xlator_t *this) local->transaction.start = local->cont.fallocate.offset; local->transaction.len = 0; - /* fallocate can modify the file size */ - op_ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); - if (op_ret < 0) { - op_errno = -op_ret; - goto out; - } - - op_ret = 0; -out: - if (op_ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (fallocate, frame, op_ret, op_errno, NULL, - NULL, NULL); - } - - return 0; -} - -int -afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, - off_t offset, size_t len, dict_t *xdata) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + afr_fix_open (fd, this); - priv = this->private; - - if (afr_is_split_brain (this, fd->inode)) { - op_errno = EIO; - goto out; + ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; } - QUORUM_CHECK(fallocate,out); - - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; - local->cont.fallocate.mode = mode; - local->cont.fallocate.offset = offset; - local->cont.fallocate.len = len; - - local->fd = fd_ref (fd); - - afr_open_fd_fix (fd, this); - - afr_do_fallocate (frame, this); - - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL); return 0; } + /* }}} */ /* {{{ discard */ -static int +int afr_discard_unwind (call_frame_t *frame, xlator_t *this) { afr_local_t * local = NULL; @@ -2396,146 +1531,86 @@ afr_discard_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; - if (main_frame) { - AFR_STACK_UNWIND (discard, main_frame, local->op_ret, - local->op_errno, - &local->cont.inode_wfop.prebuf, - &local->cont.inode_wfop.postbuf, - NULL); - } + AFR_STACK_UNWIND (discard, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); return 0; } -static int + +int afr_discard_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int child_index = (long) cookie; - int call_count = -1; - int need_unwind = 0; - int read_child = 0; - - local = frame->local; - priv = this->private; - - read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); - - LOCK (&frame->lock); - { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } - - __inode_write_fop_cbk (frame, child_index, read_child, this, - &op_ret, &op_errno, prebuf, postbuf, - xdata); - - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } - } - UNLOCK (&frame->lock); - - if (need_unwind) - local->transaction.unwind (frame, this); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.resume (frame, this); - } - - return 0; + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + prebuf, postbuf, xdata); } -static int -afr_discard_wind (call_frame_t *frame, xlator_t *this) + +int +afr_discard_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; local = frame->local; priv = this->private; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_discard_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->discard, - local->fd, - local->cont.discard.offset, - local->cont.discard.len, - NULL); - - if (!--call_count) - break; - } - } - + STACK_WIND_COOKIE (frame, afr_discard_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->discard, + local->fd, local->cont.discard.offset, + local->cont.discard.len, local->xdata_req); return 0; } -static int -afr_discard_done (call_frame_t *frame, xlator_t *this) + +int +afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) { + afr_private_t *priv = NULL; afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - local = frame->local; + priv = this->private; - local->transaction.unwind (frame, this); + QUORUM_CHECK(discard, out); - AFR_STACK_DESTROY (frame); + transaction_frame = copy_frame (frame); + if (!transaction_frame) + goto out; - return 0; -} + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; -static int -afr_do_discard (call_frame_t *frame, xlator_t *this) -{ - call_frame_t * transaction_frame = NULL; - afr_local_t * local = NULL; - int op_ret = -1; - int op_errno = 0; + local->cont.discard.offset = offset; + local->cont.discard.len = len; - local = frame->local; + local->fd = fd_ref (fd); + local->inode = inode_ref (fd->inode); - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - goto out; - } + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - transaction_frame->local = local; - frame->local = NULL; + if (!local->xdata_req) + goto out; local->op = GF_FOP_DISCARD; - local->transaction.fop = afr_discard_wind; - local->transaction.done = afr_discard_done; + local->transaction.wind = afr_discard_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_discard_unwind; local->transaction.main_frame = frame; @@ -2543,316 +1618,134 @@ afr_do_discard (call_frame_t *frame, xlator_t *this) local->transaction.start = local->cont.discard.offset; local->transaction.len = 0; - op_ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); - if (op_ret < 0) { - op_errno = -op_ret; - goto out; - } + afr_fix_open (fd, this); - op_ret = 0; -out: - if (op_ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (discard, frame, op_ret, op_errno, NULL, - NULL, NULL); - } - - return 0; -} - -int -afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, - size_t len, dict_t *xdata) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - - if (afr_is_split_brain (this, fd->inode)) { - op_errno = EIO; - goto out; + ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; } - QUORUM_CHECK(discard, out); - - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; - local->cont.discard.offset = offset; - local->cont.discard.len = len; - - local->fd = fd_ref (fd); - - afr_open_fd_fix (fd, this); - - afr_do_discard(frame, this); - - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL); return 0; } /* {{{ zerofill */ -static int +int afr_zerofill_unwind (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - call_frame_t *main_frame = NULL; + afr_local_t * local = NULL; + call_frame_t *main_frame = NULL; local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; - if (main_frame) { - AFR_STACK_UNWIND (zerofill, main_frame, local->op_ret, - local->op_errno, - &local->cont.zerofill.prebuf, - &local->cont.zerofill.postbuf, - NULL); - } + AFR_STACK_UNWIND (discard, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); return 0; } -static int -afr_zerofill_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int child_index = (long) cookie; - int call_count = -1; - int need_unwind = 0; - int read_child = 0; - - local = frame->local; - priv = this->private; - - read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); - - LOCK (&frame->lock); - { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } - - if (afr_fop_failed (op_ret, op_errno)) { - afr_transaction_fop_failed (frame, this, child_index); - } - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.zerofill.prebuf = *prebuf; - local->cont.zerofill.postbuf = *postbuf; - } - - if (child_index == read_child) { - local->cont.zerofill.prebuf = *prebuf; - local->cont.zerofill.postbuf = *postbuf; - } - - local->success_count++; - - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } - } - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - if (need_unwind) { - local->transaction.unwind (frame, this); - } - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.resume (frame, this); - } - - return 0; +int +afr_zerofill_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + prebuf, postbuf, xdata); } -static int -afr_zerofill_wind (call_frame_t *frame, xlator_t *this) + +int +afr_zerofill_wind (call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; local = frame->local; priv = this->private; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_zerofill_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->zerofill, - local->fd, - local->cont.zerofill.offset, - local->cont.zerofill.len, - NULL); - - if (!--call_count) - break; - } - } - + STACK_WIND_COOKIE (frame, afr_zerofill_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->zerofill, + local->fd, local->cont.zerofill.offset, + local->cont.zerofill.len, local->xdata_req); return 0; } -static int -afr_zerofill_done (call_frame_t *frame, xlator_t *this) +int +afr_zerofill (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) { + afr_private_t *priv = NULL; afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - local = frame->local; + priv = this->private; - local->transaction.unwind (frame, this); + QUORUM_CHECK(discard, out); - AFR_STACK_DESTROY (frame); + transaction_frame = copy_frame (frame); + if (!transaction_frame) + goto out; - return 0; -} + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; -static int -afr_do_zerofill(call_frame_t *frame, xlator_t *this) -{ - call_frame_t *transaction_frame = NULL; - afr_local_t *local = NULL; - int op_ret = -1; - int op_errno = 0; + local->cont.zerofill.offset = offset; + local->cont.zerofill.len = len; - local = frame->local; + local->fd = fd_ref (fd); + local->inode = inode_ref (fd->inode); - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - goto out; - } + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - transaction_frame->local = local; - frame->local = NULL; + if (!local->xdata_req) + goto out; local->op = GF_FOP_ZEROFILL; - local->transaction.fop = afr_zerofill_wind; - local->transaction.done = afr_zerofill_done; + local->transaction.wind = afr_zerofill_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_zerofill_unwind; local->transaction.main_frame = frame; - local->transaction.start = local->cont.zerofill.offset; - local->transaction.len = 0; - - op_ret = afr_transaction (transaction_frame, this, - AFR_DATA_TRANSACTION); - if (op_ret < 0) { - op_errno = -op_ret; - goto out; - } - - op_ret = 0; -out: - if (op_ret < 0) { - if (transaction_frame) { - AFR_STACK_DESTROY (transaction_frame); - } - AFR_STACK_UNWIND (zerofill, frame, op_ret, op_errno, NULL, - NULL, NULL); - } - - return 0; -} - -int -afr_zerofill (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, - off_t len, dict_t *xdata) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - - if (afr_is_split_brain (this, fd->inode)) { - op_errno = EIO; - goto out; - } - QUORUM_CHECK(zerofill, out); + local->transaction.start = local->cont.discard.offset; + local->transaction.len = len; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; + afr_fix_open (fd, this); - ret = afr_local_init (local, priv, &op_errno); + ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); if (ret < 0) { - goto out; + op_errno = -ret; + goto out; } - local->cont.zerofill.offset = offset; - local->cont.zerofill.len = len; - - local->fd = fd_ref (fd); - - afr_open_fd_fix (fd, this); - afr_do_zerofill(frame, this); - - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) { - AFR_STACK_DESTROY (transaction_frame); - } - AFR_STACK_UNWIND (zerofill, frame, -1, op_errno, NULL, - NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (zerofill, frame, -1, op_errno, NULL, NULL, NULL); return 0; } diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c index 060d78f3505..a2a758f35af 100644 --- a/xlators/cluster/afr/src/afr-lk-common.c +++ b/xlators/cluster/afr/src/afr-lk-common.c @@ -580,22 +580,6 @@ initialize_inodelk_variables (call_frame_t *frame, xlator_t *this) return 0; } -loc_t * -lower_path (loc_t *l1, const char *b1, loc_t *l2, const char *b2) -{ - int ret = 0; - - ret = uuid_compare (l1->inode->gfid, l2->inode->gfid); - - if (ret == 0) - ret = strcmp (b1, b2); - - if (ret <= 0) - return l1; - else - return l2; -} - int afr_lockee_locked_nodes_count (afr_internal_lock_t *int_lock) { @@ -1213,8 +1197,7 @@ afr_blocking_lock (call_frame_t *frame, xlator_t *this) case AFR_ENTRY_RENAME_TRANSACTION: case AFR_ENTRY_TRANSACTION: - up_count = afr_up_children_count (local->child_up, - priv->child_count); + up_count = AFR_COUNT (local->child_up, priv->child_count); int_lock->lk_call_count = int_lock->lk_expected_count = (int_lock->lockee_count * up_count); @@ -1648,496 +1631,6 @@ afr_unlock (call_frame_t *frame, xlator_t *this) } int -afr_mark_locked_nodes (xlator_t *this, fd_t *fd, - unsigned char *locked_nodes) -{ - afr_private_t *priv = NULL; - afr_fd_ctx_t *fdctx = NULL; - uint64_t tmp = 0; - int ret = 0; - - priv = this->private; - - ret = afr_fd_ctx_set (this, fd); - if (ret) - goto out; - - ret = fd_ctx_get (fd, this, &tmp); - if (ret) { - gf_log (this->name, GF_LOG_INFO, - "failed to get the fd ctx"); - goto out; - } - fdctx = (afr_fd_ctx_t *) (long) tmp; - - GF_ASSERT (fdctx->locked_on); - - memcpy (fdctx->locked_on, locked_nodes, - priv->child_count); - -out: - return ret; -} - -static int -__is_fd_saved (xlator_t *this, fd_t *fd) -{ - afr_locked_fd_t *locked_fd = NULL; - afr_private_t *priv = NULL; - int found = 0; - - priv = this->private; - - list_for_each_entry (locked_fd, &priv->saved_fds, list) { - if (locked_fd->fd == fd) { - found = 1; - break; - } - } - - return found; -} - -static int -__afr_save_locked_fd (xlator_t *this, fd_t *fd) -{ - afr_private_t *priv = NULL; - afr_locked_fd_t *locked_fd = NULL; - int ret = 0; - - priv = this->private; - - locked_fd = GF_CALLOC (1, sizeof (*locked_fd), - gf_afr_mt_locked_fd); - if (!locked_fd) { - ret = -1; - goto out; - } - - locked_fd->fd = fd; - INIT_LIST_HEAD (&locked_fd->list); - - list_add_tail (&locked_fd->list, &priv->saved_fds); - -out: - return ret; -} - -int -afr_save_locked_fd (xlator_t *this, fd_t *fd) -{ - afr_private_t *priv = NULL; - int ret = 0; - - priv = this->private; - - pthread_mutex_lock (&priv->mutex); - { - if (__is_fd_saved (this, fd)) { - gf_log (this->name, GF_LOG_DEBUG, - "fd=%p already saved", fd); - goto unlock; - } - - ret = __afr_save_locked_fd (this, fd); - if (ret) { - gf_log (this->name, GF_LOG_INFO, - "fd=%p could not be saved", fd); - goto unlock; - } - } -unlock: - pthread_mutex_unlock (&priv->mutex); - - return ret; -} - -static int -afr_lock_recovery_cleanup (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_locked_fd_t *locked_fd = NULL; - - local = frame->local; - - locked_fd = local->locked_fd; - - STACK_DESTROY (frame->root); - afr_local_cleanup (local, this); - - afr_save_locked_fd (this, locked_fd->fd); - - return 0; - -} - -static int -afr_get_source_lock_recovery (xlator_t *this, fd_t *fd) -{ - afr_fd_ctx_t *fdctx = NULL; - afr_private_t *priv = NULL; - uint64_t tmp = 0; - int i = 0; - int source_child = -1; - int ret = 0; - - priv = this->private; - - ret = fd_ctx_get (fd, this, &tmp); - if (ret) - goto out; - - fdctx = (afr_fd_ctx_t *) (long) tmp; - - for (i = 0; i < priv->child_count; i++) { - if (fdctx->locked_on[i]) { - gf_log (this->name, GF_LOG_DEBUG, - "Found lock recovery source=%d", i); - source_child = i; - break; - } - } - -out: - return source_child; - -} - -int32_t -afr_get_locks_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct gf_flock *lock, - dict_t *xdata); -int32_t -afr_recover_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct gf_flock *lock, - dict_t *xdata) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int32_t source_child = 0; - struct gf_flock flock = {0,}; - - local = frame->local; - priv = this->private; - - if (op_ret) { - gf_log (this->name, GF_LOG_INFO, - "lock recovery failed"); - goto cleanup; - } - - source_child = local->source_child; - - memcpy (&flock, lock, sizeof (*lock)); - - STACK_WIND_COOKIE (frame, afr_get_locks_fd_cbk, - (void *) (long) source_child, - priv->children[source_child], - priv->children[source_child]->fops->lk, - local->fd, F_GETLK_FD, &flock, NULL); - - return 0; - -cleanup: - afr_lock_recovery_cleanup (frame, this); - return 0; -} - -int -afr_recover_lock (call_frame_t *frame, xlator_t *this, - struct gf_flock *flock) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int32_t lock_recovery_child = 0; - - priv = this->private; - local = frame->local; - - lock_recovery_child = local->lock_recovery_child; - - frame->root->lk_owner = flock->l_owner; - - STACK_WIND_COOKIE (frame, afr_recover_lock_cbk, - (void *) (long) lock_recovery_child, - priv->children[lock_recovery_child], - priv->children[lock_recovery_child]->fops->lk, - local->fd, F_SETLK, flock, NULL); - - return 0; -} - -static int -is_afr_lock_eol (struct gf_flock *lock) -{ - int ret = 0; - - if ((lock->l_type == GF_LK_EOL)) - ret = 1; - - return ret; -} - -int32_t -afr_get_locks_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct gf_flock *lock, - dict_t *xdata) -{ - if (op_ret) { - gf_log (this->name, GF_LOG_INFO, - "Failed to get locks on fd"); - goto cleanup; - } - - gf_log (this->name, GF_LOG_DEBUG, - "Got a lock on fd"); - - if (is_afr_lock_eol (lock)) { - gf_log (this->name, GF_LOG_INFO, - "Reached EOL on locks on fd"); - goto cleanup; - } - - afr_recover_lock (frame, this, lock); - - return 0; - -cleanup: - afr_lock_recovery_cleanup (frame, this); - - return 0; -} - -static int -afr_lock_recovery (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - fd_t *fd = NULL; - int ret = 0; - int32_t source_child = 0; - struct gf_flock flock = {0,}; - - priv = this->private; - local = frame->local; - - fd = local->fd; - - source_child = afr_get_source_lock_recovery (this, fd); - if (source_child < 0) { - gf_log (this->name, GF_LOG_ERROR, - "Could not recover locks due to lock " - "split brain"); - ret = -1; - goto out; - } - - local->source_child = source_child; - - /* the flock can be zero filled as we're querying incrementally - the locks held on the fd. - */ - STACK_WIND_COOKIE (frame, afr_get_locks_fd_cbk, - (void *) (long) source_child, - priv->children[source_child], - priv->children[source_child]->fops->lk, - local->fd, F_GETLK_FD, &flock, NULL); - -out: - return ret; -} - - -static int -afr_mark_fd_opened (xlator_t *this, fd_t *fd, int32_t child_index) -{ - afr_fd_ctx_t *fdctx = NULL; - uint64_t tmp = 0; - int ret = 0; - - ret = fd_ctx_get (fd, this, &tmp); - if (ret) - goto out; - - fdctx = (afr_fd_ctx_t *) (long) tmp; - - fdctx->opened_on[child_index] = AFR_FD_OPENED; - -out: - return ret; -} - -int32_t -afr_lock_recovery_preopen_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd, - dict_t *xdata) -{ - int32_t child_index = (long )cookie; - int ret = 0; - - if (op_ret) { - gf_log (this->name, GF_LOG_INFO, - "Reopen during lock-recovery failed"); - goto cleanup; - } - - gf_log (this->name, GF_LOG_DEBUG, - "Open succeeded => proceed to recover locks"); - - ret = afr_lock_recovery (frame, this); - if (ret) { - gf_log (this->name, GF_LOG_INFO, - "Lock recovery failed"); - goto cleanup; - } - - ret = afr_mark_fd_opened (this, fd, child_index); - if (ret) { - gf_log (this->name, GF_LOG_INFO, - "Marking fd open failed"); - goto cleanup; - } - - return 0; - -cleanup: - afr_lock_recovery_cleanup (frame, this); - return 0; -} - -static int -afr_lock_recovery_preopen (call_frame_t *frame, xlator_t *this) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - uint64_t tmp = 0; - afr_fd_ctx_t *fdctx = NULL; - loc_t loc = {0,}; - int32_t child_index = 0; - int ret = 0; - - priv = this->private; - local = frame->local; - - GF_ASSERT (local && local->fd); - - ret = fd_ctx_get (local->fd, this, &tmp); - if (ret) - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to get the context of fd", - uuid_utoa (local->fd->inode->gfid)); - fdctx = (afr_fd_ctx_t *) (long) tmp; - /* TODO: instead we should return from the function */ - GF_ASSERT (fdctx); - - child_index = local->lock_recovery_child; - - inode_path (local->fd->inode, NULL, (char **)&loc.path); - loc.name = strrchr (loc.path, '/'); - loc.inode = inode_ref (local->fd->inode); - loc.parent = inode_parent (local->fd->inode, 0, NULL); - - - STACK_WIND_COOKIE (frame, afr_lock_recovery_preopen_cbk, - (void *)(long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->open, - &loc, fdctx->flags, local->fd, NULL); - - return 0; -} - -static int -is_fd_opened (fd_t *fd, int32_t child_index) -{ - afr_fd_ctx_t *fdctx = NULL; - uint64_t tmp = 0; - int ret = 0; - - ret = fd_ctx_get (fd, THIS, &tmp); - if (ret) - goto out; - - fdctx = (afr_fd_ctx_t *) (long) tmp; - - if (fdctx->opened_on[child_index] == AFR_FD_OPENED) - ret = 1; - -out: - return ret; -} - -int -afr_attempt_lock_recovery (xlator_t *this, int32_t child_index) -{ - call_frame_t *frame = NULL; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_locked_fd_t *locked_fd = NULL; - afr_locked_fd_t *tmp = NULL; - int ret = -1; - struct list_head locks_list = {0,}; - int32_t op_errno = 0; - - - priv = this->private; - - if (list_empty (&priv->saved_fds)) - goto out; - - frame = create_frame (this, this->ctx->pool); - if (!frame) { - ret = -1; - goto out; - } - - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) { - ret = -1; - goto out; - } - - frame->local = local; - - INIT_LIST_HEAD (&locks_list); - - pthread_mutex_lock (&priv->mutex); - { - list_splice_init (&priv->saved_fds, &locks_list); - } - pthread_mutex_unlock (&priv->mutex); - - list_for_each_entry_safe (locked_fd, tmp, - &locks_list, list) { - - list_del_init (&locked_fd->list); - - local->fd = fd_ref (locked_fd->fd); - local->lock_recovery_child = child_index; - local->locked_fd = locked_fd; - - if (!is_fd_opened (locked_fd->fd, child_index)) { - gf_log (this->name, GF_LOG_DEBUG, - "attempting open before lock " - "recovery"); - afr_lock_recovery_preopen (frame, this); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "attempting lock recovery " - "without a preopen"); - afr_lock_recovery (frame, this); - } - } - -out: - if ((ret < 0) && frame) - AFR_STACK_DESTROY (frame); - return ret; -} - -int afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom, unsigned int child_count) { diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h index 73594f26526..05df90cc0ee 100644 --- a/xlators/cluster/afr/src/afr-mem-types.h +++ b/xlators/cluster/afr/src/afr-mem-types.h @@ -41,10 +41,8 @@ enum gf_afr_mem_types_ { gf_afr_mt_shd_event_t, gf_afr_mt_time_t, gf_afr_mt_pos_data_t, - gf_afr_mt_reply_t, - gf_afr_mt_stats_t, - gf_afr_mt_shd_crawl_event_t, - gf_afr_mt_uint64_t, + gf_afr_mt_reply_t, + gf_afr_mt_subvol_healer_t, gf_afr_mt_end }; #endif diff --git a/xlators/cluster/afr/src/afr-open.c b/xlators/cluster/afr/src/afr-open.c index 643a5d692df..f86aa7fd80d 100644 --- a/xlators/cluster/afr/src/afr-open.c +++ b/xlators/cluster/afr/src/afr-open.c @@ -43,85 +43,29 @@ #include "afr-dir-read.h" #include "afr-dir-write.h" #include "afr-transaction.h" -#include "afr-self-heal.h" -#include "afr-self-heal-common.h" -int -afr_stale_child_up (afr_local_t *local, xlator_t *this) -{ - int i = 0; - afr_private_t *priv = NULL; - int up = -1; - - priv = this->private; - - if (!local->fresh_children) - local->fresh_children = afr_children_create (priv->child_count); - if (!local->fresh_children) - goto out; - - afr_inode_get_read_ctx (this, local->fd->inode, local->fresh_children); - if (priv->child_count == afr_get_children_count (local->fresh_children, - priv->child_count)) - goto out; - for (i = 0; i < priv->child_count; i++) { - if (!local->child_up[i]) - continue; - if (afr_is_child_present (local->fresh_children, - priv->child_count, i)) - continue; - up = i; - break; - } -out: - return up; -} - -void -afr_perform_data_self_heal (call_frame_t *frame, xlator_t *this) +gf_boolean_t +afr_is_fd_fixable (fd_t *fd) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - inode_t *inode = NULL; - int st_child = -1; - char reason[64] = {0}; - - local = frame->local; - sh = &local->self_heal; - inode = local->fd->inode; - - if (!IA_ISREG (inode->ia_type)) - goto out; - - st_child = afr_stale_child_up (local, this); - if (st_child < 0) - goto out; - - sh->do_data_self_heal = _gf_true; - sh->do_metadata_self_heal = _gf_true; - sh->do_gfid_self_heal = _gf_true; - sh->do_missing_entry_self_heal = _gf_true; - - snprintf (reason, sizeof (reason), "stale subvolume %d detected", - st_child); - afr_launch_self_heal (frame, this, inode, _gf_true, inode->ia_type, - reason, NULL, NULL); -out: - return; + if (!fd || !fd->inode) + return _gf_false; + else if (fd_is_anonymous (fd)) + return _gf_false; + else if (uuid_is_null (fd->inode->gfid)) + return _gf_false; + + return _gf_true; } + int afr_open_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) { afr_local_t * local = frame->local; - afr_private_t *priv = NULL; - priv = this->private; - if (afr_open_only_data_self_heal (priv->data_self_heal)) - afr_perform_data_self_heal (frame, this); AFR_STACK_UNWIND (open, frame, local->op_ret, local->op_errno, local->fd, xdata); return 0; @@ -134,49 +78,38 @@ afr_open_cbk (call_frame_t *frame, void *cookie, fd_t *fd, dict_t *xdata) { afr_local_t * local = NULL; - int ret = 0; int call_count = -1; int child_index = (long) cookie; - afr_private_t *priv = NULL; + afr_fd_ctx_t *fd_ctx = NULL; - priv = this->private; local = frame->local; + fd_ctx = local->fd_ctx; LOCK (&frame->lock); { if (op_ret == -1) { local->op_errno = op_errno; - } - - if (op_ret >= 0) { + fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED; + } else { local->op_ret = op_ret; - local->success_count++; - - ret = afr_child_fd_ctx_set (this, fd, child_index, - local->cont.open.flags); - if (ret) { - local->op_ret = -1; - local->op_errno = -ret; - goto unlock; - } + fd_ctx->opened_on[child_index] = AFR_FD_OPENED; + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref (xdata); } } -unlock: UNLOCK (&frame->lock); call_count = afr_frame_return (frame); if (call_count == 0) { - if ((local->cont.open.flags & O_TRUNC) - && (local->op_ret >= 0)) { + if ((fd_ctx->flags & O_TRUNC) && (local->op_ret >= 0)) { STACK_WIND (frame, afr_open_ftruncate_cbk, this, this->fops->ftruncate, fd, 0, NULL); } else { - if (afr_open_only_data_self_heal (priv->data_self_heal)) - afr_perform_data_self_heal (frame, this); AFR_STACK_UNWIND (open, frame, local->op_ret, - local->op_errno, local->fd, xdata); + local->op_errno, local->fd, + local->xdata_rsp); } } @@ -190,16 +123,11 @@ afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, afr_private_t * priv = NULL; afr_local_t * local = NULL; int i = 0; - int ret = -1; int32_t call_count = 0; int32_t op_errno = 0; - int32_t wind_flags = flags & (~O_TRUNC); - //We can't let truncation to happen outside transaction. + afr_fd_ctx_t *fd_ctx = NULL; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (loc, out); + //We can't let truncation to happen outside transaction. priv = this->private; @@ -207,44 +135,38 @@ afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, QUORUM_CHECK(open,out); } - if (afr_is_split_brain (this, loc->inode)) { - /* self-heal failed */ - gf_log (this->name, GF_LOG_WARNING, - "failed to open as split brain seen, returning EIO"); - op_errno = EIO; - goto out; - } - - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) { + op_errno = ENOMEM; + goto out; + } - call_count = local->call_count; - loc_copy (&local->loc, loc); + local->fd = fd_ref (fd); + local->fd_ctx = fd_ctx; + fd_ctx->flags = flags; - local->cont.open.flags = flags; + call_count = local->call_count; - local->fd = fd_ref (fd); + local->cont.open.flags = flags; for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND_COOKIE (frame, afr_open_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->open, - loc, wind_flags, fd, xdata); - + loc, (flags & ~O_TRUNC), fd, xdata); if (!--call_count) break; } } - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (open, frame, -1, op_errno, fd, xdata); + AFR_STACK_UNWIND (open, frame, -1, op_errno, fd, NULL); return 0; } @@ -273,12 +195,7 @@ afr_openfd_fix_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, priv->children[child_index]->name); } - fd_ctx = afr_fd_ctx_get (local->fd, this); - if (!fd_ctx) { - gf_log (this->name, GF_LOG_WARNING, - "failed to get fd context, %p", local->fd); - goto out; - } + fd_ctx = local->fd_ctx; LOCK (&local->fd->lock); { @@ -289,7 +206,7 @@ afr_openfd_fix_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } } UNLOCK (&local->fd->lock); -out: + call_count = afr_frame_return (frame); if (call_count == 0) AFR_STACK_DESTROY (frame); @@ -297,8 +214,42 @@ out: return 0; } + +static int +afr_fd_ctx_need_open (fd_t *fd, xlator_t *this, unsigned char *need_open) +{ + afr_fd_ctx_t *fd_ctx = NULL; + afr_private_t *priv = NULL; + int i = 0; + int count = 0; + + priv = this->private; + + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) + return 0; + + LOCK (&fd->lock); + { + for (i = 0; i < priv->child_count; i++) { + if (fd_ctx->opened_on[i] == AFR_FD_NOT_OPENED && + priv->child_up[i]) { + fd_ctx->opened_on[i] = AFR_FD_OPENING; + need_open[i] = 1; + count++; + } else { + need_open[i] = 0; + } + } + } + UNLOCK (&fd->lock); + + return count; +} + + void -afr_fix_open (xlator_t *this, fd_t *fd, size_t need_open_count, int *need_open) +afr_fix_open (fd_t *fd, xlator_t *this) { afr_private_t *priv = NULL; int i = 0; @@ -307,29 +258,31 @@ afr_fix_open (xlator_t *this, fd_t *fd, size_t need_open_count, int *need_open) int ret = -1; int32_t op_errno = 0; afr_fd_ctx_t *fd_ctx = NULL; + unsigned char *need_open = NULL; + int call_count = 0; priv = this->private; - if (!afr_is_fd_fixable (fd) || !need_open || !need_open_count) + if (!afr_is_fd_fixable (fd)) goto out; fd_ctx = afr_fd_ctx_get (fd, this); - if (!fd_ctx) { - ret = -1; + if (!fd_ctx) goto out; - } + + need_open = alloca0 (priv->child_count); + + call_count = afr_fd_ctx_need_open (fd, this, need_open); + if (!call_count) + goto out; frame = create_frame (this, this->ctx->pool); - if (!frame) { - ret = -1; + if (!frame) goto out; - } - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; local->loc.inode = inode_ref (fd->inode); ret = loc_path (&local->loc, NULL); @@ -337,10 +290,12 @@ afr_fix_open (xlator_t *this, fd_t *fd, size_t need_open_count, int *need_open) goto out; local->fd = fd_ref (fd); - local->call_count = need_open_count; + local->fd_ctx = fd_ctx; + + local->call_count = call_count; - gf_log (this->name, GF_LOG_DEBUG, "need open count: %zd", - need_open_count); + gf_log (this->name, GF_LOG_DEBUG, "need open count: %d", + call_count); for (i = 0; i < priv->child_count; i++) { if (!need_open[i]) @@ -371,12 +326,12 @@ afr_fix_open (xlator_t *this, fd_t *fd, size_t need_open_count, int *need_open) local->fd, NULL); } + if (!--call_count) + break; } - op_errno = 0; - ret = 0; + + return; out: - if (op_errno) - ret = -1; //For handling ALLOC_OR_GOTO - if (ret && frame) + if (frame) AFR_STACK_DESTROY (frame); } diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c new file mode 100644 index 00000000000..186f68c3359 --- /dev/null +++ b/xlators/cluster/afr/src/afr-read-txn.c @@ -0,0 +1,239 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include "afr.h" +#include "afr-transaction.h" + +int +afr_read_txn_next_subvol (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + int subvol = -1; + + local = frame->local; + priv = this->private; + + + for (i = 0; i < priv->child_count; i++) { + if (!local->readable[i]) { + /* don't even bother trying here. + just mark as attempted and move on. */ + local->read_attempted[i] = 1; + continue; + } + + if (!local->read_attempted[i]) { + subvol = i; + break; + } + } + + /* If no more subvols were available for reading, we leave + @subvol as -1, which is an indication we have run out of + readable subvols. */ + if (subvol != -1) + local->read_attempted[subvol] = 1; + local->readfn (frame, this, subvol); + + return 0; +} + + +int +afr_read_txn_refresh_done (call_frame_t *frame, xlator_t *this, int err) +{ + afr_local_t *local = NULL; + int read_subvol = 0; + int event_generation = 0; + inode_t *inode = NULL; + int ret = -1; + + local = frame->local; + inode = local->inode; + + if (err) { + local->op_errno = -err; + local->op_ret = -1; + read_subvol = -1; + goto readfn; + } + + ret = afr_inode_read_subvol_type_get (inode, this, local->readable, + &event_generation, + local->transaction.type); + + if (ret == -1 || !event_generation) { + /* Even after refresh, we don't have a good + read subvolume. Time to bail */ + local->op_ret = -1; + local->op_errno = EIO; + read_subvol = -1; + goto readfn; + } + + read_subvol = afr_read_subvol_select_by_policy (inode, this, + local->readable); + + if (read_subvol == -1) { + local->op_ret = -1; + local->op_errno = EIO; + goto readfn; + } + + if (local->read_attempted[read_subvol]) { + afr_read_txn_next_subvol (frame, this); + return 0; + } + + local->read_attempted[read_subvol] = 1; +readfn: + local->readfn (frame, this, read_subvol); + + return 0; +} + + +int +afr_read_txn_continue (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + + local = frame->local; + + if (!local->refreshed) { + local->refreshed = _gf_true; + afr_inode_refresh (frame, this, local->inode, + afr_read_txn_refresh_done); + } else { + afr_read_txn_next_subvol (frame, this); + } + + return 0; +} + + +/* afr_read_txn_wipe: + + clean internal variables in @local in order to make + it possible to call afr_read_txn() multiple times from + the same frame +*/ + +void +afr_read_txn_wipe (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + + local = frame->local; + priv = this->private; + + local->readfn = NULL; + + if (local->inode) + inode_unref (local->inode); + + for (i = 0; i < priv->child_count; i++) { + local->read_attempted[i] = 0; + local->readable[i] = 0; + } +} + + +/* + afr_read_txn: + + This is the read transaction function. The way it works: + + - Determine read-subvolume from inode ctx. + + - If read-subvolume's generation was stale, refresh ctx once by + calling afr_inode_refresh() + + Else make an attempt to read on read-subvolume. + + - If attempted read on read-subvolume fails, refresh ctx once + by calling afr_inode_refresh() + + - After ctx refresh, query read-subvolume freshly and attempt + read once. + + - If read fails, try every other readable[] subvolume before + finally giving up. readable[] elements are set by afr_inode_refresh() + based on dirty and pending flags. + + - If file is in split brain in the backend, generation will be + kept 0 by afr_inode_refresh() and readable[] will be set 0 for + all elements. Therefore reads always fail. +*/ + +int +afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode, + afr_read_txn_wind_t readfn, afr_transaction_type type) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int read_subvol = -1; + int event_generation = 0; + int ret = -1; + + priv = this->private; + local = frame->local; + + afr_read_txn_wipe (frame, this); + + local->readfn = readfn; + local->inode = inode_ref (inode); + + local->transaction.type = type; + ret = afr_inode_read_subvol_type_get (inode, this, local->readable, + &event_generation, type); + if (ret == -1) + /* very first transaction on this inode */ + goto refresh; + + if (local->event_generation != event_generation) + /* servers have disconnected / reconnected, and possibly + rebooted, very likely changing the state of freshness + of copies */ + goto refresh; + + read_subvol = afr_read_subvol_select_by_policy (inode, this, + local->readable); + + if (read_subvol < 0 || read_subvol > priv->child_count) { + gf_log (this->name, GF_LOG_WARNING, "Unreadable subvolume %d " + "found with event generation %d", read_subvol, + event_generation); + goto refresh; + } + + if (!local->child_up[read_subvol]) { + /* should never happen, just in case */ + gf_log (this->name, GF_LOG_WARNING, "subvolume %d is the " + "read subvolume in this generation, but is not up", + read_subvol); + goto refresh; + } + + local->read_attempted[read_subvol] = 1; + + local->readfn (frame, this, read_subvol); + + return 0; + +refresh: + afr_inode_refresh (frame, this, inode, afr_read_txn_refresh_done); + + return 0; +} diff --git a/xlators/cluster/afr/src/afr-self-heal-algorithm.c b/xlators/cluster/afr/src/afr-self-heal-algorithm.c deleted file mode 100644 index 83846f152d2..00000000000 --- a/xlators/cluster/afr/src/afr-self-heal-algorithm.c +++ /dev/null @@ -1,837 +0,0 @@ -/* - Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> - This file is part of GlusterFS. - - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. -*/ - - -#include <openssl/md5.h> -#include "glusterfs.h" -#include "afr.h" -#include "xlator.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" -#include "byte-order.h" - -#include "afr-transaction.h" -#include "afr-self-heal.h" -#include "afr-self-heal-common.h" -#include "afr-self-heal-algorithm.h" - -/* - This file contains the various self-heal algorithms -*/ - -static int -sh_loop_driver (call_frame_t *sh_frame, xlator_t *this, - gf_boolean_t is_first_call, call_frame_t *old_loop_frame); -static int -sh_loop_return (call_frame_t *sh_frame, xlator_t *this, call_frame_t *loop_frame, - int32_t op_ret, int32_t op_errno); -static int -sh_destroy_frame (call_frame_t *frame, xlator_t *this) -{ - if (!frame) - goto out; - - AFR_STACK_DESTROY (frame); -out: - return 0; -} - -static void -sh_private_cleanup (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_sh_algo_private_t *sh_priv = NULL; - - local = frame->local; - sh = &local->self_heal; - - sh_priv = sh->private; - GF_FREE (sh_priv); -} - -static int -sh_number_of_writes_needed (unsigned char *write_needed, int child_count) -{ - int writes = 0; - int i = 0; - - for (i = 0; i < child_count; i++) { - if (write_needed[i]) - writes++; - } - - return writes; -} - - -static int -sh_loop_driver_done (call_frame_t *sh_frame, xlator_t *this, - call_frame_t *last_loop_frame) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_sh_algo_private_t *sh_priv = NULL; - int32_t total_blocks = 0; - int32_t diff_blocks = 0; - - local = sh_frame->local; - sh = &local->self_heal; - sh_priv = sh->private; - if (sh_priv) { - total_blocks = sh_priv->total_blocks; - diff_blocks = sh_priv->diff_blocks; - } - - sh_private_cleanup (sh_frame, this); - if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { - GF_ASSERT (!last_loop_frame); - //loop_finish should have happened and the old_loop should be NULL - gf_log (this->name, GF_LOG_DEBUG, - "self-heal aborting on %s", - local->loc.path); - - local->self_heal.algo_abort_cbk (sh_frame, this); - } else { - GF_ASSERT (last_loop_frame); - if (diff_blocks == total_blocks) { - gf_log (this->name, GF_LOG_DEBUG, "full self-heal " - "completed on %s",local->loc.path); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "diff self-heal on %s: completed. " - "(%d blocks of %d were different (%.2f%%))", - local->loc.path, diff_blocks, total_blocks, - ((diff_blocks * 1.0)/total_blocks) * 100); - } - - sh->old_loop_frame = last_loop_frame; - local->self_heal.algo_completion_cbk (sh_frame, this); - } - - return 0; -} - -int -sh_loop_finish (call_frame_t *loop_frame, xlator_t *this) -{ - afr_local_t *loop_local = NULL; - afr_self_heal_t *loop_sh = NULL; - - if (!loop_frame) - goto out; - - loop_local = loop_frame->local; - if (loop_local) { - loop_sh = &loop_local->self_heal; - } - - if (loop_sh && loop_sh->data_lock_held) { - afr_sh_data_unlock (loop_frame, this, this->name, - sh_destroy_frame); - } else { - sh_destroy_frame (loop_frame, this); - } -out: - return 0; -} - -static int -sh_loop_lock_success (call_frame_t *loop_frame, xlator_t *this) -{ - afr_local_t *loop_local = NULL; - afr_self_heal_t *loop_sh = NULL; - - loop_local = loop_frame->local; - loop_sh = &loop_local->self_heal; - - sh_loop_finish (loop_sh->old_loop_frame, this); - loop_sh->old_loop_frame = NULL; - - gf_log (this->name, GF_LOG_DEBUG, "Acquired lock for range %"PRIu64 - " %"PRIu64, loop_sh->offset, loop_sh->block_size); - loop_sh->data_lock_held = _gf_true; - loop_sh->sh_data_algo_start (loop_frame, this); - return 0; -} - -static int -sh_loop_lock_failure (call_frame_t *loop_frame, xlator_t *this) -{ - call_frame_t *sh_frame = NULL; - afr_local_t *loop_local = NULL; - afr_self_heal_t *loop_sh = NULL; - - loop_local = loop_frame->local; - loop_sh = &loop_local->self_heal; - sh_frame = loop_sh->sh_frame; - - gf_log (this->name, GF_LOG_ERROR, "failed lock for range %"PRIu64 - " %"PRIu64, loop_sh->offset, loop_sh->block_size); - sh_loop_finish (loop_sh->old_loop_frame, this); - loop_sh->old_loop_frame = NULL; - sh_loop_return (sh_frame, this, loop_frame, -1, ENOTCONN); - return 0; -} - -static int -sh_loop_frame_create (call_frame_t *sh_frame, xlator_t *this, - call_frame_t *old_loop_frame, call_frame_t **loop_frame) -{ - call_frame_t *new_loop_frame = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_local_t *new_loop_local = NULL; - afr_self_heal_t *new_loop_sh = NULL; - afr_private_t *priv = NULL; - - GF_ASSERT (sh_frame); - GF_ASSERT (loop_frame); - - *loop_frame = NULL; - local = sh_frame->local; - sh = &local->self_heal; - priv = this->private; - - new_loop_frame = copy_frame (sh_frame); - if (!new_loop_frame) - goto out; - //We want the frame to have same lk_owner as sh_frame - //so that locks translator allows conflicting locks - new_loop_local = afr_self_heal_local_init (local, this); - if (!new_loop_local) - goto out; - new_loop_frame->local = new_loop_local; - - new_loop_sh = &new_loop_local->self_heal; - new_loop_sh->sources = memdup (sh->sources, - priv->child_count * sizeof (*sh->sources)); - if (!new_loop_sh->sources) - goto out; - new_loop_sh->write_needed = GF_CALLOC (priv->child_count, - sizeof (*new_loop_sh->write_needed), - gf_afr_mt_char); - if (!new_loop_sh->write_needed) - goto out; - new_loop_sh->checksum = GF_CALLOC (priv->child_count, MD5_DIGEST_LENGTH, - gf_afr_mt_uint8_t); - if (!new_loop_sh->checksum) - goto out; - new_loop_sh->inode = inode_ref (sh->inode); - new_loop_sh->sh_data_algo_start = sh->sh_data_algo_start; - new_loop_sh->source = sh->source; - new_loop_sh->active_sinks = sh->active_sinks; - new_loop_sh->healing_fd = fd_ref (sh->healing_fd); - new_loop_sh->file_has_holes = sh->file_has_holes; - new_loop_sh->old_loop_frame = old_loop_frame; - new_loop_sh->sh_frame = sh_frame; - *loop_frame = new_loop_frame; - return 0; -out: - sh_destroy_frame (new_loop_frame, this); - return -ENOMEM; -} - -static int -sh_loop_start (call_frame_t *sh_frame, xlator_t *this, off_t offset, - call_frame_t *old_loop_frame) -{ - call_frame_t *new_loop_frame = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_local_t *new_loop_local = NULL; - afr_self_heal_t *new_loop_sh = NULL; - int ret = 0; - - GF_ASSERT (sh_frame); - - local = sh_frame->local; - sh = &local->self_heal; - - ret = sh_loop_frame_create (sh_frame, this, old_loop_frame, - &new_loop_frame); - if (ret) - goto out; - new_loop_local = new_loop_frame->local; - new_loop_sh = &new_loop_local->self_heal; - new_loop_sh->offset = offset; - new_loop_sh->block_size = sh->block_size; - afr_sh_data_lock (new_loop_frame, this, offset, new_loop_sh->block_size, - _gf_true, this->name, sh_loop_lock_success, sh_loop_lock_failure); - return 0; -out: - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - if (old_loop_frame) - sh_loop_finish (old_loop_frame, this); - sh_loop_return (sh_frame, this, new_loop_frame, -1, ENOMEM); - return 0; -} - -static int -sh_loop_driver (call_frame_t *sh_frame, xlator_t *this, - gf_boolean_t is_first_call, call_frame_t *old_loop_frame) -{ - afr_local_t * local = NULL; - afr_self_heal_t * sh = NULL; - afr_sh_algo_private_t *sh_priv = NULL; - gf_boolean_t is_driver_done = _gf_false; - blksize_t block_size = 0; - int loop = 0; - off_t offset = 0; - afr_private_t *priv = NULL; - - priv = this->private; - local = sh_frame->local; - sh = &local->self_heal; - sh_priv = sh->private; - - LOCK (&sh_priv->lock); - { - if (!is_first_call) - sh_priv->loops_running--; - offset = sh_priv->offset; - block_size = sh->block_size; - while ((!sh->eof_reached) && - (!is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) && - (sh_priv->loops_running < priv->data_self_heal_window_size) - && (sh_priv->offset < sh->file_size)) { - - loop++; - sh_priv->offset += block_size; - sh_priv->loops_running++; - - if (!is_first_call) - break; - } - if (0 == sh_priv->loops_running) { - is_driver_done = _gf_true; - } - } - UNLOCK (&sh_priv->lock); - - if (0 == loop) { - //loop finish does unlock, but the erasing of the pending - //xattrs needs to happen before that so do not finish the loop - if (is_driver_done && - !is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) - goto driver_done; - if (old_loop_frame) { - sh_loop_finish (old_loop_frame, this); - old_loop_frame = NULL; - } - } - - //If we have more loops to form we should finish previous loop after - //the next loop lock - while (loop--) { - if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { - // op failed in other loop, stop spawning more loops - if (old_loop_frame) { - sh_loop_finish (old_loop_frame, this); - old_loop_frame = NULL; - } - sh_loop_driver (sh_frame, this, _gf_false, NULL); - } else { - gf_log (this->name, GF_LOG_TRACE, "spawning a loop " - "for offset %"PRId64, offset); - - sh_loop_start (sh_frame, this, offset, old_loop_frame); - old_loop_frame = NULL; - offset += block_size; - } - } - -driver_done: - if (is_driver_done) { - sh_loop_driver_done (sh_frame, this, old_loop_frame); - } - return 0; -} - -static int -sh_loop_return (call_frame_t *sh_frame, xlator_t *this, call_frame_t *loop_frame, - int32_t op_ret, int32_t op_errno) -{ - afr_local_t * loop_local = NULL; - afr_self_heal_t * loop_sh = NULL; - afr_local_t * sh_local = NULL; - afr_self_heal_t *sh = NULL; - - sh_local = sh_frame->local; - sh = &sh_local->self_heal; - - if (loop_frame) { - loop_local = loop_frame->local; - if (loop_local) - loop_sh = &loop_local->self_heal; - if (loop_sh) - gf_log (this->name, GF_LOG_TRACE, "loop for offset " - "%"PRId64" returned", loop_sh->offset); - } - - if (op_ret == -1) { - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - afr_sh_set_error (sh, op_errno); - if (loop_frame) { - sh_loop_finish (loop_frame, this); - loop_frame = NULL; - } - } - - sh_loop_driver (sh_frame, this, _gf_false, loop_frame); - - return 0; -} - -static int -sh_loop_write_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf, - struct iatt *postbuf, dict_t *xdata) -{ - afr_private_t * priv = NULL; - afr_local_t * loop_local = NULL; - afr_self_heal_t * loop_sh = NULL; - call_frame_t *sh_frame = NULL; - afr_local_t * sh_local = NULL; - afr_self_heal_t *sh = NULL; - int call_count = 0; - int child_index = 0; - - priv = this->private; - loop_local = loop_frame->local; - loop_sh = &loop_local->self_heal; - - sh_frame = loop_sh->sh_frame; - sh_local = sh_frame->local; - sh = &sh_local->self_heal; - - child_index = (long) cookie; - - gf_log (this->name, GF_LOG_TRACE, - "wrote %d bytes of data from %s to child %d, offset %"PRId64"", - op_ret, sh_local->loc.path, child_index, loop_sh->offset); - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "write to %s failed on subvolume %s (%s)", - sh_local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - afr_sh_set_error (loop_sh, op_errno); - } else if (op_ret < loop_local->cont.writev.vector->iov_len) { - gf_log (this->name, GF_LOG_ERROR, - "incomplete write to %s on subvolume %s " - "(expected %lu, returned %d)", sh_local->loc.path, - priv->children[child_index]->name, - loop_local->cont.writev.vector->iov_len, op_ret); - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - } - - call_count = afr_frame_return (loop_frame); - - if (call_count == 0) { - iobref_unref(loop_local->cont.writev.iobref); - - sh_loop_return (sh_frame, this, loop_frame, - loop_sh->op_ret, loop_sh->op_errno); - } - - return 0; -} - -static void -sh_prune_writes_needed (call_frame_t *sh_frame, call_frame_t *loop_frame, - afr_private_t *priv) -{ - afr_local_t *sh_local = NULL; - afr_self_heal_t *sh = NULL; - afr_local_t *loop_local = NULL; - afr_self_heal_t *loop_sh = NULL; - int i = 0; - - sh_local = sh_frame->local; - sh = &sh_local->self_heal; - - if (!strcmp (sh->algo->name, "diff")) - return; - - loop_local = loop_frame->local; - loop_sh = &loop_local->self_heal; - - /* full self-heal guarantees there exists atleast 1 file with size 0 - * That means for other files we can preserve holes that come after - * its size before 'trim' - */ - for (i = 0; i < priv->child_count; i++) { - if (loop_sh->write_needed[i] && - ((loop_sh->offset + 1) > sh->buf[i].ia_size)) - loop_sh->write_needed[i] = 0; - } -} - -static int -sh_loop_read_cbk (call_frame_t *loop_frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iovec *vector, int32_t count, struct iatt *buf, - struct iobref *iobref, dict_t *xdata) -{ - afr_private_t * priv = NULL; - afr_local_t * loop_local = NULL; - afr_self_heal_t * loop_sh = NULL; - call_frame_t *sh_frame = NULL; - int i = 0; - int call_count = 0; - afr_local_t * sh_local = NULL; - afr_self_heal_t * sh = NULL; - - priv = this->private; - loop_local = loop_frame->local; - loop_sh = &loop_local->self_heal; - - sh_frame = loop_sh->sh_frame; - sh_local = sh_frame->local; - sh = &sh_local->self_heal; - - gf_log (this->name, GF_LOG_TRACE, - "read %d bytes of data from %s, offset %"PRId64"", - op_ret, loop_local->loc.path, loop_sh->offset); - - if (op_ret <= 0) { - if (op_ret < 0) { - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - gf_log (this->name, GF_LOG_ERROR, "read failed on %d " - "for %s reason :%s", sh->source, - sh_local->loc.path, strerror (errno)); - } else { - sh->eof_reached = _gf_true; - gf_log (this->name, GF_LOG_DEBUG, "Eof reached for %s", - sh_local->loc.path); - } - sh_loop_return (sh_frame, this, loop_frame, op_ret, op_errno); - goto out; - } - - if (loop_sh->file_has_holes && iov_0filled (vector, count) == 0) - sh_prune_writes_needed (sh_frame, loop_frame, priv); - - call_count = sh_number_of_writes_needed (loop_sh->write_needed, - priv->child_count); - if (call_count == 0) { - sh_loop_return (sh_frame, this, loop_frame, 0, 0); - goto out; - } - - loop_local->call_count = call_count; - - /* - * We only really need the request size at the moment, but the buffer - * is required if we want to issue a retry in the event of a short write. - * Therefore, we duplicate the vector and ref the iobref here... - */ - loop_local->cont.writev.vector = iov_dup(vector, count); - loop_local->cont.writev.iobref = iobref_ref(iobref); - - for (i = 0; i < priv->child_count; i++) { - if (!loop_sh->write_needed[i]) - continue; - STACK_WIND_COOKIE (loop_frame, sh_loop_write_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->writev, - loop_sh->healing_fd, vector, count, - loop_sh->offset, 0, iobref, NULL); - - if (!--call_count) - break; - } - -out: - return 0; -} - - -static int -sh_loop_read (call_frame_t *loop_frame, xlator_t *this) -{ - afr_private_t *priv = NULL; - afr_local_t *loop_local = NULL; - afr_self_heal_t *loop_sh = NULL; - - priv = this->private; - loop_local = loop_frame->local; - loop_sh = &loop_local->self_heal; - - STACK_WIND_COOKIE (loop_frame, sh_loop_read_cbk, - (void *) (long) loop_sh->source, - priv->children[loop_sh->source], - priv->children[loop_sh->source]->fops->readv, - loop_sh->healing_fd, loop_sh->block_size, - loop_sh->offset, 0, NULL); - - return 0; -} - - -static int -sh_diff_checksum_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - uint32_t weak_checksum, uint8_t *strong_checksum, - dict_t *xdata) -{ - afr_private_t *priv = NULL; - afr_local_t *loop_local = NULL; - afr_self_heal_t *loop_sh = NULL; - call_frame_t *sh_frame = NULL; - afr_local_t *sh_local = NULL; - afr_self_heal_t *sh = NULL; - afr_sh_algo_private_t *sh_priv = NULL; - int child_index = 0; - int call_count = 0; - int i = 0; - int write_needed = 0; - - priv = this->private; - - loop_local = loop_frame->local; - loop_sh = &loop_local->self_heal; - - sh_frame = loop_sh->sh_frame; - sh_local = sh_frame->local; - sh = &sh_local->self_heal; - - sh_priv = sh->private; - - child_index = (long) cookie; - - if (op_ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "checksum on %s failed on subvolume %s (%s)", - sh_local->loc.path, priv->children[child_index]->name, - strerror (op_errno)); - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - } else { - memcpy (loop_sh->checksum + child_index * MD5_DIGEST_LENGTH, - strong_checksum, MD5_DIGEST_LENGTH); - } - - call_count = afr_frame_return (loop_frame); - - if (call_count == 0) { - for (i = 0; i < priv->child_count; i++) { - if (sh->sources[i] || !sh_local->child_up[i]) - continue; - - if (memcmp (loop_sh->checksum + (i * MD5_DIGEST_LENGTH), - loop_sh->checksum + (sh->source * MD5_DIGEST_LENGTH), - MD5_DIGEST_LENGTH)) { - /* - Checksums differ, so this block - must be written to this sink - */ - - gf_log (this->name, GF_LOG_DEBUG, - "checksum on subvolume %s at offset %" - PRId64" differs from that on source", - priv->children[i]->name, loop_sh->offset); - - write_needed = loop_sh->write_needed[i] = 1; - } - } - - LOCK (&sh_priv->lock); - { - sh_priv->total_blocks++; - if (write_needed) - sh_priv->diff_blocks++; - } - UNLOCK (&sh_priv->lock); - - if (write_needed && - !is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { - sh_loop_read (loop_frame, this); - } else { - sh_loop_return (sh_frame, this, loop_frame, - op_ret, op_errno); - } - } - - return 0; -} - -static int -sh_diff_checksum (call_frame_t *loop_frame, xlator_t *this) -{ - afr_private_t *priv = NULL; - afr_local_t *loop_local = NULL; - afr_self_heal_t *loop_sh = NULL; - int call_count = 0; - int i = 0; - - priv = this->private; - loop_local = loop_frame->local; - loop_sh = &loop_local->self_heal; - - call_count = loop_sh->active_sinks + 1; /* sinks and source */ - - loop_local->call_count = call_count; - - STACK_WIND_COOKIE (loop_frame, sh_diff_checksum_cbk, - (void *) (long) loop_sh->source, - priv->children[loop_sh->source], - priv->children[loop_sh->source]->fops->rchecksum, - loop_sh->healing_fd, - loop_sh->offset, loop_sh->block_size, NULL); - - for (i = 0; i < priv->child_count; i++) { - if (loop_sh->sources[i] || !loop_local->child_up[i]) - continue; - - STACK_WIND_COOKIE (loop_frame, sh_diff_checksum_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->rchecksum, - loop_sh->healing_fd, - loop_sh->offset, loop_sh->block_size, NULL); - - if (!--call_count) - break; - } - - return 0; -} - -static int -sh_full_read_write_to_sinks (call_frame_t *loop_frame, xlator_t *this) -{ - afr_private_t *priv = NULL; - afr_local_t *loop_local = NULL; - afr_self_heal_t *loop_sh = NULL; - int i = 0; - - priv = this->private; - loop_local = loop_frame->local; - loop_sh = &loop_local->self_heal; - - for (i = 0; i < priv->child_count; i++) { - if (loop_sh->sources[i] || !loop_local->child_up[i]) - continue; - loop_sh->write_needed[i] = 1; - } - sh_loop_read (loop_frame, this); - return 0; -} - -afr_sh_algo_private_t* -afr_sh_priv_init () -{ - afr_sh_algo_private_t *sh_priv = NULL; - - sh_priv = GF_CALLOC (1, sizeof (*sh_priv), - gf_afr_mt_afr_private_t); - if (!sh_priv) - goto out; - - LOCK_INIT (&sh_priv->lock); -out: - return sh_priv; -} - -int -afr_sh_transfer_lock (call_frame_t *dst, call_frame_t *src, char *dom, - unsigned int child_count) -{ - afr_local_t *dst_local = NULL; - afr_self_heal_t *dst_sh = NULL; - afr_local_t *src_local = NULL; - afr_self_heal_t *src_sh = NULL; - int ret = -1; - - dst_local = dst->local; - dst_sh = &dst_local->self_heal; - src_local = src->local; - src_sh = &src_local->self_heal; - GF_ASSERT (src_sh->data_lock_held); - GF_ASSERT (!dst_sh->data_lock_held); - ret = afr_lk_transfer_datalock (dst, src, dom, child_count); - if (ret) - return ret; - src_sh->data_lock_held = _gf_false; - dst_sh->data_lock_held = _gf_true; - return 0; -} - -int -afr_sh_start_loops (call_frame_t *sh_frame, xlator_t *this, - afr_sh_algo_fn sh_data_algo_start) -{ - call_frame_t *first_loop_frame = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int ret = 0; - afr_private_t *priv = NULL; - - local = sh_frame->local; - sh = &local->self_heal; - priv = this->private; - - sh->sh_data_algo_start = sh_data_algo_start; - local->call_count = 0; - ret = sh_loop_frame_create (sh_frame, this, NULL, &first_loop_frame); - if (ret) - goto out; - ret = afr_sh_transfer_lock (first_loop_frame, sh_frame, this->name, - priv->child_count); - if (ret) - goto out; - sh->private = afr_sh_priv_init (); - if (!sh->private) { - ret = -1; - goto out; - } - sh_loop_driver (sh_frame, this, _gf_true, first_loop_frame); - ret = 0; -out: - if (ret) { - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - sh_loop_driver_done (sh_frame, this, NULL); - } - return 0; -} - -int -afr_sh_algo_diff (call_frame_t *sh_frame, xlator_t *this) -{ - afr_sh_start_loops (sh_frame, this, sh_diff_checksum); - return 0; -} - -int -afr_sh_algo_full (call_frame_t *sh_frame, xlator_t *this) -{ - afr_sh_start_loops (sh_frame, this, sh_full_read_write_to_sinks); - return 0; -} - -struct afr_sh_algorithm afr_self_heal_algorithms[] = { - {.name = "full", .fn = afr_sh_algo_full}, - {.name = "diff", .fn = afr_sh_algo_diff}, - {0, 0}, -}; diff --git a/xlators/cluster/afr/src/afr-self-heal-algorithm.h b/xlators/cluster/afr/src/afr-self-heal-algorithm.h deleted file mode 100644 index 6b20789b1bb..00000000000 --- a/xlators/cluster/afr/src/afr-self-heal-algorithm.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> - This file is part of GlusterFS. - - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. -*/ - -#ifndef __AFR_SELF_HEAL_ALGORITHM_H__ -#define __AFR_SELF_HEAL_ALGORITHM_H__ - -typedef int (*afr_sh_algo_fn) (call_frame_t *frame, - xlator_t *this); - -struct afr_sh_algorithm { - const char *name; - afr_sh_algo_fn fn; -}; - -extern struct afr_sh_algorithm afr_self_heal_algorithms[3]; -typedef struct { - gf_lock_t lock; - unsigned int loops_running; - off_t offset; - - int32_t total_blocks; - int32_t diff_blocks; -} afr_sh_algo_private_t; - -#endif /* __AFR_SELF_HEAL_ALGORITHM_H__ */ diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index ef92b420551..4dac8311340 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -1,5 +1,5 @@ /* - Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. This file is licensed to you under your choice of the GNU Lesser @@ -8,2805 +8,1002 @@ cases as published by the Free Software Foundation. */ -#include "glusterfs.h" -#include "xlator.h" -#include "byte-order.h" + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif #include "afr.h" -#include "afr-transaction.h" -#include "afr-self-heal-common.h" #include "afr-self-heal.h" -#include "pump.h" - -#define ADD_FMT_STRING(msg, off, sh_str, status, print_log) \ - do { \ - if (AFR_SELF_HEAL_NOT_ATTEMPTED != status) { \ - off += snprintf (msg + off, sizeof (msg) - off, \ - " "sh_str" self heal %s,", \ - get_sh_completion_status (status));\ - print_log = 1; \ - } \ - } while (0) - -#define ADD_FMT_STRING_SYNC(msg, off, sh_str, status, print_log) \ - do { \ - if (AFR_SELF_HEAL_SYNC_BEGIN == status || \ - AFR_SELF_HEAL_FAILED == status) { \ - off += snprintf (msg + off, sizeof (msg) - off, \ - " "sh_str" self heal %s,", \ - get_sh_completion_status (status));\ - print_log = 1; \ - } \ - } while (0) +#include "byte-order.h" -void -afr_sh_reset (call_frame_t *frame, xlator_t *this) +int +afr_selfheal_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - memset (sh->child_errno, 0, - sizeof (*sh->child_errno) * priv->child_count); - memset (sh->buf, 0, sizeof (*sh->buf) * priv->child_count); - memset (sh->parentbufs, 0, - sizeof (*sh->parentbufs) * priv->child_count); - memset (sh->success, 0, sizeof (*sh->success) * priv->child_count); - memset (sh->locked_nodes, 0, - sizeof (*sh->locked_nodes) * priv->child_count); - sh->active_sinks = 0; - - afr_reset_xattr (sh->xattr, priv->child_count); -} + afr_local_t *local = NULL; -//Intersection[child]=1 if child is part of intersection -void -afr_children_intersection_get (int32_t *set1, int32_t *set2, - int *intersection, unsigned int child_count) -{ - int i = 0; - - memset (intersection, 0, sizeof (*intersection) * child_count); - for (i = 0; i < child_count; i++) { - intersection[i] = afr_is_child_present (set1, child_count, i) - && afr_is_child_present (set2, child_count, - i); - } + local = frame->local; + + syncbarrier_wake (&local->barrier); + + return 0; } -/** - * select_source - select a source and return it - */ int -afr_sh_select_source (int sources[], int child_count) +afr_selfheal_post_op (call_frame_t *frame, xlator_t *this, inode_t *inode, + int subvol, dict_t *xattr) { - int i = 0; - for (i = 0; i < child_count; i++) - if (sources[i]) - return i; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + loc_t loc = {0, }; - return -1; -} + priv = this->private; + local = frame->local; -void -afr_sh_mark_source_sinks (call_frame_t *frame, xlator_t *this) -{ - int i = 0; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int active_sinks = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - for (i = 0; i < priv->child_count; i++) { - if (sh->sources[i] == 0 && local->child_up[i] == 1) { - active_sinks++; - sh->success[i] = 1; - } else if (sh->sources[i] == 1 && local->child_up[i] == 1) { - sh->success[i] = 1; - } - } - sh->active_sinks = active_sinks; -} + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); -int -afr_sh_source_count (int sources[], int child_count) -{ - int i = 0; - int nsource = 0; + STACK_WIND (frame, afr_selfheal_post_op_cbk, priv->children[subvol], + priv->children[subvol]->fops->xattrop, &loc, + GF_XATTROP_ADD_ARRAY, xattr, NULL); - for (i = 0; i < child_count; i++) - if (sources[i]) - nsource++; - return nsource; -} + syncbarrier_wait (&local->barrier, 1); -void -afr_sh_set_error (afr_self_heal_t *sh, int32_t op_errno) -{ - sh->op_ret = -1; - sh->op_errno = afr_most_important_error(sh->op_errno, op_errno, - _gf_false); + return 0; } -void -afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this) -{ - afr_private_t * priv = this->private; - char *buf = NULL; - char *ptr = NULL; - int i = 0; - int j = 0; - - /* 10 digits per entry + 1 space + '[' and ']' */ - buf = GF_MALLOC (priv->child_count * 11 + 8, gf_afr_mt_char); - - for (i = 0; i < priv->child_count; i++) { - ptr = buf; - ptr += sprintf (ptr, "[ "); - for (j = 0; j < priv->child_count; j++) { - ptr += sprintf (ptr, "%d ", pending_matrix[i][j]); - } - sprintf (ptr, "]"); - gf_log (this->name, GF_LOG_DEBUG, "pending_matrix: %s", buf); - } - - GF_FREE (buf); -} -char* -afr_get_pending_matrix_str (int32_t *pending_matrix[], xlator_t *this) +dict_t * +afr_selfheal_output_xattr (xlator_t *this, afr_transaction_type type, + int *output_dirty, int **output_matrix, int subvol) { - afr_private_t * priv = this->private; - char *buf = NULL; - char *ptr = NULL; - int i = 0; - int j = 0; - int child_count = priv->child_count; - char *matrix_begin = "[ [ "; - char *matrix_end = "] ]"; - char *seperator = "] [ "; - int pending_entry_strlen = 12; //Including space after entry - int matrix_begin_strlen = 0; - int matrix_end_strlen = 0; - int seperator_strlen = 0; - int string_length = 0; - char *msg = "- Pending matrix: "; - - /* - * for a list of lists of [ [ a b ] [ c d ] ] - * */ - - matrix_begin_strlen = strlen (matrix_begin); - matrix_end_strlen = strlen (matrix_end); - seperator_strlen = strlen (seperator); - string_length = matrix_begin_strlen + matrix_end_strlen - + (child_count -1) * seperator_strlen - + (child_count * child_count * pending_entry_strlen); - - buf = GF_CALLOC (1, 1 + strlen (msg) + string_length , gf_afr_mt_char); - if (!buf) - goto out; - - ptr = buf; - ptr += sprintf (ptr, "%s", msg); - ptr += sprintf (ptr, "%s", matrix_begin); - for (i = 0; i < priv->child_count; i++) { - for (j = 0; j < priv->child_count; j++) { - ptr += sprintf (ptr, "%d ", pending_matrix[i][j]); - } - if (i < priv->child_count -1) - ptr += sprintf (ptr, "%s", seperator); - } - - ptr += sprintf (ptr, "%s", matrix_end); + dict_t *xattr = NULL; + afr_private_t *priv = NULL; + int j = 0; + int idx = 0; + int ret = 0; + int *raw = 0; -out: - return buf; -} + priv = this->private; + idx = afr_index_for_transaction_type (type); -void -afr_sh_print_split_brain_log (int32_t *pending_matrix[], xlator_t *this, - const char *loc) -{ - char *buf = NULL; - char *free_ptr = NULL; + xattr = dict_new (); + if (!xattr) + return NULL; - buf = afr_get_pending_matrix_str (pending_matrix, this); - if (buf) - free_ptr = buf; - else - buf = ""; + if (output_dirty[subvol]) { + /* clear dirty */ + raw = GF_CALLOC (sizeof(int), AFR_NUM_CHANGE_LOGS, gf_afr_mt_int32_t); + if (!raw) + goto err; + raw[idx] = hton32 (output_dirty[subvol]); + ret = dict_set_bin (xattr, AFR_DIRTY, raw, + sizeof(int) * AFR_NUM_CHANGE_LOGS); + if (ret) + goto err; + } - gf_log (this->name, GF_LOG_ERROR, "Unable to self-heal contents of '%s'" - " (possible split-brain). Please delete the file from all but " - "the preferred subvolume.%s", loc, buf); - GF_FREE (free_ptr); - return; -} + /* clear/set pending */ + for (j = 0; j < priv->child_count; j++) { + if (!output_matrix[subvol][j]) + continue; + raw = GF_CALLOC (sizeof(int), AFR_NUM_CHANGE_LOGS, + gf_afr_mt_int32_t); + if (!raw) + goto err; -void -afr_init_pending_matrix (int32_t **pending_matrix, size_t child_count) -{ - int i = 0; - int j = 0; + raw[idx] = hton32 (output_matrix[subvol][j]); - GF_ASSERT (pending_matrix); + ret = dict_set_bin (xattr, priv->pending_key[j], + raw, sizeof(int) * AFR_NUM_CHANGE_LOGS); + if (ret) + goto err; + } - for (i = 0; i < child_count; i++) { - for (j = 0; j < child_count; j++) { - pending_matrix[i][j] = 0; - } - } + return xattr; +err: + if (xattr) + dict_unref (xattr); + return NULL; } -void -afr_mark_ignorant_subvols_as_pending (int32_t **pending_matrix, - unsigned char *ignorant_subvols, - size_t child_count) -{ - int i = 0; - int j = 0; - - GF_ASSERT (pending_matrix); - GF_ASSERT (ignorant_subvols); - - for (i = 0; i < child_count; i++) { - if (ignorant_subvols[i]) { - for (j = 0; j < child_count; j++) { - if (!ignorant_subvols[j]) - pending_matrix[j][i] += 1; - } - } - } -} int -afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix, - unsigned char *ignorant_subvols, - dict_t *xattr[], afr_transaction_type type, - size_t child_count) -{ - /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */ - int32_t pending[3] = {0,}; - void *pending_raw = NULL; - int ret = -1; - int i = 0; - int j = 0; - int k = 0; - - afr_init_pending_matrix (pending_matrix, child_count); - - for (i = 0; i < child_count; i++) { - pending_raw = NULL; - - for (j = 0; j < child_count; j++) { - ret = dict_get_ptr (xattr[i], pending_key[j], - &pending_raw); - - if (ret != 0) { - /* - * There is no xattr present. This means this - * subvolume should be considered an 'ignorant' - * subvolume. - */ - - if (ignorant_subvols) - ignorant_subvols[i] = 1; - continue; - } - - memcpy (pending, pending_raw, sizeof(pending)); - k = afr_index_for_transaction_type (type); - - pending_matrix[i][j] = ntoh32 (pending[k]); - } - } - - return ret; -} +afr_selfheal_undo_pending (call_frame_t *frame, xlator_t *this, inode_t *inode, + unsigned char *sources, unsigned char *sinks, + unsigned char *healed_sinks, afr_transaction_type type, + struct afr_reply *replies, unsigned char *locked_on) +{ + afr_private_t *priv = NULL; + int i = 0; + int j = 0; + unsigned char *pending = NULL; + int *input_dirty = NULL; + int **input_matrix = NULL; + int *output_dirty = NULL; + int **output_matrix = NULL; + dict_t *xattr = NULL; + + priv = this->private; + + pending = alloca0 (priv->child_count); + + input_dirty = alloca0 (priv->child_count * sizeof (int)); + input_matrix = ALLOC_MATRIX (priv->child_count, int); + output_dirty = alloca0 (priv->child_count * sizeof (int)); + output_matrix = ALLOC_MATRIX (priv->child_count, int); + + afr_selfheal_extract_xattr (this, replies, type, input_dirty, + input_matrix); + + for (i = 0; i < priv->child_count; i++) + if (sinks[i] && !healed_sinks[i]) + pending[i] = 1; + + for (i = 0; i < priv->child_count; i++) { + for (j = 0; j < priv->child_count; j++) { + if (pending[j]) + output_matrix[i][j] = 1; + else + output_matrix[i][j] = -input_matrix[i][j]; + } + } -typedef enum { - AFR_NODE_INVALID, - AFR_NODE_INNOCENT, - AFR_NODE_FOOL, - AFR_NODE_WISE, -} afr_node_type; + for (i = 0; i < priv->child_count; i++) { + if (!pending[i]) + output_dirty[i] = -input_dirty[i]; + } -typedef struct { - afr_node_type type; - int wisdom; -} afr_node_character; + for (i = 0; i < priv->child_count; i++) { + if (!locked_on[i]) + /* perform post-op only on subvols we had locked + and inspected on. + */ + continue; + xattr = afr_selfheal_output_xattr (this, type, output_dirty, + output_matrix, i); + if (!xattr) { + gf_log (this->name, GF_LOG_ERROR, + "unable to allocate xdata for subvol %d", i); + continue; + } -static int -afr_sh_is_innocent (int32_t *array, int child_count) -{ - int i = 0; - int ret = 1; /* innocent until proven guilty */ + afr_selfheal_post_op (frame, this, inode, i, xattr); - for (i = 0; i < child_count; i++) { - if (array[i]) { - ret = 0; - break; - } - } + dict_unref (xattr); + } - return ret; + return 0; } -static int -afr_sh_is_fool (int32_t *array, int i, int child_count) -{ - return array[i]; /* fool if accuses itself */ +void +afr_replies_copy (struct afr_reply *dst, struct afr_reply *src, int count) +{ + int i = 0; + dict_t *xdata = NULL; + + if (dst == src) + return; + + for (i = 0; i < count; i++) { + dst[i].valid = src[i].valid; + dst[i].op_ret = src[i].op_ret; + dst[i].op_errno = src[i].op_errno; + dst[i].prestat = src[i].prestat; + dst[i].poststat = src[i].poststat; + dst[i].preparent = src[i].preparent; + dst[i].postparent = src[i].postparent; + dst[i].preparent2 = src[i].preparent2; + dst[i].postparent2 = src[i].postparent2; + if (src[i].xdata) + xdata = dict_ref (src[i].xdata); + else + xdata = NULL; + if (dst[i].xdata) + dict_unref (dst[i].xdata); + dst[i].xdata = xdata; + memcpy (dst[i].checksum, src[i].checksum, + MD5_DIGEST_LENGTH); + } } -static int -afr_sh_is_wise (int32_t *array, int i, int child_count) +int +afr_selfheal_fill_dirty (xlator_t *this, int *dirty, int subvol, + int idx, dict_t *xdata) { - return !array[i]; /* wise if does not accuse itself */ -} + void *pending_raw = NULL; + int pending[3] = {0, }; + if (dict_get_ptr (xdata, AFR_DIRTY, &pending_raw)) + return -1; -static int -afr_sh_all_nodes_innocent (afr_node_character *characters, - int child_count) -{ - int i = 0; - int ret = 1; + if (!pending_raw) + return -1; + + memcpy (pending, pending_raw, sizeof(pending)); - for (i = 0; i < child_count; i++) { - if (characters[i].type != AFR_NODE_INNOCENT) { - ret = 0; - break; - } - } + dirty[subvol] = ntoh32 (pending[idx]); - return ret; + return 0; } -static int -afr_sh_wise_nodes_exist (afr_node_character *characters, int child_count) +int +afr_selfheal_fill_matrix (xlator_t *this, int **matrix, int subvol, + int idx, dict_t *xdata) { - int i = 0; - int ret = 0; + int i = 0; + void *pending_raw = NULL; + int pending[3] = {0, }; + afr_private_t *priv = NULL; - for (i = 0; i < child_count; i++) { - if (characters[i].type == AFR_NODE_WISE) { - ret = 1; - break; - } - } + priv = this->private; - return ret; -} + for (i = 0; i < priv->child_count; i++) { + if (dict_get_ptr (xdata, priv->pending_key[i], &pending_raw)) + continue; + if (!pending_raw) + continue; -/* - * The 'wisdom' of a wise node is 0 if any other wise node accuses it. - * It is 1 if no other wise node accuses it. - * Only wise nodes with wisdom 1 are sources. - * - * If no nodes with wisdom 1 exist, a split-brain has occurred. - */ + memcpy (pending, pending_raw, sizeof(pending)); -static void -afr_sh_compute_wisdom (int32_t *pending_matrix[], - afr_node_character characters[], int child_count) -{ - int i = 0; - int j = 0; - - for (i = 0; i < child_count; i++) { - if (characters[i].type == AFR_NODE_WISE) { - characters[i].wisdom = 1; - - for (j = 0; j < child_count; j++) { - if ((characters[j].type == AFR_NODE_WISE) - && pending_matrix[j][i]) { - - characters[i].wisdom = 0; - } - } - } - } + matrix[subvol][i] = ntoh32 (pending[idx]); + } + + return 0; } -static int -afr_sh_wise_nodes_conflict (afr_node_character *characters, - int child_count) +int +afr_selfheal_extract_xattr (xlator_t *this, struct afr_reply *replies, + afr_transaction_type type, int *dirty, int **matrix) { - int i = 0; - int ret = 1; + afr_private_t *priv = NULL; + int i = 0; + dict_t *xdata = NULL; + int idx = -1; + + idx = afr_index_for_transaction_type (type); - for (i = 0; i < child_count; i++) { - if ((characters[i].type == AFR_NODE_WISE) - && characters[i].wisdom == 1) { + priv = this->private; - /* There is atleast one bona-fide wise node */ - ret = 0; - break; - } - } + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].xdata) + continue; + + xdata = replies[i].xdata; - return ret; + afr_selfheal_fill_dirty (this, dirty, i, idx, xdata); + afr_selfheal_fill_matrix (this, matrix, i, idx, xdata); + } + + return 0; } -static int -afr_sh_mark_wisest_as_sources (int sources[], - afr_node_character *characters, - int child_count) -{ - int nsources = 0; - int i = 0; - for (i = 0; i < child_count; i++) { - if (characters[i].wisdom == 1) { - sources[i] = 1; - nsources++; - } - } +/* + * This function determines if a self-heal is required for a given inode, + * and if needed, in what direction. + * + * locked_on[] is the array representing servers which have been locked and + * from which xattrs have been fetched for analysis. + * + * The output of the function is by filling the arrays sources[] and sinks[]. + * + * sources[i] is set if i'th server is an eligible source for a selfheal. + * + * sinks[i] is set if i'th server needs to be healed. + * + * if sources[0..N] are all set, there is no need for a selfheal. + * + * if sinks[0..N] are all set, the inode is in split brain. + * + */ - return nsources; -} +int +afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this, + struct afr_reply *replies, + afr_transaction_type type, unsigned char *locked_on, + unsigned char *sources, unsigned char *sinks) +{ + afr_private_t *priv = NULL; + int i = 0; + int j = 0; + int *dirty = NULL; + int **matrix = NULL; + char *accused = NULL; + + priv = this->private; + + dirty = alloca0 (priv->child_count * sizeof (int)); + accused = alloca0 (priv->child_count); + matrix = ALLOC_MATRIX(priv->child_count, int); + + /* First construct the pending matrix for further analysis */ + afr_selfheal_extract_xattr (this, replies, type, dirty, matrix); + + /* Next short list all accused to exclude them from being sources */ + for (i = 0; i < priv->child_count; i++) { + for (j = 0; j < priv->child_count; j++) { + if (matrix[i][j]) + accused[j] = 1; + } + } -static void -afr_compute_witness_of_fools (int32_t *witnesses, int32_t **pending_matrix, - afr_node_character *characters, - int32_t child_count) -{ - int i = 0; - int j = 0; - int witness = 0; - - GF_ASSERT (witnesses); - GF_ASSERT (pending_matrix); - GF_ASSERT (characters); - GF_ASSERT (child_count > 0); - - for (i = 0; i < child_count; i++) { - if (characters[i].type != AFR_NODE_FOOL) - continue; - - witness = 0; - for (j = 0; j < child_count; j++) { - if (i == j) - continue; - witness += pending_matrix[i][j]; - } - witnesses[i] = witness; - } -} + /* Short list all non-accused as sources */ + memset (sources, 0, priv->child_count); + for (i = 0; i < priv->child_count; i++) { + if (!accused[i] && locked_on[i]) + sources[i] = 1; + } -static int32_t -afr_find_biggest_witness_among_fools (int32_t *witnesses, - afr_node_character *characters, - int32_t child_count) -{ - int i = 0; - int biggest_witness = -1; - int biggest_witness_idx = -1; - int biggest_witness_cnt = -1; - - GF_ASSERT (witnesses); - GF_ASSERT (characters); - GF_ASSERT (child_count > 0); - - for (i = 0; i < child_count; i++) { - if (characters[i].type != AFR_NODE_FOOL) - continue; - - if (biggest_witness < witnesses[i]) { - biggest_witness = witnesses[i]; - biggest_witness_idx = i; - biggest_witness_cnt = 1; + /* Everyone accused by sources are sinks */ + memset (sinks, 0, priv->child_count); + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) continue; + for (j = 0; j < priv->child_count; j++) { + if (matrix[i][j]) + sinks[j] = 1; } + } - if (biggest_witness == witnesses[i]) - biggest_witness_cnt++; - } + /* If any source has 'dirty' bit, pick first + 'dirty' source and make everybody else sinks */ + for (i = 0; i < priv->child_count; i++) { + if (sources[i] && dirty[i]) { + for (j = 0; j < priv->child_count; j++) { + if (j != i) { + sources[j] = 0; + sinks[j] = 1; + } + } + break; + } + } - if (biggest_witness_cnt != 1) - return -1; + /* If no sources, all locked nodes are sinks - split brain */ + if (AFR_COUNT (sources, priv->child_count) == 0) { + for (i = 0; i < priv->child_count; i++) { + if (locked_on[i]) + sinks[i] = 1; + } + } - return biggest_witness_idx; + return 0; } + int -afr_mark_fool_as_source_by_witness (int32_t *sources, int32_t *witnesses, - afr_node_character *characters, - int32_t child_count, int32_t witness) +afr_selfheal_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, struct iatt *parbuf) { - int i = 0; - int nsources = 0; - - GF_ASSERT (sources); - GF_ASSERT (witnesses); - GF_ASSERT (characters); - GF_ASSERT (child_count > 0); - - for (i = 0; i < child_count; i++) { - if (characters[i].type != AFR_NODE_FOOL) - continue; - - if (witness == witnesses[i]) { - sources[i] = 1; - nsources++; - } - } - return nsources; -} + afr_local_t *local = NULL; + int i = -1; + local = frame->local; + i = (long) cookie; + + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; + if (buf) + local->replies[i].poststat = *buf; + if (parbuf) + local->replies[i].postparent = *parbuf; + if (xdata) + local->replies[i].xdata = dict_ref (xdata); + + syncbarrier_wake (&local->barrier); -int -afr_mark_fool_as_source_by_idx (int32_t *sources, int child_count, int idx) -{ - if (idx >= 0 && idx < child_count) { - sources[idx] = 1; - return 1; - } return 0; } -static int -afr_find_largest_file_size (struct iatt *bufs, int32_t *success_children, - int child_count) +inode_t * +afr_selfheal_unlocked_lookup_on (call_frame_t *frame, inode_t *parent, + const char *name, struct afr_reply *replies, + unsigned char *lookup_on) { - int idx = -1; - int i = -1; - int child = -1; - uint64_t max_size = 0; - uint64_t min_size = 0; - int num_children = 0; + loc_t loc = {0, }; + dict_t *xattr_req = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + inode_t *inode = NULL; - for (i = 0; i < child_count; i++) { - if (success_children[i] == -1) - break; + local = frame->local; + priv = frame->this->private; - child = success_children[i]; - if (bufs[child].ia_size > max_size) { - max_size = bufs[child].ia_size; - idx = child; - } - - if ((num_children == 0) || (bufs[child].ia_size < min_size)) { - min_size = bufs[child].ia_size; - } + xattr_req = dict_new (); + if (!xattr_req) + return NULL; - num_children++; + if (afr_xattr_req_prepare (frame->this, xattr_req) != 0) { + dict_destroy (xattr_req); + return NULL; } - /* If sizes are same for all of them, finding sources will have to - * happen with pending changelog. So return -1 - */ - if ((num_children > 1) && (min_size == max_size)) - return -1; - return idx; -} + inode = inode_new (parent->table); + if (!inode) { + dict_destroy (xattr_req); + return NULL; + } + loc.parent = inode_ref (parent); + uuid_copy (loc.pargfid, parent->gfid); + loc.name = name; + loc.inode = inode_ref (inode); -static int -afr_find_newest_file (struct iatt *bufs, int32_t *success_children, - int child_count) -{ - int idx = -1; - int i = -1; - int child = -1; - uint64_t max_ctime = 0; + AFR_ONLIST (lookup_on, frame, afr_selfheal_discover_cbk, lookup, &loc, + xattr_req); - for (i = 0; i < child_count; i++) { - if (success_children[i] == -1) - break; + afr_replies_copy (replies, local->replies, priv->child_count); - child = success_children[i]; - if (bufs[child].ia_ctime > max_ctime) { - max_ctime = bufs[child].ia_ctime; - idx = child; - } - } + loc_wipe (&loc); + dict_unref (xattr_req); - return idx; + return inode; } -static int -afr_mark_biggest_of_fools_as_source (int32_t *sources, int32_t **pending_matrix, - afr_node_character *characters, - int32_t *success_children, - int child_count, struct iatt *bufs) -{ - int32_t biggest_witness = 0; - int nsources = 0; - int32_t *witnesses = NULL; - - GF_ASSERT (child_count > 0); - - biggest_witness = afr_find_largest_file_size (bufs, success_children, - child_count); - if (biggest_witness != -1) - goto found; - - witnesses = GF_CALLOC (child_count, sizeof (*witnesses), - gf_afr_mt_int32_t); - if (NULL == witnesses) { - nsources = -1; - goto out; - } - - afr_compute_witness_of_fools (witnesses, pending_matrix, characters, - child_count); - biggest_witness = afr_find_biggest_witness_among_fools (witnesses, - characters, - child_count); - if (biggest_witness != -1) - goto found; - - biggest_witness = afr_find_newest_file (bufs, success_children, - child_count); - -found: - nsources = afr_mark_fool_as_source_by_idx (sources, child_count, - biggest_witness); -out: - GF_FREE (witnesses); - return nsources; -} - int -afr_mark_child_as_source_by_uid (int32_t *sources, struct iatt *bufs, - int32_t *success_children, - unsigned int child_count, uint32_t uid) +afr_selfheal_unlocked_discover_on (call_frame_t *frame, inode_t *inode, + uuid_t gfid, struct afr_reply *replies, + unsigned char *discover_on) { - int i = 0; - int nsources = 0; - int child = 0; - - for (i = 0; i < child_count; i++) { - if (-1 == success_children[i]) - break; - - child = success_children[i]; - if (uid == bufs[child].ia_uid) { - sources[child] = 1; - nsources++; - } - } - return nsources; -} + loc_t loc = {0, }; + dict_t *xattr_req = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; -int -afr_get_child_with_lowest_uid (struct iatt *bufs, int32_t *success_children, - unsigned int child_count) -{ - int i = 0; - int smallest = -1; - int child = 0; - - for (i = 0; i < child_count; i++) { - if (-1 == success_children[i]) - break; - child = success_children[i]; - if ((smallest == -1) || - (bufs[child].ia_uid < bufs[smallest].ia_uid)) { - smallest = child; - } - } - return smallest; -} + local = frame->local; + priv = frame->this->private; -static int -afr_sh_mark_lowest_uid_as_source (struct iatt *bufs, int32_t *success_children, - int child_count, int32_t *sources) -{ - int nsources = 0; - int smallest = 0; - - smallest = afr_get_child_with_lowest_uid (bufs, success_children, - child_count); - if (smallest < 0) { - nsources = -1; - goto out; - } - nsources = afr_mark_child_as_source_by_uid (sources, bufs, - success_children, child_count, - bufs[smallest].ia_uid); -out: - return nsources; -} + xattr_req = dict_new (); + if (!xattr_req) + return -ENOMEM; -int -afr_get_no_xattr_dir_read_child (xlator_t *this, int32_t *success_children, - struct iatt *bufs) -{ - afr_private_t *priv = NULL; - int i = 0; - int child = -1; - int read_child = -1; - - priv = this->private; - for (i = 0; i < priv->child_count; i++) { - child = success_children[i]; - if (child < 0) - break; - if (read_child < 0) - read_child = child; - else if (bufs[read_child].ia_size < bufs[child].ia_size) - read_child = child; - } - return read_child; -} + if (afr_xattr_req_prepare (frame->this, xattr_req) != 0) { + dict_destroy (xattr_req); + return -ENOMEM; + } -int -afr_sh_mark_zero_size_file_as_sink (struct iatt *bufs, int32_t *success_children, - int child_count, int32_t *sources) -{ - int nsources = 0; - int i = 0; - int child = 0; - gf_boolean_t sink_exists = _gf_false; - gf_boolean_t source_exists = _gf_false; - int source = -1; - - for (i = 0; i < child_count; i++) { - child = success_children[i]; - if (child < 0) - break; - if (!bufs[child].ia_size) { - sink_exists = _gf_true; - continue; - } - if (!source_exists) { - source_exists = _gf_true; - source = child; - continue; - } - if (bufs[source].ia_size != bufs[child].ia_size) { - nsources = -1; - goto out; - } - } - if (!source_exists && !sink_exists) { - nsources = -1; - goto out; - } - - if (!source_exists || !sink_exists) - goto out; - - for (i = 0; i < child_count; i++) { - child = success_children[i]; - if (child < 0) - break; - if (bufs[child].ia_size) { - sources[child] = 1; - nsources++; - } - } -out: - return nsources; -} + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, gfid); -char * -afr_get_character_str (afr_node_type type) -{ - char *character = NULL; - - switch (type) { - case AFR_NODE_INNOCENT: - character = "innocent"; - break; - case AFR_NODE_FOOL: - character = "fool"; - break; - case AFR_NODE_WISE: - character = "wise"; - break; - default: - character = "invalid"; - break; - } - return character; -} + AFR_ONLIST (discover_on, frame, afr_selfheal_discover_cbk, lookup, &loc, + xattr_req); -afr_node_type -afr_find_child_character_type (int32_t *pending_row, int32_t child, - unsigned int child_count) -{ - afr_node_type type = AFR_NODE_INVALID; + afr_replies_copy (replies, local->replies, priv->child_count); - GF_ASSERT ((child >= 0) && (child < child_count)); + loc_wipe (&loc); + dict_unref (xattr_req); - if (afr_sh_is_innocent (pending_row, child_count)) - type = AFR_NODE_INNOCENT; - else if (afr_sh_is_fool (pending_row, child, child_count)) - type = AFR_NODE_FOOL; - else if (afr_sh_is_wise (pending_row, child, child_count)) - type = AFR_NODE_WISE; - return type; + return 0; } int -afr_build_sources (xlator_t *this, dict_t **xattr, struct iatt *bufs, - int32_t **pending_matrix, int32_t *sources, - int32_t *success_children, afr_transaction_type type, - int32_t *subvol_status, gf_boolean_t ignore_ignorant) +afr_selfheal_unlocked_discover (call_frame_t *frame, inode_t *inode, + uuid_t gfid, struct afr_reply *replies) { - afr_private_t *priv = NULL; - afr_self_heal_type sh_type = AFR_SELF_HEAL_INVALID; - int nsources = -1; - unsigned char *ignorant_subvols = NULL; - unsigned int child_count = 0; - - priv = this->private; - child_count = priv->child_count; - - if (afr_get_children_count (success_children, priv->child_count) == 0) - goto out; - - if (!ignore_ignorant) { - ignorant_subvols = GF_CALLOC (sizeof (*ignorant_subvols), - child_count, gf_afr_mt_char); - if (NULL == ignorant_subvols) - goto out; - } - - afr_build_pending_matrix (priv->pending_key, pending_matrix, - ignorant_subvols, xattr, type, - priv->child_count); - - if (!ignore_ignorant) - afr_mark_ignorant_subvols_as_pending (pending_matrix, - ignorant_subvols, - priv->child_count); - sh_type = afr_self_heal_type_for_transaction (type); - if (AFR_SELF_HEAL_INVALID == sh_type) - goto out; - - afr_sh_print_pending_matrix (pending_matrix, this); - - nsources = afr_mark_sources (this, sources, pending_matrix, bufs, - sh_type, success_children, subvol_status); -out: - GF_FREE (ignorant_subvols); - return nsources; -} + afr_private_t *priv = NULL; -void -afr_find_character_types (afr_node_character *characters, - int32_t **pending_matrix, int32_t *success_children, - unsigned int child_count) -{ - afr_node_type type = AFR_NODE_INVALID; - int child = 0; - int i = 0; - - for (i = 0; i < child_count; i++) { - child = success_children[i]; - if (child == -1) - break; - type = afr_find_child_character_type (pending_matrix[child], - child, child_count); - characters[child].type = type; - } -} + priv = frame->this->private; -void -afr_mark_success_children_sources (int32_t *sources, int32_t *success_children, - unsigned int child_count) -{ - int i = 0; - for (i = 0; i < child_count; i++) { - if (success_children[i] == -1) - break; - sources[success_children[i]] = 1; - } + return afr_selfheal_unlocked_discover_on (frame, inode, gfid, replies, + priv->child_up); } -/** - * mark_sources: Mark all 'source' nodes and return number of source - * nodes found - * - * A node (a row in the pending matrix) belongs to one of - * three categories: - * - * M is the pending matrix. - * - * 'innocent' - M[i] is all zeroes - * 'fool' - M[i] has i'th element = 1 (self-reference) - * 'wise' - M[i] has i'th element = 0, others are 1 or 0. - * - * All 'innocent' nodes are sinks. If all nodes are innocent, no self-heal is - * needed. - * - * A 'wise' node can be a source. If two 'wise' nodes conflict, it is - * a split-brain. If one wise node refers to the other but the other doesn't - * refer back, the referrer is a source. - * - * All fools are sinks, unless there are no 'wise' nodes. In that case, - * one of the fools is made a source. - */ + int -afr_mark_sources (xlator_t *this, int32_t *sources, int32_t **pending_matrix, - struct iatt *bufs, afr_self_heal_type type, - int32_t *success_children, int32_t *subvol_status) +afr_selfheal_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) { - /* stores the 'characters' (innocent, fool, wise) of the nodes */ - afr_node_character *characters = NULL; - int nsources = -1; - unsigned int child_count = 0; - afr_private_t *priv = NULL; - - priv = this->private; - child_count = priv->child_count; - characters = GF_CALLOC (sizeof (afr_node_character), - child_count, gf_afr_mt_afr_node_character); - if (!characters) - goto out; - - this = THIS; - - /* start clean */ - memset (sources, 0, sizeof (*sources) * child_count); - nsources = 0; - afr_find_character_types (characters, pending_matrix, success_children, - child_count); - if (afr_sh_all_nodes_innocent (characters, child_count)) { - switch (type) { - case AFR_SELF_HEAL_METADATA: - nsources = afr_sh_mark_lowest_uid_as_source (bufs, - success_children, - child_count, - sources); - break; - case AFR_SELF_HEAL_DATA: - nsources = afr_sh_mark_zero_size_file_as_sink (bufs, - success_children, - child_count, - sources); - if ((nsources < 0) && subvol_status) - *subvol_status |= SPLIT_BRAIN; - break; - default: - break; - } - goto out; - } - - if (afr_sh_wise_nodes_exist (characters, child_count)) { - afr_sh_compute_wisdom (pending_matrix, characters, child_count); - - if (afr_sh_wise_nodes_conflict (characters, child_count)) { - if (subvol_status) - *subvol_status |= SPLIT_BRAIN; - nsources = -1; - } else { - nsources = afr_sh_mark_wisest_as_sources (sources, - characters, - child_count); - } - } else { - if (subvol_status) - *subvol_status |= ALL_FOOLS; - nsources = afr_mark_biggest_of_fools_as_source (sources, - pending_matrix, - characters, - success_children, - child_count, bufs); - } + afr_local_t *local = NULL; + int i = 0; -out: - if (nsources == 0) - afr_mark_success_children_sources (sources, success_children, - child_count); - GF_FREE (characters); + local = frame->local; + i = (long) cookie; - gf_log (this->name, GF_LOG_DEBUG, "Number of sources: %d", nsources); - return nsources; -} + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; -void -afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr, - int32_t *delta_matrix[], unsigned char success[], - int child_count, afr_transaction_type type) -{ - int tgt = 0; - int src = 0; - int value = 0; - - afr_build_pending_matrix (priv->pending_key, delta_matrix, NULL, - xattr, type, priv->child_count); - - /* - * The algorithm here has two parts. First, for each subvol indexed - * as tgt, we try to figure out what count everyone should have for it. - * If the self-heal succeeded, that's easy; the value is zero. - * Otherwise, the value is the maximum of the succeeding nodes' counts. - * Once we know the value, we loop through (possibly for a second time) - * setting each count to the difference so that when we're done all - * succeeding nodes will have the same count for tgt. - */ - for (tgt = 0; tgt < priv->child_count; ++tgt) { - value = 0; - if (!success[tgt]) { - /* Find the maximum. */ - for (src = 0; src < priv->child_count; ++src) { - if (!success[src]) { - continue; - } - if (delta_matrix[src][tgt] > value) { - value = delta_matrix[src][tgt]; - } - } - } - /* Force everyone who succeeded to the chosen value. */ - for (src = 0; src < priv->child_count; ++src) { - if (success[src]) { - delta_matrix[src][tgt] = value - - delta_matrix[src][tgt]; - } - else { - delta_matrix[src][tgt] = 0; - } - } - } + syncbarrier_wake (&local->barrier); + + return 0; } int -afr_sh_delta_to_xattr (xlator_t *this, - int32_t *delta_matrix[], dict_t *xattr[], - int child_count, afr_transaction_type type) -{ - int i = 0; - int j = 0; - int k = 0; - int ret = 0; - int32_t *pending = NULL; - int32_t *local_pending = NULL; - afr_private_t *priv = NULL; - - priv = this->private; - for (i = 0; i < child_count; i++) { - if (!xattr[i]) - continue; - - local_pending = NULL; - for (j = 0; j < child_count; j++) { - pending = GF_CALLOC (sizeof (int32_t), 3, - gf_afr_mt_int32_t); - - if (!pending) { - gf_log (this->name, GF_LOG_ERROR, - "failed to allocate pending entry " - "for %s[%d] on %s", - priv->pending_key[j], type, - priv->children[i]->name); - continue; - } - /* 3 = data+metadata+entry */ - - k = afr_index_for_transaction_type (type); - - pending[k] = hton32 (delta_matrix[i][j]); - - if (j == i) { - local_pending = pending; - continue; - } - ret = dict_set_bin (xattr[i], priv->pending_key[j], - pending, - AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Unable to set dict value."); - GF_FREE (pending); - } - } - if (local_pending) { - ret = dict_set_bin (xattr[i], priv->pending_key[i], - local_pending, - AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Unable to set dict value."); - GF_FREE (local_pending); - } - } - } - return 0; +afr_selfheal_locked_fill (call_frame_t *frame, xlator_t *this, + unsigned char *locked_on) +{ + int i = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int count = 0; + + local = frame->local; + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].valid && local->replies[i].op_ret == 0) { + locked_on[i] = 1; + count++; + } else { + locked_on[i] = 0; + } + } + + return count; } int -afr_sh_missing_entries_done (call_frame_t *frame, xlator_t *this) +afr_selfheal_tryinodelk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, off_t off, size_t size, + unsigned char *locked_on) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - - afr_sh_reset (frame, this); - - if (local->unhealable) { - gf_log (this->name, GF_LOG_DEBUG, - "split brain found, aborting selfheal of %s", - local->loc.path); - } - - if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { - sh->completion_cbk (frame, this); - } else { - gf_log (this->name, GF_LOG_TRACE, - "proceeding to metadata check on %s", - local->loc.path); - afr_self_heal_metadata (frame, this); - } - - return 0; -} + loc_t loc = {0,}; + struct gf_flock flock = {0, }; + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); -static int -afr_sh_missing_entries_finish (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; + flock.l_type = F_WRLCK; + flock.l_start = off; + flock.l_len = size; - local = frame->local; - int_lock = &local->internal_lock; + AFR_ONALL (frame, afr_selfheal_lock_cbk, inodelk, dom, + &loc, F_SETLK, &flock, NULL); - int_lock->lock_cbk = afr_sh_missing_entries_done; - afr_unlock (frame, this); + loc_wipe (&loc); - return 0; + return afr_selfheal_locked_fill (frame, this, locked_on); } + int -afr_sh_common_create (afr_self_heal_t *sh, unsigned int child_count) -{ - int ret = -ENOMEM; - sh->buf = GF_CALLOC (child_count, sizeof (*sh->buf), - gf_afr_mt_iatt); - if (!sh->buf) - goto out; - sh->parentbufs = GF_CALLOC (child_count, sizeof (*sh->parentbufs), - gf_afr_mt_iatt); - if (!sh->parentbufs) - goto out; - sh->child_errno = GF_CALLOC (child_count, sizeof (*sh->child_errno), - gf_afr_mt_int); - if (!sh->child_errno) - goto out; - sh->success_children = afr_children_create (child_count); - if (!sh->success_children) - goto out; - sh->fresh_children = afr_children_create (child_count); - if (!sh->fresh_children) - goto out; - sh->xattr = GF_CALLOC (child_count, sizeof (*sh->xattr), - gf_afr_mt_dict_t); - if (!sh->xattr) - goto out; - ret = 0; -out: - return ret; -} +afr_selfheal_inodelk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, off_t off, size_t size, + unsigned char *locked_on) +{ + loc_t loc = {0,}; + struct gf_flock flock = {0, }; + afr_local_t *local = NULL; + int i = 0; + afr_private_t *priv = NULL; + + priv = this->private; + local = frame->local; + + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); + + flock.l_type = F_WRLCK; + flock.l_start = off; + flock.l_len = size; + + AFR_ONALL (frame, afr_selfheal_lock_cbk, inodelk, dom, + &loc, F_SETLK, &flock, NULL); + + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].op_ret == -1 && + local->replies[i].op_errno == EAGAIN) { + afr_selfheal_locked_fill (frame, this, locked_on); + afr_selfheal_uninodelk (frame, this, inode, dom, off, + size, locked_on); + + AFR_SEQ (frame, afr_selfheal_lock_cbk, inodelk, dom, + &loc, F_SETLKW, &flock, NULL); + break; + } + } -void -afr_sh_common_lookup_resp_handler (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, - dict_t *xattr, struct iatt *postparent, - loc_t *loc) -{ - int child_index = 0; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - priv = this->private; - sh = &local->self_heal; - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (op_ret == 0) { - sh->buf[child_index] = *buf; - sh->parentbufs[child_index] = *postparent; - sh->success_children[sh->success_count] = child_index; - sh->success_count++; - sh->xattr[child_index] = dict_ref (xattr); - } else { - gf_log (this->name, GF_LOG_DEBUG, "path %s on subvolume" - " %s => -1 (%s)", loc->path, - priv->children[child_index]->name, - strerror (op_errno)); - local->self_heal.child_errno[child_index] = op_errno; - } - } - UNLOCK (&frame->lock); - return; -} + loc_wipe (&loc); -gf_boolean_t -afr_valid_ia_type (ia_type_t ia_type) -{ - switch (ia_type) { - case IA_IFSOCK: - case IA_IFREG: - case IA_IFBLK: - case IA_IFCHR: - case IA_IFIFO: - case IA_IFLNK: - case IA_IFDIR: - return _gf_true; - default: - return _gf_false; - } - return _gf_false; + return afr_selfheal_locked_fill (frame, this, locked_on); } + int -afr_impunge_frame_create (call_frame_t *frame, xlator_t *this, - int active_source, call_frame_t **impunge_frame) +afr_selfheal_uninodelk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, off_t off, size_t size, + const unsigned char *locked_on) { - afr_local_t *local = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int32_t op_errno = 0; - afr_private_t *priv = NULL; - int ret = 0; - call_frame_t *new_frame = NULL; - - op_errno = ENOMEM; - priv = this->private; - new_frame = copy_frame (frame); - if (!new_frame) { - goto out; - } - - AFR_LOCAL_ALLOC_OR_GOTO (impunge_local, out); - - local = frame->local; - new_frame->local = impunge_local; - impunge_sh = &impunge_local->self_heal; - impunge_sh->sh_frame = frame; - impunge_sh->active_source = active_source; - impunge_local->child_up = memdup (local->child_up, - sizeof (*local->child_up) * - priv->child_count); - if (!impunge_local->child_up) - goto out; - - impunge_local->pending = afr_matrix_create (priv->child_count, - AFR_NUM_CHANGE_LOGS); - if (!impunge_local->pending) - goto out; - - ret = afr_sh_common_create (impunge_sh, priv->child_count); - if (ret) { - op_errno = -ret; - goto out; - } - op_errno = 0; - *impunge_frame = new_frame; -out: - if (op_errno && new_frame) - AFR_STACK_DESTROY (new_frame); - return -op_errno; -} + loc_t loc = {0,}; + struct gf_flock flock = {0, }; -void -afr_sh_missing_entry_call_impunge_recreate (call_frame_t *frame, xlator_t *this, - struct iatt *buf, - struct iatt *postparent, - afr_impunge_done_cbk_t impunge_done) -{ - call_frame_t *impunge_frame = NULL; - afr_local_t *local = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *sh = NULL; - afr_self_heal_t *impunge_sh = NULL; - int ret = 0; - unsigned int enoent_count = 0; - afr_private_t *priv = NULL; - int i = 0; - int32_t op_errno = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - enoent_count = afr_errno_count (NULL, sh->child_errno, - priv->child_count, ENOENT); - if (!enoent_count) { - gf_log (this->name, GF_LOG_INFO, - "no missing files - %s. proceeding to metadata check", - local->loc.path); - goto out; - } - sh->impunge_done = impunge_done; - ret = afr_impunge_frame_create (frame, this, sh->source, &impunge_frame); - if (ret) - goto out; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - loc_copy (&impunge_local->loc, &local->loc); - ret = afr_build_parent_loc (&impunge_sh->parent_loc, - &impunge_local->loc, &op_errno); - if (ret) { - ret = -op_errno; - goto out; - } - impunge_local->call_count = enoent_count; - impunge_sh->entrybuf = sh->buf[sh->source]; - impunge_sh->parentbuf = sh->parentbufs[sh->source]; - for (i = 0; i < priv->child_count; i++) { - if (!impunge_local->child_up[i]) { - impunge_sh->child_errno[i] = ENOTCONN; - continue; - } - if (sh->child_errno[i] != ENOENT) { - impunge_sh->child_errno[i] = EEXIST; - continue; - } - } - for (i = 0; i < priv->child_count; i++) { - if (sh->child_errno[i] != ENOENT) - continue; - afr_sh_entry_impunge_create (impunge_frame, this, i); - enoent_count--; - } - GF_ASSERT (!enoent_count); - return; -out: - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "impunge of %s failed, " - "reason: %s", local->loc.path, strerror (-ret)); - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - } - afr_sh_missing_entries_finish (frame, this); -} -int -afr_sh_create_entry_cbk (call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - if (op_ret < 0) - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - afr_sh_missing_entries_finish (frame, this); - return 0; -} + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); -static int -sh_missing_entries_create (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int type = 0; - struct iatt *buf = NULL; - struct iatt *postparent = NULL; - - local = frame->local; - sh = &local->self_heal; - - buf = &sh->buf[sh->source]; - postparent = &sh->parentbufs[sh->source]; - - type = buf->ia_type; - if (!afr_valid_ia_type (type)) { - gf_log (this->name, GF_LOG_ERROR, - "%s: unknown file type: 0%o", local->loc.path, type); - afr_set_local_for_unhealable (local); - afr_sh_missing_entries_finish (frame, this); - goto out; - } - - afr_sh_missing_entry_call_impunge_recreate (frame, this, - buf, postparent, - afr_sh_create_entry_cbk); -out: - return 0; -} + flock.l_type = F_UNLCK; + flock.l_start = off; + flock.l_len = size; -void -afr_sh_missing_entries_lookup_done (call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - ia_type_t ia_type = IA_INVAL; - int32_t nsources = 0; - loc_t *loc = NULL; - int32_t subvol_status = 0; - afr_transaction_type txn_type = AFR_DATA_TRANSACTION; - gf_boolean_t split_brain = _gf_false; - int read_child = -1; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - loc = &local->loc; - - if (op_ret < 0) { - if (op_errno == EIO) { - afr_set_local_for_unhealable (local); - } - // EIO can happen if finding the fresh parent dir failed - goto out; - } - - //now No chance for the ia_type to conflict - ia_type = sh->buf[sh->success_children[0]].ia_type; - txn_type = afr_transaction_type_get (ia_type); - nsources = afr_build_sources (this, sh->xattr, sh->buf, - sh->pending_matrix, sh->sources, - sh->success_children, txn_type, - &subvol_status, _gf_false); - if (nsources < 0) { - gf_log (this->name, GF_LOG_INFO, "No sources for dir of %s," - " in missing entry self-heal, continuing with the rest" - " of the self-heals", local->loc.path); - if (subvol_status & SPLIT_BRAIN) { - split_brain = _gf_true; - switch (txn_type) { - case AFR_DATA_TRANSACTION: - nsources = 1; - sh->sources[sh->success_children[0]] = 1; - break; - case AFR_ENTRY_TRANSACTION: - read_child = afr_get_no_xattr_dir_read_child - (this, - sh->success_children, - sh->buf); - sh->sources[read_child] = 1; - nsources = 1; - break; - default: - op_errno = EIO; - goto out; - } - } else { - op_errno = EIO; - goto out; - } - } - - afr_get_fresh_children (sh->success_children, sh->sources, - sh->fresh_children, priv->child_count); - sh->source = sh->fresh_children[0]; - if (sh->source == -1) { - gf_log (this->name, GF_LOG_DEBUG, "No active sources found."); - op_errno = EIO; - goto out; - } - - if (sh->gfid_sh_success_cbk) - sh->gfid_sh_success_cbk (frame, this); - sh->type = sh->buf[sh->source].ia_type; - if (uuid_is_null (loc->inode->gfid)) - uuid_copy (loc->gfid, sh->buf[sh->source].ia_gfid); - if (split_brain) { - afr_sh_missing_entries_finish (frame, this); - } else { - sh_missing_entries_create (frame, this); - } - return; -out: - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - afr_sh_set_error (sh, op_errno); - afr_sh_missing_entries_finish (frame, this); - return; -} + AFR_ONLIST (locked_on, frame, afr_selfheal_lock_cbk, inodelk, + dom, &loc, F_SETLK, &flock, NULL); -static int -afr_sh_common_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, dict_t *xattr, - struct iatt *postparent) -{ - int call_count = 0; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - afr_sh_common_lookup_resp_handler (frame, cookie, this, op_ret, - op_errno, inode, buf, xattr, - postparent, &sh->lookup_loc); - call_count = afr_frame_return (frame); - - if (call_count) - goto out; - op_ret = -1; - if (!sh->success_count) { - op_errno = afr_resultant_errno_get (NULL, sh->child_errno, - priv->child_count); - gf_log (this->name, GF_LOG_ERROR, "Failed to lookup %s, " - "reason %s", sh->lookup_loc.path, - strerror (op_errno)); - goto done; - } - - if ((sh->lookup_flags & AFR_LOOKUP_FAIL_CONFLICTS) && - (afr_conflicting_iattrs (sh->buf, sh->success_children, - priv->child_count, - sh->lookup_loc.path, this->name))) { - op_errno = EIO; - gf_log (this->name, GF_LOG_ERROR, "Conflicting entries " - "for %s", sh->lookup_loc.path); - goto done; - } - - if ((sh->lookup_flags & AFR_LOOKUP_FAIL_MISSING_GFIDS) && - (afr_gfid_missing_count (this->name, sh->success_children, - sh->buf, priv->child_count, - sh->lookup_loc.path))) { - op_errno = ENODATA; - gf_log (this->name, GF_LOG_ERROR, "Missing Gfids " - "for %s", sh->lookup_loc.path); - goto done; - } - op_ret = 0; - -done: - sh->lookup_done (frame, this, op_ret, op_errno); -out: - return 0; + loc_wipe (&loc); + + return 0; } + int -afr_sh_remove_entry_cbk (call_frame_t *frame, xlator_t *this, int child, - int32_t op_ret, int32_t op_errno) +afr_selfheal_tryentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, const char *name, unsigned char *locked_on) { - int call_count = 0; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - - GF_ASSERT (sh->post_remove_call); - if ((op_ret == -1) && (op_errno != ENOENT)) { - gf_log (this->name, GF_LOG_ERROR, - "purge entry %s failed, on child %d reason, %s", - local->loc.path, child, strerror (op_errno)); - LOCK (&frame->lock); - { - afr_sh_set_error (sh, EIO); - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - } - UNLOCK (&frame->lock); - } - call_count = afr_frame_return (frame); - if (call_count == 0) - sh->post_remove_call (frame, this); - return 0; -} + loc_t loc = {0,}; -void -afr_sh_call_entry_expunge_remove (call_frame_t *frame, xlator_t *this, - int child_index, struct iatt *buf, - struct iatt *parentbuf, - afr_expunge_done_cbk_t expunge_done) -{ - call_frame_t *expunge_frame = NULL; - afr_local_t *local = NULL; - afr_local_t *expunge_local = NULL; - afr_self_heal_t *sh = NULL; - afr_self_heal_t *expunge_sh = NULL; - int32_t op_errno = 0; - int ret = 0; - - expunge_frame = copy_frame (frame); - if (!expunge_frame) { - goto out; - } - - AFR_LOCAL_ALLOC_OR_GOTO (expunge_local, out); - - local = frame->local; - sh = &local->self_heal; - expunge_frame->local = expunge_local; - expunge_sh = &expunge_local->self_heal; - expunge_sh->sh_frame = frame; - loc_copy (&expunge_local->loc, &local->loc); - ret = afr_build_parent_loc (&expunge_sh->parent_loc, - &expunge_local->loc, &op_errno); - if (ret) { - ret = -op_errno; - goto out; - } - sh->expunge_done = expunge_done; - afr_sh_entry_expunge_remove (expunge_frame, this, child_index, buf, - parentbuf); - return; -out: - gf_log (this->name, GF_LOG_ERROR, "Expunge of %s failed, reason: %s", - local->loc.path, strerror (op_errno)); - expunge_done (frame, this, child_index, -1, op_errno); -} + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); -void -afr_sh_remove_stale_lookup_info (afr_self_heal_t *sh, int32_t *success_children, - int32_t *fresh_children, - unsigned int child_count) -{ - int i = 0; - - for (i = 0; i < child_count; i++) { - if (afr_is_child_present (success_children, child_count, i) && - !afr_is_child_present (fresh_children, child_count, i)) { - sh->child_errno[i] = ENOENT; - GF_ASSERT (sh->xattr[i]); - dict_unref (sh->xattr[i]); - sh->xattr[i] = NULL; - } - } -} + AFR_ONALL (frame, afr_selfheal_lock_cbk, entrylk, dom, + &loc, name, ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL); -int -afr_sh_purge_stale_entries_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { - afr_sh_missing_entries_finish (frame, this); - } else { - if (afr_gfid_missing_count (this->name, sh->fresh_children, - sh->buf, priv->child_count, - local->loc.path)) { - afr_sh_common_lookup (frame, this, &local->loc, - afr_sh_missing_entries_lookup_done, - sh->sh_gfid_req, - AFR_LOOKUP_FAIL_CONFLICTS| - AFR_LOOKUP_FAIL_MISSING_GFIDS, - NULL); - } else { - //No need to set gfid so goto missing entries lookup done - //Behave as if you have done the lookup - afr_sh_remove_stale_lookup_info (sh, - sh->success_children, - sh->fresh_children, - priv->child_count); - afr_children_copy (sh->success_children, - sh->fresh_children, - priv->child_count); - afr_sh_missing_entries_lookup_done (frame, this, 0, 0); - } - } - return 0; + loc_wipe (&loc); + + return afr_selfheal_locked_fill (frame, this, locked_on); } -gf_boolean_t -afr_sh_purge_entry_condition (afr_local_t *local, afr_private_t *priv, - int child) + +int +afr_selfheal_entrylk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, const char *name, unsigned char *locked_on) { - afr_self_heal_t *sh = NULL; + loc_t loc = {0,}; + afr_local_t *local = NULL; + int i = 0; + afr_private_t *priv = NULL; - sh = &local->self_heal; + priv = this->private; + local = frame->local; - if (local->child_up[child] && - (!afr_is_child_present (sh->fresh_parent_dirs, priv->child_count, - child)) - && (sh->child_errno[child] != ENOENT)) - return _gf_true; + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); - return _gf_false; -} + AFR_ONALL (frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, + name, ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL); -gf_boolean_t -afr_sh_purge_stale_entry_condition (afr_local_t *local, afr_private_t *priv, - int child) -{ - afr_self_heal_t *sh = NULL; + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].op_ret == -1 && + local->replies[i].op_errno == EAGAIN) { + afr_selfheal_locked_fill (frame, this, locked_on); + afr_selfheal_unentrylk (frame, this, inode, dom, name, + locked_on); - sh = &local->self_heal; + AFR_SEQ (frame, afr_selfheal_lock_cbk, entrylk, dom, + &loc, name, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); + break; + } + } - if (local->child_up[child] && - (!afr_is_child_present (sh->fresh_children, priv->child_count, - child)) - && (sh->child_errno[child] != ENOENT)) - return _gf_true; + loc_wipe (&loc); - return _gf_false; + return afr_selfheal_locked_fill (frame, this, locked_on); } -void -afr_sh_purge_entry_common (call_frame_t *frame, xlator_t *this, - gf_boolean_t purge_condition (afr_local_t *local, - afr_private_t *priv, - int child)) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_self_heal_t *sh = NULL; - int i = 0; - int call_count = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - for (i = 0; i < priv->child_count; i++) { - if (purge_condition (local, priv, i)) - call_count++; - } - - if (call_count == 0) { - sh->post_remove_call (frame, this); - goto out; - } - - local->call_count = call_count; - for (i = 0; i < priv->child_count; i++) { - if (!purge_condition (local, priv, i)) - continue; - gf_log (this->name, GF_LOG_INFO, "purging the stale entry %s " - "on %s", local->loc.path, priv->children[i]->name); - afr_sh_call_entry_expunge_remove (frame, this, - (long) i, &sh->buf[i], - &sh->parentbufs[i], - afr_sh_remove_entry_cbk); - } -out: - return; -} -void -afr_sh_purge_entry (call_frame_t *frame, xlator_t *this) +int +afr_selfheal_unentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, const char *name, unsigned char *locked_on) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; + loc_t loc = {0,}; + + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); + + AFR_ONLIST (locked_on, frame, afr_selfheal_lock_cbk, entrylk, + dom, &loc, name, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL); - local = frame->local; - sh = &local->self_heal; - sh->post_remove_call = afr_sh_missing_entries_finish; + loc_wipe (&loc); - afr_sh_purge_entry_common (frame, this, afr_sh_purge_entry_condition); + return 0; } -void -afr_sh_purge_stale_entry (call_frame_t *frame, xlator_t *this) + +gf_boolean_t +afr_is_pending_set (xlator_t *this, dict_t *xdata, int type) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int i = 0; + int idx = -1; + afr_private_t *priv = NULL; + void *pending_raw = NULL; + int *pending_int = NULL; + int i = 0; - local = frame->local; - sh = &local->self_heal; - priv = this->private; + priv = this->private; + idx = afr_index_for_transaction_type (type); - sh->post_remove_call = afr_sh_purge_stale_entries_done; + if (dict_get_ptr (xdata, AFR_DIRTY, &pending_raw) == 0) { + if (pending_raw) { + pending_int = pending_raw; - for (i = 0; i < priv->child_count; i++) { - if (afr_is_child_present (sh->fresh_children, - priv->child_count, i)) - continue; + if (ntoh32 (pending_int[idx])) + return _gf_true; + } + } - if ((!local->child_up[i]) || sh->child_errno[i] != 0) - continue; + for (i = 0; i < priv->child_count; i++) { + if (dict_get_ptr (xdata, priv->pending_key[i], + &pending_raw)) + continue; + if (!pending_raw) + continue; + pending_int = pending_raw; - GF_ASSERT (!uuid_is_null (sh->entrybuf.ia_gfid) || - uuid_is_null (sh->buf[i].ia_gfid)); + if (ntoh32 (pending_int[idx])) + return _gf_true; + } - if ((sh->entrybuf.ia_type != sh->buf[i].ia_type) || - (uuid_compare (sh->buf[i].ia_gfid, - sh->entrybuf.ia_gfid))) - continue; + return _gf_false; +} - afr_children_add_child (sh->fresh_children, i, - priv->child_count); - } - afr_sh_purge_entry_common (frame, this, - afr_sh_purge_stale_entry_condition); +gf_boolean_t +afr_is_data_set (xlator_t *this, dict_t *xdata) +{ + return afr_is_pending_set (this, xdata, AFR_DATA_TRANSACTION); } -void -afr_sh_save_child_iatts_from_policy (int32_t *children, struct iatt *bufs, - struct iatt *save, - unsigned int child_count) +gf_boolean_t +afr_is_metadata_set (xlator_t *this, dict_t *xdata) { - int i = 0; - int child = 0; - gf_boolean_t saved = _gf_false; - - GF_ASSERT (save); - //if iatt buf with gfid exists sets it - for (i = 0; i < child_count; i++) { - child = children[i]; - if (child == -1) - break; - *save = bufs[child]; - saved = _gf_true; - if (!uuid_is_null (save->ia_gfid)) - break; - } - GF_ASSERT (saved); + return afr_is_pending_set (this, xdata, AFR_METADATA_TRANSACTION); } -void -afr_get_children_of_fresh_parent_dirs (afr_self_heal_t *sh, - unsigned int child_count) +gf_boolean_t +afr_is_entry_set (xlator_t *this, dict_t *xdata) { - afr_children_intersection_get (sh->success_children, - sh->fresh_parent_dirs, - sh->sources, child_count); - afr_get_fresh_children (sh->success_children, sh->sources, - sh->fresh_children, child_count); - memset (sh->sources, 0, sizeof (*sh->sources) * child_count); + return afr_is_pending_set (this, xdata, AFR_ENTRY_TRANSACTION); } + void -afr_sh_children_lookup_done (call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno) +afr_inode_link (inode_t *inode, struct iatt *iatt) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int32_t fresh_child_enoents = 0; - int32_t fresh_parent_count = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - if (op_ret < 0) - goto fail; - afr_get_children_of_fresh_parent_dirs (sh, priv->child_count); - fresh_parent_count = afr_get_children_count (sh->fresh_parent_dirs, - priv->child_count); - //we need the enoent count of the subvols present in fresh_parent_dirs - fresh_child_enoents = afr_errno_count (sh->fresh_parent_dirs, - sh->child_errno, - priv->child_count, ENOENT); - if (fresh_child_enoents == fresh_parent_count) { - afr_sh_set_error (sh, ENOENT); - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - afr_sh_purge_entry (frame, this); - } else if (!afr_conflicting_iattrs (sh->buf, sh->fresh_children, - priv->child_count, local->loc.path, - this->name)) { - afr_sh_save_child_iatts_from_policy (sh->fresh_children, - sh->buf, &sh->entrybuf, - priv->child_count); - afr_update_gfid_from_iatts (sh->sh_gfid_req, sh->buf, - sh->fresh_children, - priv->child_count); - afr_sh_purge_stale_entry (frame, this); - } else { - op_errno = EIO; - afr_set_local_for_unhealable (local); - goto fail; - } - - return; - -fail: - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - afr_sh_set_error (sh, op_errno); - afr_sh_missing_entries_finish (frame, this); - return; -} + inode_t *linked_inode = NULL; -static void -afr_sh_find_fresh_parents (call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int enoent_count = 0; - int nsources = 0; - int source = -1; - int32_t subvol_status = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - if (op_ret < 0) - goto out; - enoent_count = afr_errno_count (NULL, sh->child_errno, - priv->child_count, ENOENT); - if (enoent_count > 0) { - gf_log (this->name, GF_LOG_INFO, "Parent dir missing for %s," - " in missing entry self-heal, aborting missing-entry " - "self-heal", - local->loc.path); - afr_sh_missing_entries_finish (frame, this); - return; - } - - nsources = afr_build_sources (this, sh->xattr, sh->buf, - sh->pending_matrix, sh->sources, - sh->success_children, - AFR_ENTRY_TRANSACTION, &subvol_status, - _gf_true); - if ((subvol_status & ALL_FOOLS) || - (subvol_status & SPLIT_BRAIN)) { - gf_log (this->name, GF_LOG_INFO, "%s: Performing conservative " - "merge", sh->parent_loc.path); - afr_mark_success_children_sources (sh->sources, - sh->success_children, - priv->child_count); - } else if (nsources < 0) { - gf_log (this->name, GF_LOG_ERROR, "No sources for dir " - "of %s, in missing entry self-heal, aborting " - "self-heal", local->loc.path); - op_errno = EIO; - goto out; - } - - source = afr_sh_select_source (sh->sources, priv->child_count); - if (source == -1) { - GF_ASSERT (0); - gf_log (this->name, GF_LOG_DEBUG, "No active sources found."); - op_errno = EIO; - goto out; - } - afr_get_fresh_children (sh->success_children, sh->sources, - sh->fresh_parent_dirs, priv->child_count); - afr_sh_common_lookup (frame, this, &local->loc, - afr_sh_children_lookup_done, NULL, 0, - NULL); - return; + linked_inode = inode_link (inode, NULL, NULL, iatt); -out: - afr_sh_set_error (sh, op_errno); - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - afr_sh_missing_entries_finish (frame, this); - return; -} + uuid_copy (inode->gfid, iatt->ia_gfid); + inode->ia_type = iatt->ia_type; -void -afr_sh_common_reset (afr_self_heal_t *sh, unsigned int child_count) -{ - int i = 0; - - for (i = 0; i < child_count; i++) { - memset (&sh->buf[i], 0, sizeof (sh->buf[i])); - memset (&sh->parentbufs[i], 0, sizeof (sh->parentbufs[i])); - sh->child_errno[i] = 0; - } - memset (&sh->parentbuf, 0, sizeof (sh->parentbuf)); - sh->success_count = 0; - afr_reset_children (sh->success_children, child_count); - afr_reset_children (sh->fresh_children, child_count); - afr_reset_xattr (sh->xattr, child_count); - loc_wipe (&sh->lookup_loc); + if (linked_inode) { + inode_lookup (linked_inode); + inode_unref (linked_inode); + } } -/* afr self-heal state will be lost if this call is made - * please check the afr_sh_common_reset that is called in this function + +/* + * This function inspects the looked up replies (in an unlocked manner) + * and decides whether a locked verification and possible healing is + * required or not. It updates the three booleans for each type + * of healing. If the boolean flag gets set to FALSE, then we are sure + * no healing is required. If the boolean flag gets set to TRUE then + * we have to proceed with locked reinspection. */ + int -afr_sh_common_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, - afr_lookup_done_cbk_t lookup_done , uuid_t gfid, - int32_t flags, dict_t *xdata) +afr_selfheal_unlocked_inspect (call_frame_t *frame, xlator_t *this, + inode_t *inode, uuid_t gfid, + gf_boolean_t *data_selfheal, + gf_boolean_t *metadata_selfheal, + gf_boolean_t *entry_selfheal) { - afr_local_t *local = NULL; - int i = 0; - int call_count = 0; - afr_private_t *priv = NULL; - dict_t *xattr_req = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - priv = this->private; - sh = &local->self_heal; - - call_count = afr_up_children_count (local->child_up, priv->child_count); - - local->call_count = call_count; - - xattr_req = dict_new(); - - if (xattr_req) { - afr_xattr_req_prepare (this, xattr_req, loc->path); - if (gfid) { - gf_log (this->name, GF_LOG_DEBUG, - "looking up %s with gfid: %s", - loc->path, uuid_utoa (gfid)); - GF_ASSERT (!uuid_is_null (gfid)); - afr_set_dict_gfid (xattr_req, gfid); - } - } - - afr_sh_common_reset (sh, priv->child_count); - sh->lookup_done = lookup_done; - loc_copy (&sh->lookup_loc, loc); - sh->lookup_flags = flags; - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - gf_log (this->name, GF_LOG_DEBUG, - "looking up %s on subvolume %s", - loc->path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, - afr_sh_common_lookup_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->lookup, - loc, xattr_req); - - if (!--call_count) - break; - } - } - - if (xattr_req) - dict_unref (xattr_req); - - return 0; -} + afr_private_t *priv = NULL; + int i = 0; + int valid_cnt = 0; + struct iatt first = {0, }; + struct afr_reply *replies = NULL; + int ret = -1; + priv = this->private; + replies = alloca0 (sizeof (*replies) * priv->child_count); -int -afr_sh_post_nb_entrylk_missing_entry_sh_cbk (call_frame_t *frame, - xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - sh = &local->self_heal; - - if (int_lock->lock_op_ret < 0) { - gf_log (this->name, GF_LOG_INFO, - "Non blocking entrylks failed."); - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - afr_sh_missing_entries_done (frame, this); - } else { - - gf_log (this->name, GF_LOG_DEBUG, - "Non blocking entrylks done. Proceeding to FOP"); - afr_sh_common_lookup (frame, this, &sh->parent_loc, - afr_sh_find_fresh_parents, - NULL, AFR_LOOKUP_FAIL_CONFLICTS, - NULL); - } - - return 0; -} + ret = afr_selfheal_unlocked_discover (frame, inode, gfid, replies); + if (ret) + return ret; -int -afr_sh_entrylk (call_frame_t *frame, xlator_t *this, loc_t *loc, - char *base_name, afr_lock_cbk_t lock_cbk) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + if (replies[i].op_ret == -1) + continue; - priv = this->private; - local = frame->local; - int_lock = &local->internal_lock; + if (afr_is_data_set (this, replies[i].xdata)) + *data_selfheal = _gf_true; - int_lock->transaction_lk_type = AFR_SELFHEAL_LK; - int_lock->selfheal_lk_type = AFR_ENTRY_SELF_HEAL_LK; + if (afr_is_metadata_set (this, replies[i].xdata)) + *metadata_selfheal = _gf_true; - afr_set_lock_number (frame, this); + if (afr_is_entry_set (this, replies[i].xdata)) + *entry_selfheal = _gf_true; - int_lock->lk_basename = base_name; - int_lock->lk_loc = loc; - int_lock->lock_cbk = lock_cbk; - int_lock->domain = this->name; + valid_cnt ++; + if (valid_cnt == 1) { + first = replies[i].poststat; + continue; + } - int_lock->lockee_count = 0; - afr_init_entry_lockee (&int_lock->lockee[0], local, loc, - base_name, priv->child_count); - int_lock->lockee_count++; - afr_nonblocking_entrylk (frame, this); + if (!IA_EQUAL (first, replies[i].poststat, type)) { + gf_log (this->name, GF_LOG_ERROR, + "TYPE mismatch %d vs %d on %s for gfid:%s", + (int) first.ia_type, + (int) replies[i].poststat.ia_type, + priv->children[i]->name, + uuid_utoa (replies[i].poststat.ia_gfid)); + return -EIO; + } - return 0; -} + if (!IA_EQUAL (first, replies[i].poststat, uid)) { + gf_log (this->name, GF_LOG_DEBUG, + "UID mismatch %d vs %d on %s for gfid:%s", + (int) first.ia_uid, + (int) replies[i].poststat.ia_uid, + priv->children[i]->name, + uuid_utoa (replies[i].poststat.ia_gfid)); -static int -afr_self_heal_parent_entrylk (call_frame_t *frame, xlator_t *this, - afr_lock_cbk_t lock_cbk) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_internal_lock_t *int_lock = NULL; - int ret = -1; - int32_t op_errno = 0; - - local = frame->local; - sh = &local->self_heal; - - gf_log (this->name, GF_LOG_TRACE, - "attempting to recreate missing entries for path=%s", - local->loc.path); - - ret = afr_build_parent_loc (&sh->parent_loc, &local->loc, &op_errno); - if (ret) - goto out; - - afr_sh_entrylk (frame, this, &sh->parent_loc, NULL, - lock_cbk); - return 0; -out: - int_lock = &local->internal_lock; - int_lock->lock_op_ret = -1; - lock_cbk (frame, this); - return 0; -} + *metadata_selfheal = _gf_true; + } -static int -afr_self_heal_missing_entries (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; + if (!IA_EQUAL (first, replies[i].poststat, gid)) { + gf_log (this->name, GF_LOG_DEBUG, + "GID mismatch %d vs %d on %s for gfid:%s", + (int) first.ia_uid, + (int) replies[i].poststat.ia_uid, + priv->children[i]->name, + uuid_utoa (replies[i].poststat.ia_gfid)); - local = frame->local; - sh = &local->self_heal; + *metadata_selfheal = _gf_true; + } - sh->sh_type_in_action = AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY; + if (!IA_EQUAL (first, replies[i].poststat, prot)) { + gf_log (this->name, GF_LOG_DEBUG, + "MODE mismatch %d vs %d on %s for gfid:%s", + (int) st_mode_from_ia (first.ia_prot, 0), + (int) st_mode_from_ia (replies[i].poststat.ia_prot, 0), + priv->children[i]->name, + uuid_utoa (replies[i].poststat.ia_gfid)); - afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED); + *metadata_selfheal = _gf_true; + } - afr_self_heal_parent_entrylk (frame, this, - afr_sh_post_nb_entrylk_missing_entry_sh_cbk); - return 0; -} + if (IA_ISREG(first.ia_type) && + !IA_EQUAL (first, replies[i].poststat, size)) { + gf_log (this->name, GF_LOG_DEBUG, + "SIZE mismatch %lld vs %lld on %s for gfid:%s", + (long long) first.ia_size, + (long long) replies[i].poststat.ia_size, + priv->children[i]->name, + uuid_utoa (replies[i].poststat.ia_gfid)); -afr_local_t* -afr_self_heal_local_init (afr_local_t *l, xlator_t *this) -{ - afr_private_t *priv = NULL; - afr_local_t *lc = NULL; - afr_self_heal_t *sh = NULL; - afr_self_heal_t *shc = NULL; - int ret = 0; - - priv = this->private; - - sh = &l->self_heal; - - lc = mem_get0 (this->local_pool); - if (!lc) - goto out; - - shc = &lc->self_heal; - - shc->unwind = sh->unwind; - shc->gfid_sh_success_cbk = sh->gfid_sh_success_cbk; - shc->do_missing_entry_self_heal = sh->do_missing_entry_self_heal; - shc->do_gfid_self_heal = sh->do_gfid_self_heal; - shc->do_data_self_heal = sh->do_data_self_heal; - shc->do_metadata_self_heal = sh->do_metadata_self_heal; - shc->do_entry_self_heal = sh->do_entry_self_heal; - shc->force_confirm_spb = sh->force_confirm_spb; - shc->forced_merge = sh->forced_merge; - shc->background = sh->background; - shc->type = sh->type; - shc->data_sh_info = ""; - shc->metadata_sh_info = ""; - - uuid_copy (shc->sh_gfid_req, sh->sh_gfid_req); - if (l->loc.path) { - ret = loc_copy (&lc->loc, &l->loc); - if (ret < 0) - goto out; - } - - lc->child_up = memdup (l->child_up, - sizeof (*lc->child_up) * priv->child_count); - if (!lc->child_up) { - ret = -1; - goto out; - } - - if (l->xattr_req) - lc->xattr_req = dict_ref (l->xattr_req); - - if (l->cont.lookup.inode) - lc->cont.lookup.inode = inode_ref (l->cont.lookup.inode); - if (l->cont.lookup.xattr) - lc->cont.lookup.xattr = dict_ref (l->cont.lookup.xattr); - - lc->internal_lock.locked_nodes = - GF_CALLOC (sizeof (*l->internal_lock.locked_nodes), - priv->child_count, gf_afr_mt_char); - if (!lc->internal_lock.locked_nodes) { - ret = -1; - goto out; - } - - ret = afr_inodelk_init (&lc->internal_lock.inodelk[0], - this->name, priv->child_count); - if (ret) - goto out; + *data_selfheal = _gf_true; + } + } -out: - if (ret) { - afr_local_cleanup (lc, this); - lc = NULL; - } - return lc; -} + if (valid_cnt > 0) + afr_inode_link (inode, &first); -int -afr_self_heal_completion_cbk (call_frame_t *bgsh_frame, xlator_t *this) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_self_heal_t * sh = NULL; - afr_local_t * orig_frame_local = NULL; - afr_self_heal_t * orig_frame_sh = NULL; - char sh_type_str[256] = {0,}; - gf_loglevel_t loglevel = 0; - - priv = this->private; - local = bgsh_frame->local; - sh = &local->self_heal; - - if (local->unhealable) { - afr_set_split_brain (this, sh->inode, SPB, SPB); - } - - afr_self_heal_type_str_get (sh, sh_type_str, - sizeof(sh_type_str)); - if (is_self_heal_failed (sh, AFR_CHECK_ALL) && !priv->shd.iamshd) { - loglevel = GF_LOG_ERROR; - } else if (!is_self_heal_failed (sh, AFR_CHECK_ALL)) { - loglevel = GF_LOG_INFO; - } else { - loglevel = GF_LOG_DEBUG; - } - - afr_log_self_heal_completion_status (local, loglevel); - - FRAME_SU_UNDO (bgsh_frame, afr_local_t); - - if (!sh->unwound && sh->unwind) { - orig_frame_local = sh->orig_frame->local; - orig_frame_sh = &orig_frame_local->self_heal; - orig_frame_sh->actual_sh_started = _gf_true; - sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno, - is_self_heal_failed (sh, AFR_CHECK_ALL)); - } - - if (sh->background) { - LOCK (&priv->lock); - { - priv->background_self_heals_started--; - } - UNLOCK (&priv->lock); - } - - AFR_STACK_DESTROY (bgsh_frame); - - return 0; -} - -int -afr_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int32_t op_errno = 0; - int ret = 0; - afr_self_heal_t *orig_sh = NULL; - call_frame_t *sh_frame = NULL; - afr_local_t *sh_local = NULL; - loc_t *loc = NULL; - - local = frame->local; - orig_sh = &local->self_heal; - priv = this->private; - - GF_ASSERT (local->loc.path); - - gf_log (this->name, GF_LOG_TRACE, - "performing self heal on %s (metadata=%d data=%d entry=%d)", - local->loc.path, - local->self_heal.do_metadata_self_heal, - local->self_heal.do_data_self_heal, - local->self_heal.do_entry_self_heal); - - op_errno = ENOMEM; - sh_frame = copy_frame (frame); - if (!sh_frame) - goto out; - afr_set_lk_owner (sh_frame, this, sh_frame->root); - afr_set_low_priority (sh_frame); - - sh_local = afr_self_heal_local_init (local, this); - if (!sh_local) - goto out; - sh_frame->local = sh_local; - sh = &sh_local->self_heal; - - sh->inode = inode_ref (inode); - sh->orig_frame = frame; - - sh->completion_cbk = afr_self_heal_completion_cbk; - - sh->success = GF_CALLOC (priv->child_count, sizeof (*sh->success), - gf_afr_mt_char); - if (!sh->success) - goto out; - sh->sources = GF_CALLOC (sizeof (*sh->sources), priv->child_count, - gf_afr_mt_int); - if (!sh->sources) - goto out; - sh->locked_nodes = GF_CALLOC (sizeof (*sh->locked_nodes), - priv->child_count, - gf_afr_mt_int); - if (!sh->locked_nodes) - goto out; - - sh->pending_matrix = afr_matrix_create (priv->child_count, - priv->child_count); - if (!sh->pending_matrix) - goto out; - - sh->delta_matrix = afr_matrix_create (priv->child_count, - priv->child_count); - if (!sh->delta_matrix) - goto out; - - sh->fresh_parent_dirs = afr_children_create (priv->child_count); - if (!sh->fresh_parent_dirs) - goto out; - ret = afr_sh_common_create (sh, priv->child_count); - if (ret) { - op_errno = -ret; - goto out; - } - - if (local->self_heal.background) { - LOCK (&priv->lock); - { - if (priv->background_self_heals_started - < priv->background_self_heal_count) { - priv->background_self_heals_started++; - - - } else { - local->self_heal.background = _gf_false; - sh->background = _gf_false; - } - } - UNLOCK (&priv->lock); - } - - if (!local->loc.parent) { - sh->do_missing_entry_self_heal = _gf_false; - sh->do_gfid_self_heal = _gf_false; - } - - sh->sh_type_in_action = AFR_SELF_HEAL_INVALID; - - FRAME_SU_DO (sh_frame, afr_local_t); - if (sh->do_missing_entry_self_heal || sh->do_gfid_self_heal) { - afr_self_heal_missing_entries (sh_frame, this); - } else { - loc = &sh_local->loc; - if (uuid_is_null (loc->inode->gfid) && uuid_is_null (loc->gfid)) { - if (!uuid_is_null (inode->gfid)) - GF_ASSERT (!uuid_compare (inode->gfid, - sh->sh_gfid_req)); - uuid_copy (loc->gfid, sh->sh_gfid_req); - } - gf_log (this->name, GF_LOG_TRACE, - "proceeding to metadata check on %s", - local->loc.path); - - afr_sh_missing_entries_done (sh_frame, this); - } - op_errno = 0; + if (valid_cnt < 2) + return -ENOTCONN; -out: - if (op_errno) { - orig_sh->unwind (frame, this, -1, op_errno, 1); - if (sh_frame) - AFR_STACK_DESTROY (sh_frame); - } - return 0; + return 0; } -void -afr_self_heal_type_str_get (afr_self_heal_t *self_heal_p, char *str, - size_t size) + +inode_t * +afr_inode_find (xlator_t *this, uuid_t gfid) { - GF_ASSERT (str && (size > strlen (" missing-entry gfid " - "meta-data data entry"))); + inode_table_t *table = NULL; + inode_t *inode = NULL; - if (self_heal_p->do_metadata_self_heal) { - snprintf (str, size, " meta-data"); - } + table = this->itable; + if (!table) + return NULL; - if (self_heal_p->do_data_self_heal) { - snprintf (str + strlen(str), size - strlen(str), " data"); - } + inode = inode_find (table, gfid); + if (inode) + return inode; - if (self_heal_p->do_entry_self_heal) { - snprintf (str + strlen(str), size - strlen(str), " entry"); - } + inode = inode_new (table); + if (!inode) + return NULL; - if (self_heal_p->do_missing_entry_self_heal) { - snprintf (str + strlen(str), size - strlen(str), - " missing-entry"); - } + uuid_copy (inode->gfid, gfid); - if (self_heal_p->do_gfid_self_heal) { - snprintf (str + strlen(str), size - strlen(str), " gfid"); - } + return inode; } -afr_self_heal_type -afr_self_heal_type_for_transaction (afr_transaction_type type) -{ - afr_self_heal_type sh_type = AFR_SELF_HEAL_INVALID; - - switch (type) { - case AFR_DATA_TRANSACTION: - sh_type = AFR_SELF_HEAL_DATA; - break; - case AFR_METADATA_TRANSACTION: - sh_type = AFR_SELF_HEAL_METADATA; - break; - case AFR_ENTRY_TRANSACTION: - sh_type = AFR_SELF_HEAL_ENTRY; - break; - case AFR_ENTRY_RENAME_TRANSACTION: - GF_ASSERT (0); - break; - } - return sh_type; -} -int -afr_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name) +call_frame_t * +afr_frame_create (xlator_t *this) { - int ret = -1; - uuid_t pargfid = {0}; - - if (!child) - goto out; - - if (!uuid_is_null (parent->inode->gfid)) - uuid_copy (pargfid, parent->inode->gfid); - else if (!uuid_is_null (parent->gfid)) - uuid_copy (pargfid, parent->gfid); - - if (uuid_is_null (pargfid)) - goto out; - - if (strcmp (parent->path, "/") == 0) - ret = gf_asprintf ((char **)&child->path, "/%s", name); - else - ret = gf_asprintf ((char **)&child->path, "%s/%s", parent->path, - name); + call_frame_t *frame = NULL; + afr_local_t *local = NULL; + int op_errno = 0; + pid_t pid = -1; - if (-1 == ret) { - gf_log (this->name, GF_LOG_ERROR, - "asprintf failed while setting child path"); - } + frame = create_frame (this, this->ctx->pool); + if (!frame) + return NULL; - child->name = strrchr (child->path, '/'); - if (child->name) - child->name++; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) { + STACK_DESTROY (frame->root); + return NULL; + } - child->parent = inode_ref (parent->inode); - child->inode = inode_new (parent->inode->table); - uuid_copy (child->pargfid, pargfid); + syncopctx_setfspid (&pid); - if (!child->inode) { - ret = -1; - goto out; - } + frame->root->pid = pid; - ret = 0; -out: - if ((ret == -1) && child) - loc_wipe (child); + afr_set_lk_owner (frame, this, frame->root); - return ret; + return frame; } -int -afr_sh_erase_pending (call_frame_t *frame, xlator_t *this, - afr_transaction_type type, afr_fxattrop_cbk_t cbk, - int (*finish)(call_frame_t *frame, xlator_t *this)) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int i = 0; - dict_t **erase_xattr = NULL; - int ret = -1; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - afr_sh_pending_to_delta (priv, sh->xattr, sh->delta_matrix, - sh->success, priv->child_count, type); - - erase_xattr = GF_CALLOC (sizeof (*erase_xattr), priv->child_count, - gf_afr_mt_dict_t); - if (!erase_xattr) - goto out; - - for (i = 0; i < priv->child_count; i++) { - if (sh->xattr[i]) { - call_count++; - erase_xattr[i] = dict_new (); - if (!erase_xattr[i]) - goto out; - } - } - - afr_sh_delta_to_xattr (this, sh->delta_matrix, erase_xattr, - priv->child_count, type); - - gf_log (this->name, GF_LOG_DEBUG, "Delta matrix for: %s", - lkowner_utoa (&frame->root->lk_owner)); - afr_sh_print_pending_matrix (sh->delta_matrix, this); - local->call_count = call_count; - if (call_count == 0) { - ret = 0; - finish (frame, this); - goto out; - } - - for (i = 0; i < priv->child_count; i++) { - if (!erase_xattr[i]) - continue; - - if (sh->healing_fd) {//true for ENTRY, reg file DATA transaction - STACK_WIND_COOKIE (frame, cbk, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fxattrop, - sh->healing_fd, - GF_XATTROP_ADD_ARRAY, erase_xattr[i], - NULL); - } else { - STACK_WIND_COOKIE (frame, cbk, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->loc, - GF_XATTROP_ADD_ARRAY, erase_xattr[i], - NULL); - } - } - - ret = 0; -out: - if (erase_xattr) { - for (i = 0; i < priv->child_count; i++) { - if (erase_xattr[i]) { - dict_unref (erase_xattr[i]); - } - } - } - - GF_FREE (erase_xattr); - - if (ret < 0) { - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - finish (frame, this); - } - - return 0; -} -void -afr_set_self_heal_status(afr_self_heal_t *sh, afr_self_heal_status status) -{ - xlator_t *this = NULL; - afr_sh_status_for_all_type *sh_status = &(sh->afr_all_sh_status); - afr_self_heal_type sh_type_in_action = sh->sh_type_in_action; - this = THIS; - - if (!sh) { - gf_log_callingfn (this->name, GF_LOG_ERROR, "Null self heal" - "Structure"); - goto out; - } - - switch (sh_type_in_action) { - case AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY: - sh_status->gfid_or_missing_entry_self_heal = status; - break; - case AFR_SELF_HEAL_METADATA: - sh_status->metadata_self_heal = status; - break; - case AFR_SELF_HEAL_DATA: - sh_status->data_self_heal = status; - break; - case AFR_SELF_HEAL_ENTRY: - sh_status->entry_self_heal = status; - break; - case AFR_SELF_HEAL_INVALID: - gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid" - "self heal type in action"); - break; - } -out: - return; -} +/* + * This is the entry point for healing a given GFID + */ -void -afr_set_local_for_unhealable (afr_local_t *local) +int +afr_selfheal (xlator_t *this, uuid_t gfid) { - afr_self_heal_t *sh = NULL; - - sh = &local->self_heal; + inode_t *inode = NULL; + call_frame_t *frame = NULL; + int ret = -1; + gf_boolean_t data_selfheal = _gf_false; + gf_boolean_t metadata_selfheal = _gf_false; + gf_boolean_t entry_selfheal = _gf_false; - local->unhealable = 1; - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); -} + inode = afr_inode_find (this, gfid); + if (!inode) + goto out; -int -is_self_heal_failed (afr_self_heal_t *sh, afr_sh_fail_check_type type) -{ - afr_sh_status_for_all_type sh_status = sh->afr_all_sh_status; - afr_self_heal_type sh_type_in_action = AFR_SELF_HEAL_INVALID; - afr_self_heal_status status = AFR_SELF_HEAL_FAILED; - xlator_t *this = NULL; - int sh_failed = 0; - - this = THIS; - - if (!sh) { - gf_log_callingfn (this->name, GF_LOG_ERROR, "Null self heal " - "structure"); - sh_failed = 1; - goto out; - } - - if (type == AFR_CHECK_ALL) { - if ((sh_status.gfid_or_missing_entry_self_heal == AFR_SELF_HEAL_FAILED) - || (sh_status.metadata_self_heal == AFR_SELF_HEAL_FAILED) - || (sh_status.data_self_heal == AFR_SELF_HEAL_FAILED) - || (sh_status.entry_self_heal == AFR_SELF_HEAL_FAILED)) - sh_failed = 1; - } else if (type == AFR_CHECK_SPECIFIC) { - sh_type_in_action = sh->sh_type_in_action; - switch (sh_type_in_action) { - case AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY: - status = sh_status.gfid_or_missing_entry_self_heal; - break; - case AFR_SELF_HEAL_METADATA: - status = sh_status.metadata_self_heal; - break; - case AFR_SELF_HEAL_ENTRY: - status = sh_status.entry_self_heal; - break; - case AFR_SELF_HEAL_DATA: - status = sh_status.data_self_heal; - break; - case AFR_SELF_HEAL_INVALID: - status = AFR_SELF_HEAL_NOT_ATTEMPTED; - break; - } - if (status == AFR_SELF_HEAL_FAILED) - sh_failed = 1; - - } + frame = afr_frame_create (this); + if (!frame) + goto out; -out: - return sh_failed; -} + ret = afr_selfheal_unlocked_inspect (frame, this, inode, gfid, + &data_selfheal, + &metadata_selfheal, + &entry_selfheal); + if (ret) + goto out; -char * -get_sh_completion_status (afr_self_heal_status status) -{ + if (data_selfheal) + afr_selfheal_data (frame, this, inode); - char *not_attempted = " is not attempted"; - char *failed = " failed"; - char *started = " is started"; - char *sync_begin = " is successfully completed"; - char *result = " has unknown status"; - - switch (status) - { - case AFR_SELF_HEAL_NOT_ATTEMPTED: - result = not_attempted; - break; - case AFR_SELF_HEAL_FAILED: - result = failed; - break; - case AFR_SELF_HEAL_STARTED: - result = started; - break; - case AFR_SELF_HEAL_SYNC_BEGIN: - result = sync_begin; - break; - } - - return result; + if (metadata_selfheal) + afr_selfheal_metadata (frame, this, inode); -} + if (entry_selfheal) + afr_selfheal_entry (frame, this, inode); -void -afr_log_self_heal_completion_status (afr_local_t *local, gf_loglevel_t loglvl) -{ + inode_forget (inode, 1); +out: + if (inode) + inode_unref (inode); + if (frame) + AFR_STACK_DESTROY (frame); - char sh_log[4096] = {0}; - afr_self_heal_t *sh = &local->self_heal; - afr_sh_status_for_all_type all_status = sh->afr_all_sh_status; - xlator_t *this = NULL; - size_t off = 0; - int data_sh = 0; - int metadata_sh = 0; - int print_log = 0; - - this = THIS; - - ADD_FMT_STRING (sh_log, off, "gfid or missing entry", - all_status.gfid_or_missing_entry_self_heal, print_log); - ADD_FMT_STRING_SYNC (sh_log, off, "metadata", - all_status.metadata_self_heal, print_log); - if (sh->background) { - ADD_FMT_STRING_SYNC (sh_log, off, "backgroung data", - all_status.data_self_heal, print_log); - } else { - ADD_FMT_STRING_SYNC (sh_log, off, "foreground data", - all_status.data_self_heal, print_log); - } - ADD_FMT_STRING_SYNC (sh_log, off, "entry", all_status.entry_self_heal, - print_log); - - if (AFR_SELF_HEAL_SYNC_BEGIN == all_status.data_self_heal && - strcmp (sh->data_sh_info, "") && sh->data_sh_info ) - data_sh = 1; - if (AFR_SELF_HEAL_SYNC_BEGIN == all_status.metadata_self_heal && - strcmp (sh->metadata_sh_info, "") && sh->metadata_sh_info) - metadata_sh = 1; - - if (!print_log) - return; - - gf_log (this->name, loglvl, "%s %s %s on %s", sh_log, - ((data_sh == 1) ? sh->data_sh_info : ""), - ((metadata_sh == 1) ? sh->metadata_sh_info : ""), - local->loc.path); + return ret; } diff --git a/xlators/cluster/afr/src/afr-self-heal-common.h b/xlators/cluster/afr/src/afr-self-heal-common.h deleted file mode 100644 index 4732647760d..00000000000 --- a/xlators/cluster/afr/src/afr-self-heal-common.h +++ /dev/null @@ -1,144 +0,0 @@ -/* - Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> - This file is part of GlusterFS. - - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. -*/ - -#ifndef __AFR_SELF_HEAL_COMMON_H__ -#define __AFR_SELF_HEAL_COMMON_H__ - -#define FILE_HAS_HOLES(buf) (((buf)->ia_size) > ((buf)->ia_blocks * 512)) -#define AFR_SH_MIN_PARTICIPANTS 2 - -typedef enum { - AFR_LOOKUP_FAIL_CONFLICTS = 1, - AFR_LOOKUP_FAIL_MISSING_GFIDS = 2, -} afr_lookup_flags_t; - -int -afr_sh_select_source (int sources[], int child_count); - -int -afr_sh_source_count (int sources[], int child_count); - -void -afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this); - -void -afr_sh_print_split_brain_log (int32_t *pending_matrix[], xlator_t *this, - const char *loc); - -int -afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix, - unsigned char *ignorant_subvols, - dict_t *xattr[], afr_transaction_type type, - size_t child_count); - -void -afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr, - int32_t *delta_matrix[], unsigned char success[], - int child_count, afr_transaction_type type); - -int -afr_mark_sources (xlator_t *this, int32_t *sources, int32_t **pending_matrix, - struct iatt *bufs, afr_self_heal_type type, - int32_t *success_children, int32_t *subvol_status); - -int -afr_sh_delta_to_xattr (xlator_t *this, - int32_t *delta_matrix[], dict_t *xattr[], - int child_count, afr_transaction_type type); - -void -afr_self_heal_type_str_get (afr_self_heal_t *self_heal_p, char *str, - size_t size); - -afr_self_heal_type -afr_self_heal_type_for_transaction (afr_transaction_type type); - -int -afr_build_sources (xlator_t *this, dict_t **xattr, struct iatt *bufs, - int32_t **pending_matrix, int32_t *sources, - int32_t *success_children, afr_transaction_type type, - int32_t *subvol_status, gf_boolean_t ignore_ignorant); -void -afr_sh_common_reset (afr_self_heal_t *sh, unsigned int child_count); - -void -afr_sh_common_lookup_resp_handler (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, - dict_t *xattr, struct iatt *postparent, - loc_t *loc); - -int -afr_sh_common_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, - afr_lookup_done_cbk_t lookup_cbk, uuid_t uuid, - int32_t flags, dict_t *xdata); -int -afr_sh_entry_expunge_remove (call_frame_t *expunge_frame, xlator_t *this, - int active_src, struct iatt *buf, - struct iatt *parentbuf); -int -afr_sh_entrylk (call_frame_t *frame, xlator_t *this, loc_t *loc, - char *base_name, afr_lock_cbk_t lock_cbk); -int -afr_sh_entry_impunge_create (call_frame_t *impunge_frame, xlator_t *this, - int child_index); -int -afr_sh_data_unlock (call_frame_t *frame, xlator_t *this, char *dom, - afr_lock_cbk_t lock_cbk); -afr_local_t * -afr_self_heal_local_init (afr_local_t *l, xlator_t *this); -int -afr_sh_data_lock (call_frame_t *frame, xlator_t *this, - off_t start, off_t len, gf_boolean_t block, char *dom, - afr_lock_cbk_t success_handler, - afr_lock_cbk_t failure_handler); -void -afr_sh_set_error (afr_self_heal_t *sh, int32_t op_errno); -void -afr_sh_mark_source_sinks (call_frame_t *frame, xlator_t *this); -typedef int -(*afr_fxattrop_cbk_t) (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *xattr, dict_t *xdata); -int -afr_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name); -int -afr_impunge_frame_create (call_frame_t *frame, xlator_t *this, - int active_source, call_frame_t **impunge_frame); -void -afr_sh_reset (call_frame_t *frame, xlator_t *this); - -void -afr_children_intersection_get (int32_t *set1, int32_t *set2, - int *intersection, unsigned int child_count); -int -afr_get_no_xattr_dir_read_child (xlator_t *this, int32_t *success_children, - struct iatt *bufs); -int -afr_sh_erase_pending (call_frame_t *frame, xlator_t *this, - afr_transaction_type type, afr_fxattrop_cbk_t cbk, - int (*finish)(call_frame_t *frame, xlator_t *this)); - -void -afr_set_local_for_unhealable (afr_local_t *local); - -int -is_self_heal_failed (afr_self_heal_t *sh, afr_sh_fail_check_type type); - -void -afr_set_self_heal_status (afr_self_heal_t *sh, afr_self_heal_status status); - -void -afr_log_self_heal_completion_status (afr_local_t *local, gf_loglevel_t logl); - -char* -afr_get_pending_matrix_str (int32_t *pending_matrix[], xlator_t *this); -#endif /* __AFR_SELF_HEAL_COMMON_H__ */ diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index 9de26ee569c..c0385153ff5 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -1,5 +1,5 @@ /* - Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. This file is licensed to you under your choice of the GNU Lesser @@ -8,1747 +8,609 @@ cases as published by the Free Software Foundation. */ -#include <libgen.h> -#include <unistd.h> -#include <fnmatch.h> -#include <sys/time.h> -#include <stdlib.h> -#include <signal.h> #ifndef _CONFIG_H #define _CONFIG_H #include "config.h" #endif -#include "glusterfs.h" #include "afr.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" -#include "byte-order.h" - -#include "afr-transaction.h" #include "afr-self-heal.h" -#include "afr-self-heal-common.h" -#include "afr-self-heal-algorithm.h" - -int -afr_sh_data_fail (call_frame_t *frame, xlator_t *this); - -static inline gf_boolean_t -afr_sh_data_proceed (unsigned int success_count) -{ - return (success_count >= AFR_SH_MIN_PARTICIPANTS); -} - -extern int -sh_loop_finish (call_frame_t *loop_frame, xlator_t *this); +#include "byte-order.h" -int -afr_post_sh_big_lock_success (call_frame_t *frame, xlator_t *this); +enum { + AFR_SELFHEAL_DATA_FULL = 0, + AFR_SELFHEAL_DATA_DIFF, +}; -int -afr_post_sh_big_lock_failure (call_frame_t *frame, xlator_t *this); -int -afr_sh_data_finish (call_frame_t *frame, xlator_t *this); - -int -afr_sh_data_done (call_frame_t *frame, xlator_t *this) +#define HAS_HOLES(i) ((i->ia_blocks * 512) < (i->ia_size)) +static int +__checksum_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, uint32_t weak, uint8_t *strong, + dict_t *xdata) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; + afr_local_t *local = NULL; + int i = (long) cookie; - local = frame->local; - sh = &local->self_heal; + local = frame->local; - sh->completion_cbk (frame, this); + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; + if (strong) + memcpy (local->replies[i].checksum, strong, MD5_DIGEST_LENGTH); - return 0; + syncbarrier_wake (&local->barrier); + return 0; } -int -afr_sh_data_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int child_index = (long) cookie; - - local = frame->local; - priv = this->private; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "flush failed on %s on subvolume %s: %s", - local->loc.path, priv->children[child_index]->name, - strerror (op_errno)); - } - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - afr_sh_data_done (frame, this); - } - - return 0; -} - -int -afr_sh_data_close (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_self_heal_t *sh = NULL; - int i = 0; - int call_count = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - if (!sh->healing_fd) { - //This happens when file is non-reg - afr_sh_data_done (frame, this); - return 0; - } - call_count = afr_set_elem_count_get (sh->success, - priv->child_count); - local->call_count = call_count; - - if (call_count == 0) { - afr_sh_data_done (frame, this); - return 0; - } - - for (i = 0; i < priv->child_count; i++) { - if (!sh->success[i]) - continue; - gf_log (this->name, GF_LOG_DEBUG, - "closing fd of %s on %s", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_sh_data_flush_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->flush, - sh->healing_fd, NULL); - - if (!--call_count) - break; - } - - return 0; -} - -int -afr_sh_dom_unlock (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - if (sh->sh_dom_lock_held) - afr_sh_data_unlock (frame, this, priv->sh_domain, - afr_sh_data_close); - else - afr_sh_data_close (frame, this); - return 0; -} - -int -afr_sh_data_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *statpre, - struct iatt *statpost, dict_t *xdata) +static int +attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *pre, struct iatt *post, + dict_t *xdata) { + int i = (long) cookie; + afr_local_t *local = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int child_index = (long) cookie; + local = frame->local; - local = frame->local; - priv = this->private; + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; + if (pre) + local->replies[i].prestat = *pre; + if (post) + local->replies[i].poststat = *post; + if (xdata) + local->replies[i].xdata = dict_ref (xdata); - LOCK (&frame->lock); - { - if (op_ret == -1) { - gf_log (this->name, GF_LOG_INFO, - "setattr failed on %s on subvolume %s: %s", - local->loc.path, priv->children[child_index]->name, - strerror (op_errno)); - } - } - UNLOCK (&frame->lock); + syncbarrier_wake (&local->barrier); - call_count = afr_frame_return (frame); - - if (call_count == 0) { - afr_sh_data_finish (frame, this); - } - - return 0; + return 0; } -int -afr_sh_data_setattr (call_frame_t *frame, xlator_t *this, struct iatt* stbuf) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_self_heal_t *sh = NULL; - int i = 0; - int call_count = 0; - int32_t valid = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - valid = (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME); - - call_count = afr_set_elem_count_get (sh->success, - priv->child_count); - local->call_count = call_count; - - if (call_count == 0) { - GF_ASSERT (0); - afr_sh_data_finish (frame, this); - return 0; - } - - for (i = 0; i < priv->child_count; i++) { - if (!sh->success[i]) - continue; - - STACK_WIND_COOKIE (frame, afr_sh_data_setattr_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->setattr, - &local->loc, stbuf, valid, NULL); - - if (!--call_count) - break; - } - - return 0; -} -int -afr_sh_data_setattr_fstat_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iatt *buf, dict_t *xdata) +static gf_boolean_t +__afr_selfheal_data_checksums_match (call_frame_t *frame, xlator_t *this, + fd_t *fd, int source, + unsigned char *healed_sinks, + off_t offset, size_t size) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int child_index = (long) cookie; - - local = frame->local; - sh = &local->self_heal; - - GF_ASSERT (sh->source == child_index); - if (op_ret != -1) { - sh->buf[child_index] = *buf; - afr_sh_data_setattr (frame, this, buf); - } else { - gf_log (this->name, GF_LOG_ERROR, "%s: Failed to set " - "time-stamps after self-heal", local->loc.path); - afr_sh_data_fail (frame, this); - } - - return 0; -} + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + unsigned char *wind_subvols = NULL; + int i = 0; -/* - * If there are any writes after the self-heal is triggered then the - * stbuf stored in local->self_heal.buf[] will be invalid so we do one more - * stat on the source and then set the [am]times - */ -int -afr_sh_set_timestamps (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - STACK_WIND_COOKIE (frame, afr_sh_data_setattr_fstat_cbk, - (void *) (long) sh->source, - priv->children[sh->source], - priv->children[sh->source]->fops->fstat, - sh->healing_fd, NULL); - return 0; -} - -//Fun fact, lock_cbk is being used for both lock & unlock -int -afr_sh_data_unlock (call_frame_t *frame, xlator_t *this, char *dom, - afr_lock_cbk_t lock_cbk) -{ - afr_local_t *local = NULL; - afr_internal_lock_t *int_lock = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int ret = 0; - - local = frame->local; - int_lock = &local->internal_lock; - sh = &local->self_heal; - priv = this->private; - - if (strcmp (dom, this->name) == 0) { - sh->data_lock_held = _gf_false; - } else if (strcmp (dom, priv->sh_domain) == 0) { - sh->sh_dom_lock_held = _gf_false; - } else { - ret = -1; - goto out; - } - int_lock->lock_cbk = lock_cbk; - int_lock->domain = dom; - afr_unlock (frame, this); - -out: - if (ret) { - int_lock->lock_op_ret = -1; - int_lock->lock_cbk (frame, this); - } - return 0; -} - -int -afr_sh_data_finish (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - - gf_log (this->name, GF_LOG_DEBUG, - "finishing data selfheal of %s", local->loc.path); - - if (sh->data_lock_held) - afr_sh_data_unlock (frame, this, this->name, afr_sh_dom_unlock); - else - afr_sh_dom_unlock (frame, this); - - return 0; -} - -int -afr_sh_data_fail (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; + priv = this->private; + local = frame->local; - local = frame->local; - sh = &local->self_heal; + wind_subvols = alloca0 (priv->child_count); + for (i = 0; i < priv->child_count; i++) { + if (i == source || healed_sinks[i]) + wind_subvols[i] = 1; + } - gf_log (this->name, GF_LOG_DEBUG, - "finishing failed data selfheal of %s", local->loc.path); + AFR_ONLIST (wind_subvols, frame, __checksum_cbk, rchecksum, fd, + offset, size, NULL); - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - afr_sh_data_finish (frame, this); - return 0; -} + if (!local->replies[source].valid || local->replies[source].op_ret != 0) + return _gf_false; -int -afr_sh_data_erase_pending_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, - int32_t op_errno, dict_t *xattr, dict_t *xdata) -{ - int call_count = 0; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int32_t child_index = (long) cookie; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - if (op_ret < 0) { - gf_log (this->name, GF_LOG_ERROR, "Erasing of pending change " - "log failed on %s for subvol %s, reason: %s", - local->loc.path, priv->children[child_index]->name, - strerror (op_errno)); - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - } - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { - if (sh->old_loop_frame) - sh_loop_finish (sh->old_loop_frame, this); - sh->old_loop_frame = NULL; - afr_sh_data_fail (frame, this); - goto out; - } - if (!IA_ISREG (sh->type)) { - afr_sh_data_finish (frame, this); - goto out; - } - GF_ASSERT (sh->old_loop_frame); - afr_sh_data_lock (frame, this, 0, 0, _gf_true, this->name, - afr_post_sh_big_lock_success, - afr_post_sh_big_lock_failure); - } -out: - return 0; -} + for (i = 0; i < priv->child_count; i++) { + if (i == source) + continue; + if (memcmp (local->replies[source].checksum, + local->replies[i].checksum, + MD5_DIGEST_LENGTH)) + return _gf_false; + } -int -afr_sh_data_erase_pending (call_frame_t *frame, xlator_t *this) -{ - afr_sh_erase_pending (frame, this, AFR_DATA_TRANSACTION, - afr_sh_data_erase_pending_cbk, - afr_sh_data_finish); - return 0; + return _gf_true; } -int -afr_sh_data_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct iatt *pre, - struct iatt *post, dict_t *xdata) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_self_heal_t *sh = NULL; - int call_count = 0; - int child_index = (long) cookie; - - local = frame->local; - priv = this->private; - sh = &local->self_heal; - - if (op_ret < 0) { - gf_log (this->name, GF_LOG_ERROR, "%s: Failed to fsync on " - "%s - %s", local->loc.path, - priv->children[child_index]->name, strerror (op_errno)); - LOCK (&frame->lock); - { - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - } - UNLOCK (&frame->lock); - if (sh->old_loop_frame) - sh_loop_finish (sh->old_loop_frame, this); - sh->old_loop_frame = NULL; - } - - call_count = afr_frame_return (frame); - if (call_count == 0) { - if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) - afr_sh_data_fail (frame, this); - else - afr_sh_data_erase_pending (frame, this); - } - return 0; -} -/* - * Before erasing xattrs, make sure the data is written to disk - */ -int -afr_sh_data_fsync (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_self_heal_t *sh = NULL; - int i = 0; - int call_count = 0; - - local = frame->local; - priv = this->private; - sh = &local->self_heal; - - call_count = sh->active_sinks; - if (call_count == 0) { - afr_sh_data_erase_pending (frame, this); - return 0; - } - - local->call_count = call_count; - for (i = 0; i < priv->child_count; i++) { - if (!sh->success[i] || sh->sources[i]) - continue; - - STACK_WIND_COOKIE (frame, afr_sh_data_fsync_cbk, - (void *) (long) i, priv->children[i], - priv->children[i]->fops->fsync, - sh->healing_fd, 1, NULL); - } - - return 0; -} - -static struct afr_sh_algorithm * -sh_algo_from_name (xlator_t *this, char *name) +static int +__afr_selfheal_data_read_write (call_frame_t *frame, xlator_t *this, fd_t *fd, + int source, unsigned char *healed_sinks, + off_t offset, size_t size, + struct afr_reply *replies) { - int i = 0; + struct iovec *iovec = NULL; + int count = 0; + struct iobref *iobref = NULL; + int ret = 0; + int i = 0; + afr_private_t *priv = NULL; - if (name == NULL) - goto out; + priv = this->private; - while (afr_self_heal_algorithms[i].name) { - if (!strcmp (name, afr_self_heal_algorithms[i].name)) { - return &afr_self_heal_algorithms[i]; - } + ret = syncop_readv (priv->children[source], fd, size, offset, 0, + &iovec, &count, &iobref); + if (ret <= 0) + return ret; - i++; - } + for (i = 0; i < priv->child_count; i++) { + if (!healed_sinks[i]) + continue; + + /* + * TODO: Use fiemap() and discard() to heal holes + * in the future. + * + * For now, + * + * - if the source had any holes at all, + * AND + * - if we are writing past the original file size + * of the sink + * AND + * - is NOT the last block of the source file. if + * the block contains EOF, it has to be written + * in order to set the file size even if the + * last block is 0-filled. + * AND + * - if the read buffer is filled with only 0's + * + * then, skip writing to this source. We don't depend + * on the write to happen to update the size as we + * have performed an ftruncate() upfront anyways. + */ +#define is_last_block(o,b,s) ((s >= o) && (s <= (o + b))) + if (HAS_HOLES ((&replies[source].poststat)) && + offset > replies[i].poststat.ia_size && + !is_last_block (offset, size, + replies[source].poststat.ia_size) && + (iov_0filled (iovec, count) == 0)) + continue; + + ret = syncop_writev (priv->children[i], fd, iovec, count, + offset, iobref, 0); + if (ret != iov_length (iovec, count)) { + /* write() failed on this sink. unset the corresponding + member in sinks[] (which is healed_sinks[] in the + caller) so that this server does NOT get considered + as successfully healed. + */ + healed_sinks[i] = 0; + } + } + if (iobref) + iobref_unref (iobref); -out: - return NULL; + return ret; } static int -sh_zero_byte_files_exist (afr_local_t *local, int child_count) -{ - int i = 0; - int ret = 0; - afr_self_heal_t *sh = NULL; - - sh = &local->self_heal; - for (i = 0; i < child_count; i++) { - if (!local->child_up[i] || sh->child_errno[i]) - continue; - if (sh->buf[i].ia_size == 0) { - ret = 1; - break; - } - } - - return ret; -} - - -struct afr_sh_algorithm * -afr_sh_data_pick_algo (call_frame_t *frame, xlator_t *this) -{ - afr_private_t * priv = NULL; - struct afr_sh_algorithm * algo = NULL; - afr_local_t * local = NULL; - afr_self_heal_t * sh = NULL; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - algo = sh_algo_from_name (this, priv->data_self_heal_algorithm); - - if (algo == NULL) { - /* option not set, so fall back on heuristics */ - - if (sh_zero_byte_files_exist (local, priv->child_count) - || (sh->file_size <= (priv->data_self_heal_window_size * - this->ctx->page_size))) { - - /* - * If the file does not exist on one of the subvolumes, - * or a zero-byte file exists (created by entry self-heal) - * the entire content has to be copied anyway, so there - * is no benefit from using the "diff" algorithm. - * - * If the file size is about the same as page size, - * the entire file can be read and written with a few - * (pipelined) STACK_WINDs, which will be faster - * than "diff" which has to read checksums and then - * read and write. - */ - - algo = sh_algo_from_name (this, "full"); - - } else { - algo = sh_algo_from_name (this, "diff"); - } - } - - return algo; -} - - -int -afr_sh_data_sync_prepare (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - struct afr_sh_algorithm *sh_algo = NULL; - - local = frame->local; - sh = &local->self_heal; - - sh->algo_completion_cbk = afr_sh_data_fsync; - sh->algo_abort_cbk = afr_sh_data_fail; - - sh_algo = afr_sh_data_pick_algo (frame, this); +afr_selfheal_data_block (call_frame_t *frame, xlator_t *this, fd_t *fd, + int source, unsigned char *healed_sinks, off_t offset, + size_t size, int type, struct afr_reply *replies) +{ + int ret = -1; + int sink_count = 0; + afr_private_t *priv = NULL; + unsigned char *data_lock = NULL; + + priv = this->private; + sink_count = AFR_COUNT (healed_sinks, priv->child_count); + data_lock = alloca0 (priv->child_count); + + ret = afr_selfheal_inodelk (frame, this, fd->inode, this->name, + offset, size, data_lock); + { + if (ret < sink_count) { + ret = -ENOTCONN; + goto unlock; + } - sh->algo = sh_algo; - sh_algo->fn (frame, this); + if (type == AFR_SELFHEAL_DATA_DIFF && + __afr_selfheal_data_checksums_match (frame, this, fd, source, + healed_sinks, offset, size)) { + ret = 0; + goto unlock; + } - return 0; + ret = __afr_selfheal_data_read_write (frame, this, fd, source, + healed_sinks, offset, size, + replies); + } +unlock: + afr_selfheal_uninodelk (frame, this, fd->inode, this->name, + offset, size, data_lock); + return ret; } -int -afr_sh_data_trim_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) -{ - int call_count = 0; - int child_index = 0; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "ftruncate of %s on subvolume %s failed (%s)", - local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "ftruncate of %s on subvolume %s completed", - local->loc.path, - priv->children[child_index]->name); - } - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) - afr_sh_data_fail (frame, this); - else - afr_sh_data_sync_prepare (frame, this); - } - - return 0; -} -int -afr_sh_data_trim_sinks (call_frame_t *frame, xlator_t *this) +static int +afr_selfheal_data_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, + unsigned char *healed_sinks) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_self_heal_t *sh = NULL; - int *sources = NULL; - int call_count = 0; - int i = 0; - - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - sources = sh->sources; - call_count = sh->active_sinks; - - local->call_count = call_count; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; - for (i = 0; i < priv->child_count; i++) { - if (sources[i] || !local->child_up[i]) - continue; + local = frame->local; + priv = this->private; - STACK_WIND_COOKIE (frame, afr_sh_data_trim_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->ftruncate, - sh->healing_fd, sh->file_size, - NULL); + AFR_ONLIST (healed_sinks, frame, attr_cbk, fsync, fd, 0, NULL); - if (!--call_count) - break; - } - - return 0; + for (i = 0; i < priv->child_count; i++) + if (healed_sinks[i] && local->replies[i].op_ret != 0) + /* fsync() failed. Do NOT consider this server + as successfully healed. Mark it so. + */ + healed_sinks[i] = 0; + return 0; } -int -afr_sh_inode_set_read_ctx (afr_self_heal_t *sh, xlator_t *this) -{ - afr_private_t *priv = NULL; - int ret = 0; - int i = 0; - - priv = this->private; - sh->source = afr_sh_select_source (sh->sources, priv->child_count); - if (sh->source < 0) { - ret = -1; - goto out; - } - - /* detect changes not visible through pending flags -- JIC */ - for (i = 0; i < priv->child_count; i++) { - if (i == sh->source || sh->child_errno[i]) - continue; - - if (SIZE_DIFFERS (&sh->buf[i], &sh->buf[sh->source])) - sh->sources[i] = 0; - } - - afr_reset_children (sh->fresh_children, priv->child_count); - afr_get_fresh_children (sh->success_children, sh->sources, - sh->fresh_children, priv->child_count); - afr_inode_set_read_ctx (this, sh->inode, sh->source, - sh->fresh_children); -out: - return ret; -} -char* -afr_get_sizes_str (afr_local_t *local, struct iatt *bufs, xlator_t *this) -{ - afr_private_t *priv = NULL; - int i = 0; - char num[1024] = {0}; - size_t len = 0; - char *sizes_str = NULL; - size_t off = 0; - char *fmt_str = "%llu bytes on %s, "; - char *child_down = " %s,"; - char *child_unknown = " %s,"; - int down_child_present = 0; - int down_count = 0; - int unknown_count = 0; - int unknown_child_present = 0; - char *down_subvol_1 = " down subvolume is "; - char *unknown_subvol_1 = " unknown subvolume is "; - char *down_subvol_2 = " down subvolumes are "; - char *unknown_subvol_2 = " unknown subvolumes are "; - - priv = this->private; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] == 1) { - len += snprintf (num, sizeof (num), fmt_str, - (unsigned long long) bufs[i].ia_size, - priv->children[i]->name); - } else if (local->child_up[i] == 0) { - len += snprintf (num, sizeof (num), child_down, - priv->children[i]->name); - if (!down_child_present) - down_child_present = 1; - down_count ++; - } else if (local->child_up[i] == -1) { - len += snprintf (num, sizeof (num), child_unknown, - priv->children[i]->name); - if (!unknown_child_present) - unknown_child_present = 1; - unknown_count++; - } - - } - - if (down_child_present) { - if (down_count > 1) - len += snprintf (num, sizeof (num), "%s", - down_subvol_2); - else - len += snprintf (num, sizeof (num), "%s", - down_subvol_1); - } - if (unknown_child_present) { - if (unknown_count > 1) - len += snprintf (num, sizeof (num), "%s", - unknown_subvol_2); - else - len += snprintf (num, sizeof (num), "%s", - unknown_subvol_1); - } - - len++;//for '\0' - - sizes_str = GF_CALLOC (len, sizeof (char), gf_common_mt_char); - - if (!sizes_str) - return NULL; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] == 1) { - off += snprintf (sizes_str + off, len - off, fmt_str, - (unsigned long long) bufs[i].ia_size, - priv->children[i]->name); - } - } - - if (down_child_present) { - if (down_count > 1) { - off += snprintf (sizes_str + off, len - off, "%s", - down_subvol_2); - } else { - off += snprintf (sizes_str + off, len - off, "%s", - down_subvol_1); - } - } - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] == 0) { - off += snprintf (sizes_str + off, len - off, child_down, - priv->children[i]->name); - } - } - - if (unknown_child_present) { - if (unknown_count > 1) { - off += snprintf (sizes_str + off, len - off, "%s", - unknown_subvol_2); - } else { - off += snprintf (sizes_str + off, len - off, "%s", - unknown_subvol_1); - } - } - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] == -1) { - off += snprintf (sizes_str + off, len - off, - child_unknown, - priv->children[i]->name); - - } - } - - return sizes_str; -} - -char* -afr_get_sinks_str (xlator_t *this, afr_local_t *local, afr_self_heal_t *sh) +static int +afr_selfheal_data_restore_time (call_frame_t *frame, xlator_t *this, + inode_t *inode, int source, + unsigned char *healed_sinks, + struct afr_reply *replies) { - afr_private_t *priv = NULL; - int i = 0; - char num[1024] = {0}; - size_t len = 0; - char *sinks_str = NULL; - char *temp_str = " to sinks "; - char *str_format = " %s,"; - char off = 0; - - priv = this->private; - - len += snprintf (num, sizeof (num), "%s", temp_str); - for (i = 0; i < priv->child_count; i++) { - if ((sh->sources[i] == 0) && (local->child_up[i] == 1)) { - len += snprintf (num, sizeof (num), str_format, - priv->children[i]->name); - } - } + loc_t loc = {0, }; - len ++; + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); - sinks_str = GF_CALLOC (len, sizeof (char), gf_common_mt_char); + AFR_ONLIST (healed_sinks, frame, attr_cbk, setattr, &loc, + &replies[source].poststat, + (GF_SET_ATTR_ATIME|GF_SET_ATTR_MTIME), NULL); - if (!sinks_str) - return NULL; - - off += snprintf (sinks_str + off, len - off, "%s", temp_str); - - for (i = 0; i < priv->child_count; i++) { - if ((sh->sources[i] == 0) && (local->child_up[i] == 1)) { - off += snprintf (sinks_str + off, len - off, - str_format, - priv->children[i]->name); - } - } - - return sinks_str; + loc_wipe (&loc); + return 0; } +static int +afr_selfheal_data_do (call_frame_t *frame, xlator_t *this, fd_t *fd, + int source, unsigned char *healed_sinks, + struct afr_reply *replies) +{ + afr_private_t *priv = NULL; + int i = 0; + off_t off = 0; + size_t block = 128 * 1024; + int type = AFR_SELFHEAL_DATA_FULL; + int ret = -1; + call_frame_t *iter_frame = NULL; + char *sinks_str = NULL; + char *p = NULL; + + priv = this->private; + + sinks_str = alloca0 (priv->child_count * 8); + p = sinks_str; + for (i = 0; i < priv->child_count; i++) { + if (!healed_sinks[i]) + continue; + p += sprintf (p, "%d ", i); + } -void -afr_set_data_sh_info_str (afr_local_t *local, afr_self_heal_t *sh, xlator_t *this) -{ - char *pending_matrix_str = NULL; - char *sizes_str = NULL; - char *sinks_str = NULL; - afr_private_t *priv = NULL; - - priv = this->private; - - pending_matrix_str = afr_get_pending_matrix_str (sh->pending_matrix, - this); - if (!pending_matrix_str) - pending_matrix_str = ""; - - sizes_str = afr_get_sizes_str (local, sh->buf, this); - if (!sizes_str) - sizes_str = ""; + gf_log (this->name, GF_LOG_INFO, "performing data selfheal on %s. " + "source=%d sinks=%s", + uuid_utoa (fd->inode->gfid), source, sinks_str); - sinks_str = afr_get_sinks_str (this, local, sh); - if (!sinks_str) - sinks_str = ""; + for (i = 0; i < priv->child_count; i++) { + if (!healed_sinks[i] && i != source) + continue; + if (replies[i].poststat.ia_size) { + type = AFR_SELFHEAL_DATA_DIFF; + break; + } + } - gf_asprintf (&sh->data_sh_info, " data self heal from %s %s with " - "%s data %s", priv->children[sh->source]->name, sinks_str, - sizes_str, pending_matrix_str); + iter_frame = afr_copy_frame (frame); + if (!iter_frame) + return -ENOMEM; - if (pending_matrix_str && strcmp (pending_matrix_str, "")) - GF_FREE (pending_matrix_str); + for (off = 0; off < replies[source].poststat.ia_size; off += block) { + ret = afr_selfheal_data_block (iter_frame, this, fd, source, + healed_sinks, off, block, type, + replies); + if (ret < 0) + goto out; - if (sizes_str && strcmp (sizes_str, "")) - GF_FREE (sizes_str); -} + AFR_STACK_RESET (iter_frame); + } -void -afr_sh_data_fix (call_frame_t *frame, xlator_t *this) -{ - int source = 0; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - source = sh->source; - sh->block_size = this->ctx->page_size; - sh->file_size = sh->buf[source].ia_size; - - if (FILE_HAS_HOLES (&sh->buf[source])) - sh->file_has_holes = 1; - - if (sh->background && sh->unwind && !sh->unwound) { - sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno, - is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)); - sh->unwound = _gf_true; - } - - afr_sh_mark_source_sinks (frame, this); - if (sh->active_sinks == 0) { - gf_log (this->name, GF_LOG_INFO, - "no active sinks for performing self-heal on file %s", - local->loc.path); - afr_sh_data_finish (frame, this); - return; - } - - gf_log (this->name, GF_LOG_DEBUG, - "self-healing file %s from subvolume %s to %d other", - local->loc.path, priv->children[sh->source]->name, - sh->active_sinks); - - sh->actual_sh_started = _gf_true; - afr_set_self_heal_status (sh, AFR_SELF_HEAL_SYNC_BEGIN); - afr_sh_data_trim_sinks (frame, this); -} + afr_selfheal_data_restore_time (frame, this, fd->inode, source, + healed_sinks, replies); -int -afr_sh_data_fxattrop_fstat_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int nsources = 0; - int ret = 0; - int *old_sources = NULL; - int tstamp_source = 0; - int i = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - gf_log (this->name, GF_LOG_DEBUG, "Pending matrix for: %s", - lkowner_utoa (&frame->root->lk_owner)); - if (sh->sync_done) { - //store sources before sync so that mtime can be set using the - //iatt buf from one of them. - old_sources = alloca (priv->child_count*sizeof (*old_sources)); - memcpy (old_sources, sh->sources, - priv->child_count * sizeof (*old_sources)); - } - - nsources = afr_build_sources (this, sh->xattr, sh->buf, sh->pending_matrix, - sh->sources, sh->success_children, - AFR_DATA_TRANSACTION, NULL, _gf_true); - if ((nsources == -1) - && (priv->favorite_child != -1) - && (sh->child_errno[priv->favorite_child] == 0)) { - - gf_log (this->name, GF_LOG_DEBUG, - "Picking favorite child %s as authentic source to " - "resolve conflicting data of %s", - priv->children[priv->favorite_child]->name, - local->loc.path); - - sh->sources[priv->favorite_child] = 1; - - nsources = afr_sh_source_count (sh->sources, - priv->child_count); - } - - if (nsources == -1) { - afr_sh_print_split_brain_log (sh->pending_matrix, this, - local->loc.path); - afr_set_split_brain (this, sh->inode, DONT_KNOW, SPB); - - afr_sh_data_fail (frame, this); - return 0; - } - - afr_set_split_brain (this, sh->inode, DONT_KNOW, NO_SPB); - - ret = afr_sh_inode_set_read_ctx (sh, this); - if (ret) { - gf_log (this->name, GF_LOG_DEBUG, - "No active sources found."); - - afr_sh_data_fail (frame, this); - return 0; - } - - if (sh->sync_done) { - /* Perform setattr from one of the old_sources if possible - * Because only they have the correct mtime, the new sources - * (i.e. old sinks) have mtime from last writev in sync. - */ - tstamp_source = sh->source; - for (i = 0; i < priv->child_count; i++) { - if (old_sources[i] && sh->sources[i]) - tstamp_source = i; - } - afr_sh_data_setattr (frame, this, &sh->buf[tstamp_source]); - } else { - afr_set_data_sh_info_str (local, sh, this); - if (nsources == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "No self-heal needed for %s", - local->loc.path); - - afr_sh_data_finish (frame, this); - return 0; - } - - if (sh->do_data_self_heal && - afr_data_self_heal_enabled (priv->data_self_heal)) - afr_sh_data_fix (frame, this); - else - afr_sh_data_finish (frame, this); - } - return 0; -} + ret = afr_selfheal_data_fsync (frame, this, fd, healed_sinks); -int -afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local, - dict_t **xattr, - afr_transaction_type txn_type, - uuid_t gfid) -{ - afr_private_t *priv = NULL; - int read_child = -1; - int32_t **pending_matrix = NULL; - int32_t *sources = NULL; - int32_t *success_children = NULL; - struct iatt *bufs = NULL; - int32_t nsources = 0; - int32_t prev_read_child = -1; - int32_t config_read_child = -1; - int32_t subvol_status = 0; - - priv = this->private; - bufs = local->cont.lookup.bufs; - success_children = local->cont.lookup.success_children; - - pending_matrix = local->cont.lookup.pending_matrix; - sources = local->cont.lookup.sources; - memset (sources, 0, sizeof (*sources) * priv->child_count); - - nsources = afr_build_sources (this, xattr, bufs, pending_matrix, - sources, success_children, txn_type, - &subvol_status, _gf_false); - if (subvol_status & SPLIT_BRAIN) { - gf_log (this->name, GF_LOG_DEBUG, "%s: Possible split-brain", - local->loc.path); - switch (txn_type) { - case AFR_DATA_TRANSACTION: - local->cont.lookup.possible_spb = _gf_true; - nsources = 1; - sources[success_children[0]] = 1; - break; - case AFR_ENTRY_TRANSACTION: - read_child = afr_get_no_xattr_dir_read_child (this, - success_children, - bufs); - sources[read_child] = 1; - nsources = 1; - break; - default: - break; - } - } - if (nsources < 0) - goto out; - - prev_read_child = local->read_child_index; - config_read_child = priv->read_child; - read_child = afr_select_read_child_from_policy (success_children, - priv->child_count, - prev_read_child, - config_read_child, - sources, - priv->hash_mode, gfid); out: - gf_log (this->name, GF_LOG_DEBUG, "returning read_child: %d", - read_child); - return read_child; + if (iter_frame) + AFR_STACK_DESTROY (iter_frame); + return ret; } -int -afr_sh_data_fstat_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iatt *buf, dict_t *xdata) + +static int +__afr_selfheal_truncate_sinks (call_frame_t *frame, xlator_t *this, + fd_t *fd, unsigned char *healed_sinks, + struct afr_reply *replies, uint64_t size) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int call_count = -1; - int child_index = (long) cookie; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - LOCK (&frame->lock); - { - if (op_ret != -1) { - gf_log (this->name, GF_LOG_TRACE, - "fstat of %s on %s succeeded", - local->loc.path, - priv->children[child_index]->name); - - sh->buf[child_index] = *buf; - sh->success_children[sh->success_count] = child_index; - sh->success_count++; - } else { - gf_log (this->name, GF_LOG_ERROR, "%s: fstat failed " - "on %s, reason %s", local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - sh->child_errno[child_index] = op_errno; - } - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - /* Previous versions of glusterfs might have set - * the pending data xattrs which need to be erased - */ - if (!afr_sh_data_proceed (sh->success_count)) { - gf_log (this->name, GF_LOG_ERROR, "inspecting metadata " - "succeeded on < %d children, aborting " - "self-heal for %s", AFR_SH_MIN_PARTICIPANTS, - local->loc.path); - afr_sh_data_fail (frame, this); - goto out; - } - afr_sh_data_fxattrop_fstat_done (frame, this); - } -out: - return 0; -} + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + unsigned char *larger_sinks = 0; + int i = 0; + local = frame->local; + priv = this->private; -int -afr_sh_data_fstat (call_frame_t *frame, xlator_t *this) -{ - afr_self_heal_t *sh = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int i = 0; - int child = 0; - int32_t *fstat_children = NULL; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - fstat_children = memdup (sh->success_children, - sizeof (*fstat_children) * priv->child_count); - if (!fstat_children) { - afr_sh_data_fail (frame, this); - goto out; - } - call_count = sh->success_count; - local->call_count = call_count; - - memset (sh->buf, 0, sizeof (*sh->buf) * priv->child_count); - afr_reset_children (sh->success_children, priv->child_count); - sh->success_count = 0; - for (i = 0; i < priv->child_count; i++) { - child = fstat_children[i]; - if (child == -1) - break; - STACK_WIND_COOKIE (frame, afr_sh_data_fstat_cbk, - (void *) (long) child, - priv->children[child], - priv->children[child]->fops->fstat, - sh->healing_fd, NULL); - --call_count; - } - GF_ASSERT (!call_count); -out: - GF_FREE (fstat_children); - return 0; -} + larger_sinks = alloca0 (priv->child_count); + for (i = 0; i < priv->child_count; i++) { + if (healed_sinks[i] && replies[i].poststat.ia_size > size) + larger_sinks[i] = 1; + } -void -afr_sh_common_fxattrop_resp_handler (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, - int32_t op_errno, dict_t *xattr) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int child_index = (long) cookie; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - LOCK (&frame->lock); - { - if (op_ret != -1) { - gf_log (this->name, GF_LOG_TRACE, - "fxattrop of %s on %s succeeded", - local->loc.path, - priv->children[child_index]->name); - - sh->xattr[child_index] = dict_ref (xattr); - sh->success_children[sh->success_count] = child_index; - sh->success_count++; - } else { - gf_log (this->name, GF_LOG_ERROR, "fxattrop of %s " - "failed on %s, reason %s", local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - sh->child_errno[child_index] = op_errno; - } - } - UNLOCK (&frame->lock); -} + AFR_ONLIST (larger_sinks, frame, attr_cbk, ftruncate, fd, size, NULL); -int -afr_sh_data_fxattrop_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *xattr, dict_t *xdata) -{ - int call_count = -1; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - - afr_sh_common_fxattrop_resp_handler (frame, cookie, this, op_ret, - op_errno, xattr); - - call_count = afr_frame_return (frame); - if (call_count == 0) { - if (!afr_sh_data_proceed (sh->success_count)) { - gf_log (this->name, GF_LOG_ERROR, "%s, inspecting " - "change log succeeded on < %d children", - local->loc.path, AFR_SH_MIN_PARTICIPANTS); - afr_sh_data_fail (frame, this); - goto out; - } - afr_sh_data_fstat (frame, this); - } -out: - return 0; + for (i = 0; i < priv->child_count; i++) + if (healed_sinks[i] && local->replies[i].op_ret == -1) + /* truncate() failed. Do NOT consider this server + as successfully healed. Mark it so. + */ + healed_sinks[i] = 0; + return 0; } - -int -afr_sh_data_fxattrop (call_frame_t *frame, xlator_t *this) -{ - afr_self_heal_t *sh = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - dict_t **xattr_req; - int32_t *zero_pending = NULL; - int call_count = 0; - int i = 0; - int ret = 0; - int j; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - call_count = afr_up_children_count (local->child_up, - priv->child_count); - - local->call_count = call_count; - - xattr_req = GF_CALLOC(priv->child_count, sizeof(struct dict_t *), - gf_afr_mt_dict_t); - if (!xattr_req) - goto out; +/* + * If by chance there are multiple sources with differing sizes, select + * the largest file as the source. + * + * This can only happen if data was directly modified in the backend. + */ +static int +__afr_selfheal_data_finalize_source (xlator_t *this, unsigned char *sources, + unsigned char *sinks, + unsigned char *healed_sinks, + unsigned char *locked_on, + struct afr_reply *replies) +{ + int i = 0; + afr_private_t *priv = NULL; + uint64_t size = 0; + int source = -1; + int locked_count = 0; + int sources_count = 0; + int healed_sinks_count = 0; + + priv = this->private; + + locked_count = AFR_COUNT (locked_on, priv->child_count); + sources_count = AFR_COUNT (sources, priv->child_count); + healed_sinks_count = AFR_COUNT (healed_sinks, priv->child_count); + + if (locked_count == healed_sinks_count || !sources_count) { + /* split brain */ + return -EIO; + } for (i = 0; i < priv->child_count; i++) { - xattr_req[i] = dict_new(); - if (!xattr_req[i]) { - ret = -1; - goto out; + if (!sources[i]) + continue; + if (size <= replies[i].poststat.ia_size) { + size = replies[i].poststat.ia_size; + source = i; } } for (i = 0; i < priv->child_count; i++) { - for (j = 0; j < priv->child_count; j++) { - zero_pending = GF_CALLOC (3, sizeof (*zero_pending), - gf_afr_mt_int32_t); - if (!zero_pending) { - ret = -1; - goto out; - } - ret = dict_set_dynptr (xattr_req[i], priv->pending_key[j], - zero_pending, - 3 * sizeof (*zero_pending)); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Unable to set dict value"); - goto out; - } else { - zero_pending = NULL; - } + if (!sources[i]) + continue; + if (replies[i].poststat.ia_size < size) { + sources[i] = 0; + sinks[i] = 1; } } - afr_reset_xattr (sh->xattr, priv->child_count); - afr_reset_children (sh->success_children, priv->child_count); - memset (sh->child_errno, 0, - sizeof (*sh->child_errno) * priv->child_count); - sh->success_count = 0; - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_sh_data_fxattrop_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fxattrop, - sh->healing_fd, GF_XATTROP_ADD_ARRAY, - xattr_req[i], NULL); - - if (!--call_count) - break; - } - } - -out: - if (xattr_req) { - for (i = 0; i < priv->child_count; i++) - if (xattr_req[i]) - dict_unref(xattr_req[i]); - GF_FREE(xattr_req); - } - - if (ret) { - GF_FREE (zero_pending); - afr_sh_data_fail (frame, this); - } - - return 0; + return source; } -int -afr_sh_data_big_lock_success (call_frame_t *frame, xlator_t *this) +/* + * __afr_selfheal_data_prepare: + * + * This function inspects the on-disk xattrs and determines which subvols + * are sources and sinks. + * + * The return value is the index of the subvolume to be used as the source + * for self-healing, or -1 if no healing is necessary/split brain. + */ +static int +__afr_selfheal_data_prepare (call_frame_t *frame, xlator_t *this, fd_t *fd, + unsigned char *locked_on, unsigned char *sources, + unsigned char *sinks, unsigned char *healed_sinks, + struct afr_reply *replies) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; + int ret = -1; + int source = -1; + afr_private_t *priv = NULL; + int i = 0; - local = frame->local; - sh = &local->self_heal; + priv = this->private; - sh->data_lock_held = _gf_true; - afr_sh_data_fxattrop (frame, this); - return 0; -} + ret = afr_selfheal_unlocked_discover (frame, fd->inode, fd->inode->gfid, + replies); + if (ret) + return ret; -int -afr_sh_dom_lock_success (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; + ret = afr_selfheal_find_direction (frame, this, replies, + AFR_DATA_TRANSACTION, + locked_on, sources, sinks); + if (ret) + return ret; - local = frame->local; - sh = &local->self_heal; + source = __afr_selfheal_data_finalize_source (this, sources, sinks, + healed_sinks, locked_on, + replies); + if (source < 0) + return -EIO; - sh->sh_dom_lock_held = _gf_true; - afr_sh_data_lock (frame, this, 0, 0, _gf_true, this->name, - afr_sh_data_big_lock_success, - afr_sh_data_fail); - return 0; -} + for (i = 0; i < priv->child_count; i++) + /* Initialize the healed_sinks[] array optimistically to + the intersection of to-be-healed (i.e sinks[]) and + the list of servers which are up (i.e locked_on[]). -int -afr_sh_data_post_blocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; + As we encounter failures in the healing process, we + will unmark the respective servers in the healed_sinks[] + array. + */ + healed_sinks[i] = sinks[i] && locked_on[i]; - local = frame->local; - int_lock = &local->internal_lock; - sh = &local->self_heal; + return source; +} - if (int_lock->lock_op_ret < 0) { - gf_log (this->name, GF_LOG_ERROR, "Blocking data inodelks " - "failed for %s. by %s", - local->loc.path, lkowner_utoa (&frame->root->lk_owner)); - sh->data_lock_failure_handler (frame, this); - } else { +static int +__afr_selfheal_data (call_frame_t *frame, xlator_t *this, fd_t *fd, + unsigned char *locked_on) +{ + afr_private_t *priv = NULL; + int ret = -1; + unsigned char *sources = NULL; + unsigned char *sinks = NULL; + unsigned char *data_lock = NULL; + unsigned char *healed_sinks = NULL; + struct afr_reply *locked_replies = NULL; + int source = -1; + gf_boolean_t compat = _gf_false; + unsigned char *compat_lock = NULL; + + priv = this->private; + + sources = alloca0 (priv->child_count); + sinks = alloca0 (priv->child_count); + healed_sinks = alloca0 (priv->child_count); + data_lock = alloca0 (priv->child_count); + compat_lock = alloca0 (priv->child_count); + + locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count); + + ret = afr_selfheal_inodelk (frame, this, fd->inode, this->name, 0, 0, + data_lock); + { + if (ret < 2) { + ret = -ENOTCONN; + goto unlock; + } - gf_log (this->name, GF_LOG_DEBUG, "Blocking data inodelks " - "done for %s by %s. Proceding to self-heal", - local->loc.path, lkowner_utoa (&frame->root->lk_owner)); + ret = __afr_selfheal_data_prepare (frame, this, fd, data_lock, + sources, sinks, healed_sinks, + locked_replies); + if (ret < 0) + goto unlock; + + source = ret; + + ret = __afr_selfheal_truncate_sinks (frame, this, fd, healed_sinks, + locked_replies, + locked_replies[source].poststat.ia_size); + if (ret < 0) + goto unlock; + + ret = 0; + + /* Locking from (LLONG_MAX - 2) to (LLONG_MAX - 1) is for + compatibility with older self-heal clients which do not + hold a lock in the @priv->sh_domain domain to guard + against concurrent ongoing self-heals + */ + afr_selfheal_inodelk (frame, this, fd->inode, this->name, + LLONG_MAX - 2, 1, compat_lock); + compat = _gf_true; + } +unlock: + afr_selfheal_uninodelk (frame, this, fd->inode, this->name, 0, 0, + data_lock); + if (ret < 0) + goto out; - sh->data_lock_success_handler (frame, this); - } + ret = afr_selfheal_data_do (frame, this, fd, source, healed_sinks, + locked_replies); + if (ret) + goto out; - return 0; + ret = afr_selfheal_undo_pending (frame, this, fd->inode, sources, sinks, + healed_sinks, AFR_DATA_TRANSACTION, + locked_replies, data_lock); +out: + if (compat) + afr_selfheal_uninodelk (frame, this, fd->inode, this->name, + LLONG_MAX - 2, 1, compat_lock); + return ret; } -int -afr_sh_data_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - sh = &local->self_heal; - - if (int_lock->lock_op_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, "Non Blocking data inodelks " - "failed for %s. by %s", - local->loc.path, lkowner_utoa (&frame->root->lk_owner)); - - if (!sh->data_lock_block) { - sh->data_lock_failure_handler(frame, this); - } else { - int_lock->lock_cbk = - afr_sh_data_post_blocking_inodelk_cbk; - afr_blocking_lock (frame, this); - } - } else { - - gf_log (this->name, GF_LOG_DEBUG, "Non Blocking data inodelks " - "done for %s by %s. Proceeding to self-heal", - local->loc.path, lkowner_utoa (&frame->root->lk_owner)); - sh->data_lock_success_handler (frame, this); - } - - return 0; -} -int -afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this, char *dom, - off_t start, off_t len) +static fd_t * +afr_selfheal_data_open (xlator_t *this, inode_t *inode) { - afr_internal_lock_t *int_lock = NULL; - afr_inodelk_t *inodelk = NULL; - afr_local_t *local = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - - int_lock->transaction_lk_type = AFR_SELFHEAL_LK; - int_lock->selfheal_lk_type = AFR_DATA_SELF_HEAL_LK; + loc_t loc = {0,}; + int ret = 0; + fd_t *fd = NULL; - afr_set_lock_number (frame, this); + fd = fd_create (inode, 0); + if (!fd) + return NULL; - int_lock->lock_cbk = afr_sh_data_post_nonblocking_inodelk_cbk; + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); - int_lock->domain = dom; - inodelk = afr_get_inodelk (int_lock, int_lock->domain); - inodelk->flock.l_start = start; - inodelk->flock.l_len = len; - inodelk->flock.l_type = F_WRLCK; - - afr_nonblocking_inodelk (frame, this); - - return 0; -} - -int -afr_post_sh_big_lock_success (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - - GF_ASSERT (sh->old_loop_frame); - sh_loop_finish (sh->old_loop_frame, this); - sh->old_loop_frame = NULL; - sh->data_lock_held = _gf_true; - sh->sync_done = _gf_true; - afr_sh_data_fxattrop (frame, this); - return 0; -} - -int -afr_post_sh_big_lock_failure (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; + ret = syncop_open (this, &loc, O_RDWR|O_LARGEFILE, fd); + if (ret) { + fd_unref (fd); + fd = NULL; + } else { + fd_bind (fd); + } - local = frame->local; - sh = &local->self_heal; + loc_wipe (&loc); - GF_ASSERT (sh->old_loop_frame); - sh_loop_finish (sh->old_loop_frame, this); - sh->old_loop_frame = NULL; - afr_sh_set_timestamps (frame, this); - return 0; + return fd; } int -afr_sh_data_lock (call_frame_t *frame, xlator_t *this, - off_t start, off_t len, gf_boolean_t block, - char *dom, afr_lock_cbk_t success_handler, - afr_lock_cbk_t failure_handler) +afr_selfheal_data (call_frame_t *frame, xlator_t *this, inode_t *inode) { - afr_local_t * local = NULL; - afr_self_heal_t * sh = NULL; - - local = frame->local; - sh = &local->self_heal; - - sh->data_lock_success_handler = success_handler; - sh->data_lock_failure_handler = failure_handler; - sh->data_lock_block = block; - return afr_sh_data_lock_rec (frame, this, dom, start, len); -} + afr_private_t *priv = NULL; + unsigned char *locked_on = NULL; + int ret = 0; + fd_t *fd = NULL; -int -afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int child_index = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - child_index = (long) cookie; - - /* TODO: some of the open's might fail. - In that case, modify cleanup fn to send flush on those - fd's which are already open */ - - LOCK (&frame->lock); - { - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "open of %s failed on child %s (%s)", - local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - } else { - gf_log (this->name, GF_LOG_TRACE, - "open of %s succeeded on child %s", - local->loc.path, - priv->children[child_index]->name); - } - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { - afr_sh_data_fail (frame, this); - return 0; - } - - gf_log (this->name, GF_LOG_TRACE, - "fd for %s opened, commencing sync", - local->loc.path); - - afr_sh_data_lock (frame, this, 0, 0, _gf_true, priv->sh_domain, - afr_sh_dom_lock_success, afr_sh_data_fail); - } - - return 0; -} + priv = this->private; + fd = afr_selfheal_data_open (this, inode); + if (!fd) + return -EIO; -int -afr_sh_data_open (call_frame_t *frame, xlator_t *this) -{ - int i = 0; - int call_count = 0; - fd_t *fd = NULL; - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - call_count = afr_up_children_count (local->child_up, priv->child_count); - local->call_count = call_count; - - fd = fd_create (local->loc.inode, frame->root->pid); - sh->healing_fd = fd; - - /* open sinks */ - for (i = 0; i < priv->child_count; i++) { - if(!local->child_up[i]) - continue; - - STACK_WIND_COOKIE (frame, afr_sh_data_open_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->open, - &local->loc, - O_RDWR|O_LARGEFILE, fd, NULL); - - if (!--call_count) - break; - } - - return 0; -} + locked_on = alloca0 (priv->child_count); -void -afr_sh_non_reg_fix (call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_private_t *priv = NULL; - afr_self_heal_t *sh = NULL; - afr_local_t *local = NULL; - int i = 0; - - if (op_ret < 0) { - afr_sh_data_fail (frame, this); - return; - } - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - for (i = 0; i < priv->child_count ; i++) { - if (1 == local->child_up[i]) - sh->success[i] = 1; - } - - afr_sh_erase_pending (frame, this, AFR_DATA_TRANSACTION, - afr_sh_data_erase_pending_cbk, - afr_sh_data_finish); -} + ret = afr_selfheal_tryinodelk (frame, this, inode, priv->sh_domain, 0, 0, + locked_on); + { + if (ret < 2) { + /* Either less than two subvols available, or another + selfheal (from another server) is in progress. Skip + for now in any case there isn't anything to do. + */ + ret = -ENOTCONN; + goto unlock; + } -int -afr_sh_non_reg_lock_success (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - sh->data_lock_held = _gf_true; - afr_sh_common_lookup (frame, this, &local->loc, - afr_sh_non_reg_fix, NULL, - AFR_LOOKUP_FAIL_CONFLICTS | - AFR_LOOKUP_FAIL_MISSING_GFIDS, - NULL); - return 0; -} + ret = __afr_selfheal_data (frame, this, fd, locked_on); + } +unlock: + afr_selfheal_uninodelk (frame, this, inode, priv->sh_domain, 0, 0, locked_on); -gf_boolean_t -afr_can_start_data_self_heal (afr_self_heal_t *sh, afr_private_t *priv) -{ - if (sh->force_confirm_spb) - return _gf_true; - if (sh->do_data_self_heal && - afr_data_self_heal_enabled (priv->data_self_heal)) - return _gf_true; - return _gf_false; -} + if (fd) + fd_unref (fd); -int -afr_self_heal_data (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = this->private; - int ret = -1; - - local = frame->local; - sh = &local->self_heal; - - sh->sh_type_in_action = AFR_SELF_HEAL_DATA; - - if (afr_can_start_data_self_heal (sh, priv)) { - afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED); - ret = afr_inodelk_init (&local->internal_lock.inodelk[1], - priv->sh_domain, priv->child_count); - if (ret < 0) { - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - afr_sh_data_done (frame, this); - return 0; - } - - if (IA_ISREG (sh->type)) { - afr_sh_data_open (frame, this); - } else { - afr_sh_data_lock (frame, this, 0, 0, _gf_true, - this->name, - afr_sh_non_reg_lock_success, - afr_sh_data_fail); - } - } else { - gf_log (this->name, GF_LOG_TRACE, - "not doing data self heal on %s", - local->loc.path); - afr_sh_data_done (frame, this); - } - - return 0; + return ret; } diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c index 00f1a9cb91b..9605d69f417 100644 --- a/xlators/cluster/afr/src/afr-self-heal-entry.c +++ b/xlators/cluster/afr/src/afr-self-heal-entry.c @@ -1,5 +1,5 @@ /* - Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. This file is licensed to you under your choice of the GNU Lesser @@ -8,2399 +8,624 @@ cases as published by the Free Software Foundation. */ -#include <libgen.h> -#include <unistd.h> -#include <fnmatch.h> -#include <sys/time.h> -#include <stdlib.h> -#include <signal.h> #ifndef _CONFIG_H #define _CONFIG_H #include "config.h" #endif -#include "glusterfs.h" -#include "inode.h" #include "afr.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" +#include "afr-self-heal.h" #include "byte-order.h" - #include "afr-transaction.h" -#include "afr-self-heal.h" -#include "afr-self-heal-common.h" - -#define AFR_INIT_SH_FRAME_VALS(_frame, _local, _sh, _sh_frame, _sh_local, _sh_sh)\ - do {\ - _local = _frame->local;\ - _sh = &_local->self_heal;\ - _sh_frame = _sh->sh_frame;\ - _sh_local = _sh_frame->local;\ - _sh_sh = &_sh_local->self_heal;\ - } while (0); - -int -afr_sh_entry_impunge_create_file (call_frame_t *impunge_frame, xlator_t *this, - int child_index); -int -afr_sh_entry_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - - sh->completion_cbk (frame, this); - - return 0; -} - - -int -afr_sh_entry_unlock (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_internal_lock_t *int_lock = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - - int_lock->lock_cbk = afr_sh_entry_done; - afr_unlock (frame, this); - - return 0; -} - - -int -afr_sh_entry_finish (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - - local = frame->local; - - gf_log (this->name, GF_LOG_TRACE, - "finishing entry selfheal of %s", local->loc.path); - - afr_sh_entry_unlock (frame, this); - - return 0; -} - - -int -afr_sh_entry_erase_pending_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, - int32_t op_errno, dict_t *xattr, dict_t *xdata) -{ - long i = 0; - int call_count = 0; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_local_t *orig_local = NULL; - call_frame_t *orig_frame = NULL; - afr_private_t *priv = NULL; - int32_t read_child = -1; - - local = frame->local; - priv = this->private; - sh = &local->self_heal; - i = (long)cookie; - - - afr_children_add_child (sh->fresh_children, i, priv->child_count); - if (op_ret == -1) { - gf_log (this->name, GF_LOG_INFO, - "%s: failed to erase pending xattrs on %s (%s)", - local->loc.path, priv->children[i]->name, - strerror (op_errno)); - } - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if (sh->source == -1) { - //this happens if the forced merge option is set - read_child = sh->fresh_children[0]; - } else { - read_child = sh->source; - } - afr_inode_set_read_ctx (this, sh->inode, read_child, - sh->fresh_children); - orig_frame = sh->orig_frame; - orig_local = orig_frame->local; - - if (sh->source != -1) { - orig_local->cont.lookup.buf.ia_nlink = sh->buf[sh->source].ia_nlink; - } - - afr_sh_entry_finish (frame, this); - } - - return 0; -} - - -int -afr_sh_entry_erase_pending (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - - if (sh->entries_skipped) { - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - goto out; - } - afr_sh_erase_pending (frame, this, AFR_ENTRY_TRANSACTION, - afr_sh_entry_erase_pending_cbk, - afr_sh_entry_finish); - return 0; -out: - afr_sh_entry_finish (frame, this); - return 0; -} - - - -static int -next_active_source (call_frame_t *frame, xlator_t *this, - int current_active_source) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int source = -1; - int next_active_source = -1; - int i = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - source = sh->source; - - if (source != -1) { - if (current_active_source != source) - next_active_source = source; - goto out; - } - - /* - the next active sink becomes the source for the - 'conservative decision' of merging all entries - */ - - for (i = 0; i < priv->child_count; i++) { - if ((sh->sources[i] == 0) - && (local->child_up[i] == 1) - && (i > current_active_source)) { - - next_active_source = i; - break; - } - } -out: - return next_active_source; -} - static int -next_active_sink (call_frame_t *frame, xlator_t *this, - int current_active_sink) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int next_active_sink = -1; - int i = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - /* - the next active sink becomes the source for the - 'conservative decision' of merging all entries - */ - - for (i = 0; i < priv->child_count; i++) { - if ((sh->sources[i] == 0) - && (local->child_up[i] == 1) - && (i > current_active_sink)) { - - next_active_sink = i; - break; - } - } - - return next_active_sink; -} - -int -afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this); - -int -afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this); +afr_selfheal_entry_delete (call_frame_t *frame, xlator_t *this, inode_t *dir, + const char *name, inode_t *inode, int child, + struct afr_reply *replies) +{ + afr_private_t *priv = NULL; + xlator_t *subvol = NULL; + int ret = 0; + loc_t loc = {0, }; + char g[64]; + + priv = this->private; + + subvol = priv->children[child]; + + loc.parent = inode_ref (dir); + uuid_copy (loc.pargfid, dir->gfid); + loc.name = name; + loc.inode = inode_ref (inode); + + if (replies[child].valid && replies[child].op_ret == 0) { + switch (replies[child].poststat.ia_type) { + case IA_IFDIR: + gf_log (this->name, GF_LOG_WARNING, + "expunging dir %s/%s (%s) on %s", + uuid_utoa (dir->gfid), name, + uuid_utoa_r (replies[child].poststat.ia_gfid, g), + subvol->name); + ret = syncop_rmdir (subvol, &loc, 1); + break; + default: + gf_log (this->name, GF_LOG_WARNING, + "expunging file %s/%s (%s) on %s", + uuid_utoa (dir->gfid), name, + uuid_utoa_r (replies[child].poststat.ia_gfid, g), + subvol->name); + ret = syncop_unlink (subvol, &loc); + break; + } + } + + loc_wipe (&loc); + + return ret; +} + + +int +afr_selfheal_recreate_entry (call_frame_t *frame, xlator_t *this, int dst, + int source, inode_t *dir, const char *name, + inode_t *inode, struct afr_reply *replies) +{ + int ret = 0; + loc_t loc = {0,}; + loc_t srcloc = {0,}; + afr_private_t *priv = NULL; + dict_t *xdata = NULL; + struct iatt *iatt = NULL; + char *linkname = NULL; + mode_t mode = 0; + struct iatt newent = {0,}; + + priv = this->private; + + xdata = dict_new(); + if (!xdata) + return -ENOMEM; + + loc.parent = inode_ref (dir); + uuid_copy (loc.pargfid, dir->gfid); + loc.name = name; + loc.inode = inode_ref (inode); + + ret = afr_selfheal_entry_delete (frame, this, dir, name, inode, dst, + replies); + if (ret) + goto out; + + ret = dict_set_static_bin (xdata, "gfid-req", + replies[source].poststat.ia_gfid, 16); + if (ret) + goto out; + + iatt = &replies[source].poststat; + + srcloc.inode = inode_ref (inode); + uuid_copy (srcloc.gfid, iatt->ia_gfid); + + mode = st_mode_from_ia (iatt->ia_prot, iatt->ia_type); + + switch (iatt->ia_type) { + case IA_IFDIR: + ret = syncop_mkdir (priv->children[dst], &loc, mode, xdata, 0); + break; + case IA_IFLNK: + ret = syncop_lookup (priv->children[dst], &srcloc, 0, 0, 0, 0); + if (ret == 0) { + ret = syncop_link (priv->children[dst], &srcloc, &loc); + } else { + ret = syncop_readlink (priv->children[source], &srcloc, + &linkname, 4096); + if (ret <= 0) + goto out; + ret = syncop_symlink (priv->children[dst], &loc, linkname, + xdata, NULL); + } + break; + default: + ret = dict_set_int32 (xdata, GLUSTERFS_INTERNAL_FOP_KEY, 1); + if (ret) + goto out; + ret = syncop_mknod (priv->children[dst], &loc, mode, + iatt->ia_rdev, xdata, &newent); + if (ret == 0 && iatt->ia_size && !newent.ia_size) { + /* New entry created. Mark @dst pending on all sources */ + ret = 1; + } + break; + } -int -afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this); - -int -afr_sh_entry_expunge_subvol (call_frame_t *frame, xlator_t *this, - int active_src); - -int -afr_sh_entry_expunge_entry_done (call_frame_t *frame, xlator_t *this, - int active_src, int32_t op_ret, - int32_t op_errno) -{ - int call_count = 0; - - call_count = afr_frame_return (frame); - - if (call_count == 0) - afr_sh_entry_expunge_subvol (frame, this, active_src); - - return 0; -} - -int -afr_sh_entry_expunge_parent_setattr_cbk (call_frame_t *expunge_frame, - void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop, - dict_t *xdata) -{ - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; - afr_self_heal_t *expunge_sh = NULL; - call_frame_t *frame = NULL; - int active_src = (long) cookie; - afr_self_heal_t *sh = NULL; - afr_local_t *local = NULL; - - priv = this->private; - expunge_local = expunge_frame->local; - expunge_sh = &expunge_local->self_heal; - frame = expunge_sh->sh_frame; - local = frame->local; - sh = &local->self_heal; - - if (op_ret != 0) { - gf_log (this->name, GF_LOG_ERROR, - "setattr on parent directory of %s on subvolume %s failed: %s", - expunge_local->loc.path, - priv->children[active_src]->name, strerror (op_errno)); - } - - AFR_STACK_DESTROY (expunge_frame); - sh->expunge_done (frame, this, active_src, op_ret, op_errno); - - return 0; -} - - -int -afr_sh_entry_expunge_remove_cbk (call_frame_t *expunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) -{ - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; - afr_self_heal_t *expunge_sh = NULL; - int active_src = 0; - int32_t valid = 0; - - priv = this->private; - expunge_local = expunge_frame->local; - expunge_sh = &expunge_local->self_heal; - - active_src = (long) cookie; - - if (op_ret == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "removed %s on %s", - expunge_local->loc.path, - priv->children[active_src]->name); - } else { - gf_log (this->name, GF_LOG_INFO, - "removing %s on %s failed (%s)", - expunge_local->loc.path, - priv->children[active_src]->name, - strerror (op_errno)); - } - - valid = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; - - STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_parent_setattr_cbk, - (void *) (long) active_src, - priv->children[active_src], - priv->children[active_src]->fops->setattr, - &expunge_sh->parent_loc, - &expunge_sh->parentbuf, - valid, NULL); - - return 0; -} - - -int -afr_sh_entry_expunge_unlink (call_frame_t *expunge_frame, xlator_t *this, - int active_src) -{ - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; - - priv = this->private; - expunge_local = expunge_frame->local; - - gf_log (this->name, GF_LOG_TRACE, - "expunging file %s on %s", - expunge_local->loc.path, priv->children[active_src]->name); - - STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_remove_cbk, - (void *) (long) active_src, - priv->children[active_src], - priv->children[active_src]->fops->unlink, - &expunge_local->loc, 0, NULL); - - return 0; -} - - - -int -afr_sh_entry_expunge_rmdir (call_frame_t *expunge_frame, xlator_t *this, - int active_src) -{ - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; - - priv = this->private; - expunge_local = expunge_frame->local; - - gf_log (this->name, GF_LOG_DEBUG, - "expunging directory %s on %s", - expunge_local->loc.path, priv->children[active_src]->name); - - STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_remove_cbk, - (void *) (long) active_src, - priv->children[active_src], - priv->children[active_src]->fops->rmdir, - &expunge_local->loc, 1, NULL); - - return 0; -} - - -int -afr_sh_entry_expunge_remove (call_frame_t *expunge_frame, xlator_t *this, - int active_src, struct iatt *buf, - struct iatt *parentbuf) -{ - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; - afr_self_heal_t *expunge_sh = NULL; - call_frame_t *frame = NULL; - int type = 0; - afr_self_heal_t *sh = NULL; - afr_local_t *local = NULL; - loc_t *loc = NULL; - - priv = this->private; - expunge_local = expunge_frame->local; - expunge_sh = &expunge_local->self_heal; - frame = expunge_sh->sh_frame; - local = frame->local; - sh = &local->self_heal; - loc = &expunge_local->loc; - - type = buf->ia_type; - if (loc->parent && uuid_is_null (loc->parent->gfid)) - uuid_copy (loc->pargfid, parentbuf->ia_gfid); - - switch (type) { - case IA_IFSOCK: - case IA_IFREG: - case IA_IFBLK: - case IA_IFCHR: - case IA_IFIFO: - case IA_IFLNK: - afr_sh_entry_expunge_unlink (expunge_frame, this, active_src); - break; - case IA_IFDIR: - afr_sh_entry_expunge_rmdir (expunge_frame, this, active_src); - break; - default: - gf_log (this->name, GF_LOG_ERROR, - "%s has unknown file type on %s: 0%o", - expunge_local->loc.path, - priv->children[active_src]->name, type); - goto out; - break; - } - - return 0; -out: - AFR_STACK_DESTROY (expunge_frame); - sh->expunge_done (frame, this, active_src, -1, EINVAL); - - return 0; -} - - -int -afr_sh_entry_expunge_lookup_cbk (call_frame_t *expunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, dict_t *x, - struct iatt *postparent) -{ - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; - afr_self_heal_t *expunge_sh = NULL; - call_frame_t *frame = NULL; - int active_src = 0; - afr_self_heal_t *sh = NULL; - afr_local_t *local = NULL; - - priv = this->private; - expunge_local = expunge_frame->local; - expunge_sh = &expunge_local->self_heal; - frame = expunge_sh->sh_frame; - active_src = (long) cookie; - local = frame->local; - sh = &local->self_heal; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "lookup of %s on %s failed (%s)", - expunge_local->loc.path, - priv->children[active_src]->name, - strerror (op_errno)); - goto out; - } - - afr_sh_entry_expunge_remove (expunge_frame, this, active_src, buf, - postparent); - - return 0; out: - AFR_STACK_DESTROY (expunge_frame); - sh->expunge_done (frame, this, active_src, op_ret, op_errno); - - return 0; + if (xdata) + dict_unref (xdata); + loc_wipe (&loc); + loc_wipe (&srcloc); + return ret; } -int -afr_sh_entry_expunge_purge (call_frame_t *expunge_frame, xlator_t *this, - int active_src) -{ - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; - - priv = this->private; - expunge_local = expunge_frame->local; - - gf_log (this->name, GF_LOG_TRACE, - "looking up %s on %s", - expunge_local->loc.path, priv->children[active_src]->name); - - STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_lookup_cbk, - (void *) (long) active_src, - priv->children[active_src], - priv->children[active_src]->fops->lookup, - &expunge_local->loc, NULL); - - return 0; -} - -int -afr_sh_entry_expunge_entry_cbk (call_frame_t *expunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, dict_t *x, - struct iatt *postparent) -{ - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; - afr_self_heal_t *expunge_sh = NULL; - int source = 0; - call_frame_t *frame = NULL; - int active_src = 0; - int need_expunge = 0; - afr_self_heal_t *sh = NULL; - afr_local_t *local = NULL; - - priv = this->private; - expunge_local = expunge_frame->local; - expunge_sh = &expunge_local->self_heal; - frame = expunge_sh->sh_frame; - active_src = expunge_sh->active_source; - source = (long) cookie; - local = frame->local; - sh = &local->self_heal; - - if (op_ret == -1 && op_errno == ENOENT) - need_expunge = 1; - else if (op_ret == -1) - goto out; - - if (!uuid_is_null (expunge_sh->entrybuf.ia_gfid) && - !uuid_is_null (buf->ia_gfid) && - (uuid_compare (expunge_sh->entrybuf.ia_gfid, buf->ia_gfid) != 0)) { - char uuidbuf1[64]; - char uuidbuf2[64]; - gf_log (this->name, GF_LOG_DEBUG, - "entry %s found on %s with mismatching gfid (%s/%s)", - expunge_local->loc.path, - priv->children[source]->name, - uuid_utoa_r (expunge_sh->entrybuf.ia_gfid, uuidbuf1), - uuid_utoa_r (buf->ia_gfid, uuidbuf2)); - need_expunge = 1; - } - - if (need_expunge) { - gf_log (this->name, GF_LOG_INFO, - "Entry %s is missing on %s and deleting from " - "replica's other bricks", - expunge_local->loc.path, - priv->children[source]->name); - - if (postparent) - expunge_sh->parentbuf = *postparent; - - afr_sh_entry_expunge_purge (expunge_frame, this, active_src); - - return 0; - } - -out: - if (op_ret == 0) { - gf_log (this->name, GF_LOG_TRACE, - "%s exists under %s", - expunge_local->loc.path, - priv->children[source]->name); - } else { - gf_log (this->name, GF_LOG_INFO, - "looking up %s under %s failed (%s)", - expunge_local->loc.path, - priv->children[source]->name, - strerror (op_errno)); - } - - AFR_STACK_DESTROY (expunge_frame); - sh->expunge_done (frame, this, active_src, op_ret, op_errno); - - return 0; -} - -static gf_boolean_t -can_skip_entry_self_heal (char *name, loc_t *parent_loc) -{ - if (strcmp (name, ".") == 0) { - return _gf_true; - } else if (strcmp (name, "..") == 0) { - return _gf_true; - } else if (loc_is_root (parent_loc) && - (strcmp (name, GF_REPLICATE_TRASH_DIR) == 0)) { - return _gf_true; - } - return _gf_false; -} - -int -afr_sh_entry_expunge_entry (call_frame_t *frame, xlator_t *this, - gf_dirent_t *entry) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int ret = -1; - call_frame_t *expunge_frame = NULL; - afr_local_t *expunge_local = NULL; - afr_self_heal_t *expunge_sh = NULL; - int active_src = 0; - int source = 0; - int op_errno = 0; - char *name = NULL; - int op_ret = -1; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - active_src = sh->active_source; - source = sh->source; - sh->expunge_done = afr_sh_entry_expunge_entry_done; - - name = entry->d_name; - if (can_skip_entry_self_heal (name, &local->loc)) { - op_ret = 0; - goto out; - } - - gf_log (this->name, GF_LOG_TRACE, - "inspecting existence of %s under %s", - name, local->loc.path); - - expunge_frame = copy_frame (frame); - if (!expunge_frame) { - op_errno = ENOMEM; - goto out; - } - - AFR_LOCAL_ALLOC_OR_GOTO (expunge_local, out); - - expunge_frame->local = expunge_local; - expunge_sh = &expunge_local->self_heal; - expunge_sh->sh_frame = frame; - expunge_sh->active_source = active_src; - expunge_sh->entrybuf = entry->d_stat; - loc_copy (&expunge_sh->parent_loc, &local->loc); - - ret = afr_build_child_loc (this, &expunge_local->loc, &local->loc, - name); - if (ret != 0) { - op_errno = EINVAL; - goto out; - } - - gf_log (this->name, GF_LOG_TRACE, - "looking up %s on %s", expunge_local->loc.path, - priv->children[source]->name); - - STACK_WIND_COOKIE (expunge_frame, - afr_sh_entry_expunge_entry_cbk, - (void *) (long) source, - priv->children[source], - priv->children[source]->fops->lookup, - &expunge_local->loc, NULL); - - ret = 0; -out: - if (ret == -1) - sh->expunge_done (frame, this, active_src, op_ret, op_errno); - - return 0; -} - - -int -afr_sh_entry_expunge_readdir_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - gf_dirent_t *entries, dict_t *xdata) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - gf_dirent_t *entry = NULL; - off_t last_offset = 0; - int active_src = 0; - int entry_count = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - active_src = sh->active_source; - - if (op_ret <= 0) { - if (op_ret < 0) { - gf_log (this->name, GF_LOG_INFO, - "readdir of %s on subvolume %s failed (%s)", - local->loc.path, - priv->children[active_src]->name, - strerror (op_errno)); - } else { - gf_log (this->name, GF_LOG_TRACE, - "readdir of %s on subvolume %s complete", - local->loc.path, - priv->children[active_src]->name); - } - - afr_sh_entry_expunge_all (frame, this); - return 0; - } - - list_for_each_entry (entry, &entries->list, list) { - last_offset = entry->d_off; - entry_count++; - } - - gf_log (this->name, GF_LOG_TRACE, - "readdir'ed %d entries from %s", - entry_count, priv->children[active_src]->name); - - sh->offset = last_offset; - local->call_count = entry_count; - - list_for_each_entry (entry, &entries->list, list) { - afr_sh_entry_expunge_entry (frame, this, entry); - } - - return 0; -} - -int -afr_sh_entry_expunge_subvol (call_frame_t *frame, xlator_t *this, - int active_src) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - STACK_WIND (frame, afr_sh_entry_expunge_readdir_cbk, - priv->children[active_src], - priv->children[active_src]->fops->readdirp, - sh->healing_fd, sh->block_size, sh->offset, NULL); - - return 0; -} - - -int -afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this) +static int +afr_selfheal_newentry_mark (call_frame_t *frame, xlator_t *this, inode_t *inode, + int source, struct afr_reply *replies, + unsigned char *sources, unsigned char *newentry) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int active_src = -1; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; + int ret = 0; + int i = 0; + afr_private_t *priv = NULL; + dict_t *xattr = NULL; + int **changelog = NULL; + int idx = 0; - sh->offset = 0; + priv = this->private; - if (sh->source == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "no active sources for %s to expunge entries", - local->loc.path); - goto out; - } + idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION); - active_src = next_active_sink (frame, this, sh->active_source); - sh->active_source = active_src; + uuid_copy (inode->gfid, replies[source].poststat.ia_gfid); - if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { - goto out; - } + changelog = afr_matrix_create (priv->child_count, AFR_NUM_CHANGE_LOGS); - if (active_src == -1) { - /* completed creating missing files on all subvolumes */ - goto out; - } + xattr = dict_new(); + if (!xattr) + return -ENOMEM; - gf_log (this->name, GF_LOG_TRACE, - "expunging entries of %s on %s to other sinks", - local->loc.path, priv->children[active_src]->name); + for (i = 0; i < priv->child_count; i++) { + if (!newentry[i]) + continue; + changelog[i][idx] = hton32(1); + } - afr_sh_entry_expunge_subvol (frame, this, active_src); + afr_set_pending_dict (priv, xattr, changelog); - return 0; -out: - afr_sh_entry_impunge_all (frame, this); - return 0; + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + afr_selfheal_post_op (frame, this, inode, i, xattr); + } + dict_unref (xattr); + return ret; } -int -afr_sh_entry_impunge_entry_done (call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - int call_count = 0; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - if (op_ret < 0) - sh->entries_skipped = _gf_true; - call_count = afr_frame_return (frame); - if (call_count == 0) - afr_sh_entry_impunge_subvol (frame, this); - - return 0; +static int +__afr_selfheal_heal_dirent (call_frame_t *frame, xlator_t *this, fd_t *fd, + char *name, inode_t *inode, int source, + unsigned char *sources, unsigned char *healed_sinks, + unsigned char *locked_on, struct afr_reply *replies) +{ + int ret = 0; + afr_private_t *priv = NULL; + int i = 0; + unsigned char *newentry = NULL; + + priv = this->private; + newentry = alloca0 (priv->child_count); + + if (!replies[source].valid) + return -EIO; + + for (i = 0; i < priv->child_count; i++) { + if (!healed_sinks[i]) + continue; + if (replies[source].op_ret == -1 && + replies[source].op_errno == ENOENT) { + ret = afr_selfheal_entry_delete (frame, this, fd->inode, + name, inode, i, replies); + } else { + if (!uuid_compare (replies[i].poststat.ia_gfid, + replies[source].poststat.ia_gfid)) + continue; + + ret = afr_selfheal_recreate_entry (frame, this, i, source, + fd->inode, name, inode, + replies); + if (ret > 0) { + newentry[i] = 1; + ret = 0; + } + } + if (ret < 0) + break; + } + + if (AFR_COUNT (newentry, priv->child_count)) + afr_selfheal_newentry_mark (frame, this, inode, source, replies, + sources, newentry); + return ret; } -void -afr_sh_entry_call_impunge_done (call_frame_t *impunge_frame, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_local_t *impunge_local = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_self_heal_t *impunge_sh = NULL; - call_frame_t *frame = NULL; - AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh, - frame, local, sh); - - AFR_STACK_DESTROY (impunge_frame); - sh->impunge_done (frame, this, op_ret, op_errno); -} - -int -afr_sh_entry_impunge_setattr_cbk (call_frame_t *impunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop, - dict_t *xdata) +static int +__afr_selfheal_merge_dirent (call_frame_t *frame, xlator_t *this, fd_t *fd, + char *name, inode_t *inode, unsigned char *sources, + unsigned char *healed_sinks, unsigned char *locked_on, + struct afr_reply *replies) { - int call_count = 0; - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - int child_index = 0; - - priv = this->private; - impunge_local = impunge_frame->local; - child_index = (long) cookie; - - if (op_ret == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "setattr done for %s on %s", - impunge_local->loc.path, - priv->children[child_index]->name); - } else { - gf_log (this->name, GF_LOG_INFO, - "setattr (%s) on %s failed (%s)", - impunge_local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - } - - call_count = afr_frame_return (impunge_frame); - if (call_count == 0) { - afr_sh_entry_call_impunge_done (impunge_frame, this, - 0, op_errno); - } - - return 0; -} + int ret = 0; + afr_private_t *priv = NULL; + int i = 0; + int source = -1; -int -afr_sh_entry_impunge_parent_setattr_cbk (call_frame_t *setattr_frame, - void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop, - dict_t *xdata) -{ - int call_count = 0; - afr_local_t *setattr_local = NULL; - - setattr_local = setattr_frame->local; - if (op_ret != 0) { - gf_log (this->name, GF_LOG_INFO, - "setattr on parent directory (%s) failed: %s", - setattr_local->loc.path, strerror (op_errno)); - } - - call_count = afr_frame_return (setattr_frame); - if (call_count == 0) - AFR_STACK_DESTROY (setattr_frame); - return 0; -} + priv = this->private; -int -afr_sh_entry_impunge_setattr (call_frame_t *impunge_frame, xlator_t *this) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_local_t *setattr_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - call_frame_t *setattr_frame = NULL; - int32_t valid = 0; - int32_t op_errno = 0; - int child_index = 0; - int call_count = 0; - int i = 0; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - - gf_log (this->name, GF_LOG_DEBUG, - "setting ownership of %s on %s to %d/%d", - impunge_local->loc.path, - priv->children[child_index]->name, - impunge_sh->entrybuf.ia_uid, - impunge_sh->entrybuf.ia_gid); - - setattr_frame = copy_frame (impunge_frame); - if (!setattr_frame) { - op_errno = ENOMEM; - goto out; - } - AFR_LOCAL_ALLOC_OR_GOTO (setattr_frame->local, out); - setattr_local = setattr_frame->local; - call_count = afr_errno_count (NULL, impunge_sh->child_errno, - priv->child_count, 0); - loc_copy (&setattr_local->loc, &impunge_sh->parent_loc); - impunge_local->call_count = call_count; - setattr_local->call_count = call_count; - for (i = 0; i < priv->child_count; i++) { - if (impunge_sh->child_errno[i]) - continue; - valid = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; - STACK_WIND_COOKIE (setattr_frame, - afr_sh_entry_impunge_parent_setattr_cbk, - (void *) (long) i, priv->children[i], - priv->children[i]->fops->setattr, - &setattr_local->loc, - &impunge_sh->parentbuf, valid, NULL); - - valid = GF_SET_ATTR_UID | GF_SET_ATTR_GID | - GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; - STACK_WIND_COOKIE (impunge_frame, - afr_sh_entry_impunge_setattr_cbk, - (void *) (long) i, priv->children[i], - priv->children[i]->fops->setattr, - &impunge_local->loc, - &impunge_sh->entrybuf, valid, NULL); - call_count--; - } - GF_ASSERT (!call_count); - return 0; -out: - if (setattr_frame) - AFR_STACK_DESTROY (setattr_frame); - afr_sh_entry_call_impunge_done (impunge_frame, this, 0, op_errno); - return 0; -} + for (i = 0; i < priv->child_count; i++) { + if (replies[i].valid && replies[i].op_ret == 0) { + source = i; + break; + } + } -int -afr_sh_entry_impunge_xattrop_cbk (call_frame_t *impunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - dict_t *xattr, dict_t *xdata) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - int child_index = 0; - int call_count = -1; - - priv = this->private; - impunge_local = impunge_frame->local; - - child_index = (long) cookie; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_INFO, - "%s: failed to perform xattrop on %s (%s)", - impunge_local->loc.path, - priv->children[child_index]->name, strerror (op_errno)); - - LOCK (&impunge_frame->lock); - { - impunge_local->op_ret = -1; - impunge_local->op_errno = op_errno; - } - UNLOCK (&impunge_frame->lock); - } - - call_count = afr_frame_return (impunge_frame); - - if (call_count == 0) { - if (impunge_local->op_ret == 0) { - afr_sh_entry_impunge_setattr (impunge_frame, this); - } else { - afr_sh_entry_call_impunge_done (impunge_frame, this, - -1, impunge_local->op_errno); - } - } - return 0; -} + if (source == -1) { + /* entry got deleted in the mean time? */ + return 0; + } -int -afr_sh_entry_impunge_perform_xattrop (call_frame_t *impunge_frame, - xlator_t *this) -{ - int active_src = 0; - dict_t *xattr = NULL; - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int32_t op_errno = 0; - int32_t call_count = 0; - int32_t i = 0; - - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - active_src = impunge_sh->active_source; - impunge_local->op_ret = 0; - - afr_prepare_new_entry_pending_matrix (impunge_local->pending, - afr_is_errno_unset, - impunge_sh->child_errno, - &impunge_sh->entrybuf, - priv->child_count); - xattr = dict_new (); - if (!xattr) { - op_errno = ENOMEM; - goto out; - } - - afr_set_pending_dict (priv, xattr, impunge_local->pending, active_src, - LOCAL_LAST); - - for (i = 0; i < priv->child_count; i++) { - if ((impunge_sh->child_errno[i] == EEXIST) && - (impunge_local->child_up[i] == 1)) - - call_count++; - } - - impunge_local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - - if ((impunge_sh->child_errno[i] == EEXIST) - && (impunge_local->child_up[i] == 1)) { - - - STACK_WIND_COOKIE (impunge_frame, - afr_sh_entry_impunge_xattrop_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &impunge_local->loc, - GF_XATTROP_ADD_ARRAY, xattr, NULL); - if (!--call_count) - break; - } - } - - if (xattr) - dict_unref (xattr); - return 0; -out: - afr_sh_entry_call_impunge_done (impunge_frame, this, - -1, op_errno); - return 0; -} + for (i = 0; i < priv->child_count; i++) { + if (i == source || !healed_sinks[i]) + continue; -int -afr_sh_entry_impunge_newfile_cbk (call_frame_t *impunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) -{ - int call_count = 0; - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int child_index = 0; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - - child_index = (long) cookie; - - if (op_ret == -1) { - impunge_sh->child_errno[child_index] = op_errno; - gf_log (this->name, GF_LOG_ERROR, - "creation of %s on %s failed (%s)", - impunge_local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - } else { - impunge_sh->child_errno[child_index] = 0; - } - - call_count = afr_frame_return (impunge_frame); - if (call_count == 0) { - if (!afr_errno_count (NULL, impunge_sh->child_errno, - priv->child_count, 0)) { - // new_file creation failed every where - afr_sh_entry_call_impunge_done (impunge_frame, this, - -1, op_errno); - goto out; - } - afr_sh_entry_impunge_perform_xattrop (impunge_frame, this); - } -out: - return 0; -} + if (replies[i].op_errno != ENOENT) + continue; -int -afr_sh_entry_impunge_hardlink_cbk (call_frame_t *impunge_frame, void *cookie, - xlator_t *this, int32_t op_ret, - int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) -{ - int call_count = 0; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - - if (IA_IFLNK == impunge_sh->entrybuf.ia_type) { - //For symlinks impunge is attempted un-conditionally - //So the file can already exist. - if ((op_ret < 0) && (op_errno == EEXIST)) - op_ret = 0; - } - - call_count = afr_frame_return (impunge_frame); - if (call_count == 0) - afr_sh_entry_call_impunge_done (impunge_frame, this, - op_ret, op_errno); - - return 0; -} + ret = afr_selfheal_recreate_entry (frame, this, i, source, + fd->inode, name, inode, + replies); + } -int -afr_sh_entry_impunge_hardlink (call_frame_t *impunge_frame, xlator_t *this, - int child_index) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - loc_t *loc = NULL; - struct iatt *buf = NULL; - loc_t oldloc = {0}; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - loc = &impunge_local->loc; - buf = &impunge_sh->entrybuf; - - oldloc.inode = inode_ref (loc->inode); - uuid_copy (oldloc.gfid, buf->ia_gfid); - gf_log (this->name, GF_LOG_DEBUG, "linking missing file %s on %s", - loc->path, priv->children[child_index]->name); - - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_hardlink_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->link, - &oldloc, loc, NULL); - loc_wipe (&oldloc); - - return 0; + return ret; } -int -afr_sh_nameless_lookup_cbk (call_frame_t *impunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, dict_t *xattr, - struct iatt *postparent) -{ - if (op_ret < 0) { - afr_sh_entry_impunge_create_file (impunge_frame, this, - (long)cookie); - } else { - afr_sh_entry_impunge_hardlink (impunge_frame, this, - (long)cookie); - } - return 0; -} -int -afr_sh_entry_impunge_check_hardlink (call_frame_t *impunge_frame, - xlator_t *this, - int child_index, struct iatt *stbuf) +static int +__afr_selfheal_entry_dirent (call_frame_t *frame, xlator_t *this, fd_t *fd, + char *name, inode_t *inode, int source, + unsigned char *sources, unsigned char *healed_sinks, + unsigned char *locked_on, struct afr_reply *replies) { - afr_private_t *priv = NULL; - call_frame_t *frame = NULL; - afr_local_t *impunge_local = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *impunge_sh = NULL; - afr_self_heal_t *sh = NULL; - loc_t *loc = NULL; - dict_t *xattr_req = NULL; - loc_t oldloc = {0}; - int ret = -1; - - priv = this->private; - AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh, - frame, local, sh); - loc = &impunge_local->loc; - - xattr_req = dict_new (); - if (!xattr_req) - goto out; - oldloc.inode = inode_ref (loc->inode); - uuid_copy (oldloc.gfid, stbuf->ia_gfid); - - STACK_WIND_COOKIE (impunge_frame, afr_sh_nameless_lookup_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->lookup, - &oldloc, xattr_req); - ret = 0; -out: - if (xattr_req) - dict_unref (xattr_req); - loc_wipe (&oldloc); - if (ret) - sh->impunge_done (frame, this, -1, ENOMEM); - return 0; -} + int ret = -1; -int -afr_sh_entry_impunge_mknod (call_frame_t *impunge_frame, xlator_t *this, - int child_index, struct iatt *stbuf) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - dict_t *dict = NULL; - int ret = 0; - - priv = this->private; - impunge_local = impunge_frame->local; - - gf_log (this->name, GF_LOG_DEBUG, - "creating missing file %s on %s", - impunge_local->loc.path, - priv->children[child_index]->name); - - dict = dict_new (); - if (!dict) - gf_log (this->name, GF_LOG_ERROR, "Out of memory"); - - GF_ASSERT (!uuid_is_null (stbuf->ia_gfid)); - ret = afr_set_dict_gfid (dict, stbuf->ia_gfid); - if (ret) - gf_log (this->name, GF_LOG_INFO, "%s: gfid set failed", - impunge_local->loc.path); - - /* - * Reason for adding GLUSTERFS_INTERNAL_FOP_KEY : - * - * Problem: - * While a brick is down in a replica pair, lets say the user creates - * one file(file-A) and a hard link to that file(h-file-A). After the - * brick comes back up, entry self-heal is attempted on parent dir of - * these two files. As part of readdir in self-heal it reads both the - * entries file-A and h-file-A for both of them it does name less lookup - * to check if there are any hardlinks already present in the - * destination brick. It finds that there are no hard links already - * present for files file-A, h-file-A. Self-heal does mknods for both - * file-A and h-file-A. This leads to file-A and h-file-A not being - * hardlinks anymore. - * - * Fix: (More like shrinking of race-window, the race itself is still - * present in posix-mknod). - * If mknod comes with the presence of GLUSTERFS_INTERNAL_FOP_KEY then - * posix_mknod checks if there are already any gfid-links and does - * link() instead of mknod. There still can be a race where two - * posix_mknods same gfid see that - * gfid-link file is not present and proceeds with mknods and result in - * two different files with same gfid. - */ - ret = dict_set_str (dict, GLUSTERFS_INTERNAL_FOP_KEY, "yes"); - if (ret) - gf_log (this->name, GF_LOG_INFO, "%s: %s set failed", - impunge_local->loc.path, GLUSTERFS_INTERNAL_FOP_KEY); - - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->mknod, - &impunge_local->loc, - st_mode_from_ia (stbuf->ia_prot, stbuf->ia_type), - makedev (ia_major (stbuf->ia_rdev), - ia_minor (stbuf->ia_rdev)), 0, dict); - - if (dict) - dict_unref (dict); - - return 0; + if (source < 0) + ret = __afr_selfheal_merge_dirent (frame, this, fd, name, inode, + sources, healed_sinks, + locked_on, replies); + else + ret = __afr_selfheal_heal_dirent (frame, this, fd, name, inode, + source, sources, healed_sinks, + locked_on, replies); + return ret; } - -int -afr_sh_entry_impunge_mkdir (call_frame_t *impunge_frame, xlator_t *this, - int child_index, struct iatt *stbuf) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - dict_t *dict = NULL; - - int ret = 0; - - priv = this->private; - impunge_local = impunge_frame->local; - - dict = dict_new (); - if (!dict) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - return 0; - } - - GF_ASSERT (!uuid_is_null (stbuf->ia_gfid)); - ret = afr_set_dict_gfid (dict, stbuf->ia_gfid); - if (ret) - gf_log (this->name, GF_LOG_INFO, "%s: gfid set failed", - impunge_local->loc.path); - - gf_log (this->name, GF_LOG_DEBUG, - "creating missing directory %s on %s", - impunge_local->loc.path, - priv->children[child_index]->name); - - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->mkdir, - &impunge_local->loc, - st_mode_from_ia (stbuf->ia_prot, stbuf->ia_type), - 0, dict); - - if (dict) - dict_unref (dict); - - return 0; +static int +afr_selfheal_entry_dirent (call_frame_t *frame, xlator_t *this, fd_t *fd, + int source, unsigned char *sources, + unsigned char *healed_sinks, char *name) +{ + afr_private_t *priv = NULL; + int ret = 0; + unsigned char *locked_on = NULL; + struct afr_reply *replies = NULL; + inode_t *inode = NULL; + + priv = this->private; + + locked_on = alloca0 (priv->child_count); + + replies = alloca0 (priv->child_count * sizeof(*replies)); + + ret = afr_selfheal_entrylk (frame, this, fd->inode, this->name, + name, locked_on); + { + if (ret < 2) { + ret = -ENOTCONN; + goto unlock; + } + + inode = afr_selfheal_unlocked_lookup_on (frame, fd->inode, name, + replies, locked_on); + if (!inode) { + ret = -ENOMEM; + goto unlock; + } + + ret = __afr_selfheal_entry_dirent (frame, this, fd, name, inode, + source, sources, healed_sinks, + locked_on, replies); + } +unlock: + afr_selfheal_unentrylk (frame, this, fd->inode, this->name, name, + locked_on); + if (inode) + inode_unref (inode); + return ret; } -int -afr_sh_entry_impunge_symlink (call_frame_t *impunge_frame, xlator_t *this, - int child_index, const char *linkname) +static int +afr_selfheal_entry_do_subvol (call_frame_t *frame, xlator_t *this, fd_t *fd, + int child, int source, unsigned char *sources, + unsigned char *healed_sinks) { - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - dict_t *dict = NULL; - struct iatt *buf = NULL; - int ret = 0; - - priv = this->private; - impunge_local = impunge_frame->local; - - buf = &impunge_local->cont.dir_fop.buf; - - dict = dict_new (); - if (!dict) { - afr_sh_entry_call_impunge_done (impunge_frame, this, - -1, ENOMEM); - goto out; - } - - GF_ASSERT (!uuid_is_null (buf->ia_gfid)); - ret = afr_set_dict_gfid (dict, buf->ia_gfid); - if (ret) - gf_log (this->name, GF_LOG_INFO, - "%s: dict set gfid failed", - impunge_local->loc.path); - - gf_log (this->name, GF_LOG_DEBUG, - "creating missing symlink %s -> %s on %s", - impunge_local->loc.path, linkname, - priv->children[child_index]->name); - - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->symlink, - linkname, &impunge_local->loc, 0, dict); - - if (dict) - dict_unref (dict); -out: - return 0; -} + int ret = 0; + gf_dirent_t entries; + gf_dirent_t *entry = NULL; + off_t offset = 0; + call_frame_t *iter_frame = NULL; + xlator_t *subvol = NULL; + afr_private_t *priv = NULL; + priv = this->private; + subvol = priv->children[child]; -int -afr_sh_entry_impunge_symlink_unlink_cbk (call_frame_t *impunge_frame, - void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int child_index = -1; - int call_count = -1; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - - child_index = (long) cookie; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_INFO, - "unlink of %s on %s failed (%s)", - impunge_local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - goto out; - } - - afr_sh_entry_impunge_symlink (impunge_frame, this, child_index, - impunge_sh->linkname); - - return 0; -out: - LOCK (&impunge_frame->lock); - { - call_count = --impunge_local->call_count; - } - UNLOCK (&impunge_frame->lock); + INIT_LIST_HEAD (&entries.list); - if (call_count == 0) - afr_sh_entry_call_impunge_done (impunge_frame, this, - op_ret, op_errno); + iter_frame = afr_copy_frame (frame); + if (!iter_frame) + return -ENOMEM; - return 0; -} + while ((ret = syncop_readdir (subvol, fd, 131072, offset, &entries))) { + if (ret > 0) + ret = 0; + list_for_each_entry (entry, &entries.list, list) { + offset = entry->d_off; + if (!strcmp (entry->d_name, ".") || + !strcmp (entry->d_name, "..")) + continue; -int -afr_sh_entry_impunge_symlink_unlink (call_frame_t *impunge_frame, xlator_t *this, - int child_index) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; + if (__is_root_gfid (fd->inode->gfid) && + !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) + continue; - priv = this->private; - impunge_local = impunge_frame->local; + ret = afr_selfheal_entry_dirent (iter_frame, this, fd, + source, sources, + healed_sinks, + entry->d_name); + AFR_STACK_RESET (iter_frame); - gf_log (this->name, GF_LOG_DEBUG, - "unlinking symlink %s with wrong target on %s", - impunge_local->loc.path, - priv->children[child_index]->name); + if (ret) + break; + } - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_symlink_unlink_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->unlink, - &impunge_local->loc, 0, NULL); + gf_dirent_free (&entries); + if (ret) + break; + } - return 0; + AFR_STACK_DESTROY (iter_frame); + return ret; } - -int -afr_sh_entry_impunge_readlink_sink_cbk (call_frame_t *impunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - const char *linkname, struct iatt *sbuf, dict_t *xdata) +static int +afr_selfheal_entry_do (call_frame_t *frame, xlator_t *this, fd_t *fd, + int source, unsigned char *sources, + unsigned char *healed_sinks, + struct afr_reply *locked_replies) { - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int child_index = -1; - int call_count = -1; - int active_src = -1; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - active_src = impunge_sh->active_source; - - child_index = (long) cookie; - - if ((op_ret == -1) && (!afr_inode_missing(op_errno))) { - gf_log (this->name, GF_LOG_INFO, - "readlink of %s on %s failed (%s)", - impunge_local->loc.path, - priv->children[active_src]->name, - strerror (op_errno)); - goto out; - } - - /* symlink doesn't exist on the sink */ - - if ((op_ret == -1) && (afr_inode_missing(op_errno))) { - afr_sh_entry_impunge_symlink (impunge_frame, this, - child_index, impunge_sh->linkname); - return 0; - } - - - /* symlink exists on the sink, so check if targets match */ - - if (strcmp (linkname, impunge_sh->linkname) == 0) { - /* targets match, nothing to do */ - - goto out; - } else { - /* - * Hah! Sneaky wolf in sheep's clothing! - */ - afr_sh_entry_impunge_symlink_unlink (impunge_frame, this, - child_index); - return 0; - } + int i = 0; + afr_private_t *priv = NULL; + int ret = 0; -out: - LOCK (&impunge_frame->lock); - { - call_count = --impunge_local->call_count; - } - UNLOCK (&impunge_frame->lock); + priv = this->private; - if (call_count == 0) - afr_sh_entry_call_impunge_done (impunge_frame, this, - op_ret, op_errno); + gf_log (this->name, GF_LOG_INFO, "performing entry selfheal on %s", + uuid_utoa (fd->inode->gfid)); - return 0; + for (i = 0; i < priv->child_count; i++) { + if (i != source && !healed_sinks[i]) + continue; + ret = afr_selfheal_entry_do_subvol (frame, this, fd, i, source, + sources, healed_sinks); + if (ret) + break; + } + return ret; } -int -afr_sh_entry_impunge_readlink_sink (call_frame_t *impunge_frame, xlator_t *this, - int child_index) +static int +__afr_selfheal_entry_finalize_source (xlator_t *this, unsigned char *sources, + unsigned char *sinks, + unsigned char *locked_on, + struct afr_reply *replies) { - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; + int i = 0; + afr_private_t *priv = NULL; + int source = -1; + int locked_count = 0; + int sources_count = 0; + int sinks_count = 0; - priv = this->private; - impunge_local = impunge_frame->local; + priv = this->private; - gf_log (this->name, GF_LOG_DEBUG, - "checking symlink target of %s on %s", - impunge_local->loc.path, priv->children[child_index]->name); + locked_count = AFR_COUNT (locked_on, priv->child_count); + sources_count = AFR_COUNT (sources, priv->child_count); + sinks_count = AFR_COUNT (sinks, priv->child_count); - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_readlink_sink_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->readlink, - &impunge_local->loc, 4096, NULL); + if (locked_count == sinks_count || !sources_count) { + return -1; + } - return 0; -} - - -int -afr_sh_entry_impunge_readlink_cbk (call_frame_t *impunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - const char *linkname, struct iatt *sbuf, dict_t *xdata) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int child_index = -1; - int call_count = -1; - int active_src = -1; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - active_src = impunge_sh->active_source; - - child_index = (long) cookie; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_INFO, - "readlink of %s on %s failed (%s)", - impunge_local->loc.path, - priv->children[active_src]->name, - strerror (op_errno)); - goto out; - } - - impunge_sh->linkname = gf_strdup (linkname); - afr_sh_entry_impunge_readlink_sink (impunge_frame, this, child_index); - - return 0; + for (i = 0; i < priv->child_count; i++) { + if (sources[i]) { + source = i; + break; + } + } -out: - LOCK (&impunge_frame->lock); - { - call_count = --impunge_local->call_count; - } - UNLOCK (&impunge_frame->lock); - - if (call_count == 0) - afr_sh_entry_call_impunge_done (impunge_frame, this, - op_ret, op_errno); - - return 0; + return source; } -int -afr_sh_entry_impunge_readlink (call_frame_t *impunge_frame, xlator_t *this, - int child_index, struct iatt *stbuf) +static int +__afr_selfheal_entry_prepare (call_frame_t *frame, xlator_t *this, fd_t *fd, + unsigned char *locked_on, unsigned char *sources, + unsigned char *sinks, unsigned char *healed_sinks, + struct afr_reply *replies, int *source_p) { - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int active_src = -1; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - active_src = impunge_sh->active_source; - impunge_local->cont.dir_fop.buf = *stbuf; - - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_readlink_cbk, - (void *) (long) child_index, - priv->children[active_src], - priv->children[active_src]->fops->readlink, - &impunge_local->loc, 4096, NULL); - - return 0; -} + int ret = -1; + int source = -1; + afr_private_t *priv = NULL; + int i = 0; -int -afr_sh_entry_impunge_create (call_frame_t *impunge_frame, xlator_t *this, - int child_index) -{ - call_frame_t *frame = NULL; - afr_local_t *impunge_local = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *impunge_sh = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - ia_type_t type = IA_INVAL; - int active_src = 0; - struct iatt *buf = NULL; - - AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh, - frame, local, sh); - active_src = impunge_sh->active_source; - afr_update_loc_gfids (&impunge_local->loc, &impunge_sh->entrybuf, - &impunge_sh->parentbuf); - - buf = &impunge_sh->entrybuf; - type = buf->ia_type; - - switch (type) { - case IA_IFSOCK: - case IA_IFREG: - case IA_IFBLK: - case IA_IFCHR: - case IA_IFIFO: - case IA_IFLNK: - afr_sh_entry_impunge_check_hardlink (impunge_frame, this, - child_index, buf); - break; - case IA_IFDIR: - afr_sh_entry_impunge_mkdir (impunge_frame, this, - child_index, buf); - break; - default: - gf_log (this->name, GF_LOG_ERROR, - "%s has unknown file type on %s: 0%o", - impunge_local->loc.path, - priv->children[active_src]->name, type); - sh->impunge_done (frame, this, -1, EINVAL); - break; - } - - return 0; -} + priv = this->private; -int -afr_sh_entry_impunge_create_file (call_frame_t *impunge_frame, xlator_t *this, - int child_index) -{ - call_frame_t *frame = NULL; - afr_local_t *impunge_local = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *impunge_sh = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - ia_type_t type = IA_INVAL; - int active_src = 0; - struct iatt *buf = NULL; - - AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh, - frame, local, sh); - active_src = impunge_sh->active_source; - buf = &impunge_sh->entrybuf; - type = buf->ia_type; - - switch (type) { - case IA_IFSOCK: - case IA_IFREG: - case IA_IFBLK: - case IA_IFCHR: - case IA_IFIFO: - afr_sh_entry_impunge_mknod (impunge_frame, this, - child_index, buf); - break; - case IA_IFLNK: - afr_sh_entry_impunge_readlink (impunge_frame, this, - child_index, buf); - break; - default: - gf_log (this->name, GF_LOG_ERROR, - "%s has unknown file type on %s: 0%o", - impunge_local->loc.path, - priv->children[active_src]->name, type); - sh->impunge_done (frame, this, -1, EINVAL); - break; - } - - return 0; -} + ret = afr_selfheal_unlocked_discover (frame, fd->inode, fd->inode->gfid, + replies); + if (ret) + return ret; -gf_boolean_t -afr_sh_need_recreate (afr_self_heal_t *impunge_sh, unsigned int child, - unsigned int child_count) -{ - gf_boolean_t recreate = _gf_false; + ret = afr_selfheal_find_direction (frame, this, replies, + AFR_ENTRY_TRANSACTION, + locked_on, sources, sinks); + if (ret) + return ret; - GF_ASSERT (impunge_sh->child_errno); + source = __afr_selfheal_entry_finalize_source (this, sources, sinks, + locked_on, replies); + if (source < 0) { + /* If source is < 0 (typically split-brain), we perform a + conservative merge of entries rather than erroring out */ + } + *source_p = source; - if (child == impunge_sh->active_source) - goto out; + for (i = 0; i < priv->child_count; i++) + /* Initialize the healed_sinks[] array optimistically to + the intersection of to-be-healed (i.e sinks[]) and + the list of servers which are up (i.e locked_on[]). - if (IA_IFLNK == impunge_sh->entrybuf.ia_type) { - recreate = _gf_true; - goto out; - } + As we encounter failures in the healing process, we + will unmark the respective servers in the healed_sinks[] + array. + */ + healed_sinks[i] = sinks[i] && locked_on[i]; - if (impunge_sh->child_errno[child] == ENOENT) - recreate = _gf_true; -out: - return recreate; + return ret; } -unsigned int -afr_sh_recreate_count (afr_self_heal_t *impunge_sh, int *sources, - unsigned int child_count) -{ - int count = 0; - int i = 0; - - for (i = 0; i < child_count; i++) { - if (afr_sh_need_recreate (impunge_sh, i, child_count)) - count++; - } - - return count; -} -int -afr_sh_entry_call_impunge_recreate (call_frame_t *impunge_frame, - xlator_t *this) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - call_frame_t *frame = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - unsigned int recreate_count = 0; - int i = 0; - int active_src = 0; - - priv = this->private; - AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh, - frame, local, sh); - active_src = impunge_sh->active_source; - impunge_sh->entrybuf = impunge_sh->buf[active_src]; - impunge_sh->parentbuf = impunge_sh->parentbufs[active_src]; - recreate_count = afr_sh_recreate_count (impunge_sh, sh->sources, - priv->child_count); - if (!recreate_count) { - afr_sh_entry_call_impunge_done (impunge_frame, this, 0, 0); - goto out; - } - impunge_local->call_count = recreate_count; - for (i = 0; i < priv->child_count; i++) { - if (!impunge_local->child_up[i]) { - impunge_sh->child_errno[i] = ENOTCONN; - continue; - } - if (!afr_sh_need_recreate (impunge_sh, i, priv->child_count)) { - impunge_sh->child_errno[i] = EEXIST; - continue; - } - } - for (i = 0; i < priv->child_count; i++) { - if (!afr_sh_need_recreate (impunge_sh, i, priv->child_count)) - continue; - (void)afr_sh_entry_impunge_create (impunge_frame, this, i); - recreate_count--; - } - GF_ASSERT (!recreate_count); +static int +__afr_selfheal_entry (call_frame_t *frame, xlator_t *this, fd_t *fd, + unsigned char *locked_on) +{ + afr_private_t *priv = NULL; + int ret = -1; + unsigned char *sources = NULL; + unsigned char *sinks = NULL; + unsigned char *data_lock = NULL; + unsigned char *healed_sinks = NULL; + struct afr_reply *locked_replies = NULL; + int source = -1; + + priv = this->private; + + sources = alloca0 (priv->child_count); + sinks = alloca0 (priv->child_count); + healed_sinks = alloca0 (priv->child_count); + data_lock = alloca0 (priv->child_count); + + locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count); + + ret = afr_selfheal_entrylk (frame, this, fd->inode, this->name, NULL, + data_lock); + { + if (ret < 2) { + ret = -ENOTCONN; + goto unlock; + } + + ret = __afr_selfheal_entry_prepare (frame, this, fd, data_lock, + sources, sinks, healed_sinks, + locked_replies, &source); + } +unlock: + afr_selfheal_unentrylk (frame, this, fd->inode, this->name, NULL, + data_lock); + if (ret < 0) + goto out; + + ret = afr_selfheal_entry_do (frame, this, fd, source, sources, + healed_sinks, locked_replies); + if (ret) + goto out; + + ret = afr_selfheal_undo_pending (frame, this, fd->inode, sources, sinks, + healed_sinks, AFR_ENTRY_TRANSACTION, + locked_replies, data_lock); out: - return 0; + return ret; } -void -afr_sh_entry_common_lookup_done (call_frame_t *impunge_frame, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - call_frame_t *frame = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - unsigned int gfid_miss_count = 0; - unsigned int children_up_count = 0; - uuid_t gfid = {0}; - int active_src = 0; - - priv = this->private; - AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh, - frame, local, sh); - active_src = impunge_sh->active_source; - - if (op_ret < 0) - goto done; - if (impunge_sh->child_errno[active_src]) { - op_ret = -1; - op_errno = impunge_sh->child_errno[active_src]; - goto done; - } - - gfid_miss_count = afr_gfid_missing_count (this->name, - impunge_sh->success_children, - impunge_sh->buf, priv->child_count, - impunge_local->loc.path); - children_up_count = afr_up_children_count (impunge_local->child_up, - priv->child_count); - if ((gfid_miss_count == children_up_count) && - (children_up_count < priv->child_count)) { - op_ret = -1; - op_errno = ENODATA; - gf_log (this->name, GF_LOG_ERROR, "Not all children are up, " - "gfid should not be assigned in this state for %s", - impunge_local->loc.path); - goto done; - } - - if (gfid_miss_count) { - afr_update_gfid_from_iatts (gfid, impunge_sh->buf, - impunge_sh->success_children, - priv->child_count); - if (uuid_is_null (gfid)) { - sh->entries_skipped = _gf_true; - gf_log (this->name, GF_LOG_INFO, "%s: Skipping entry " - "self-heal because of gfid absence", - impunge_local->loc.path); - goto done; - } - afr_sh_common_lookup (impunge_frame, this, &impunge_local->loc, - afr_sh_entry_common_lookup_done, gfid, - AFR_LOOKUP_FAIL_CONFLICTS | - AFR_LOOKUP_FAIL_MISSING_GFIDS, - NULL); - } else { - afr_sh_entry_call_impunge_recreate (impunge_frame, this); - } - return; -done: - afr_sh_entry_call_impunge_done (impunge_frame, this, - op_ret, op_errno); - return; -} -int -afr_sh_entry_impunge_entry (call_frame_t *frame, xlator_t *this, - gf_dirent_t *entry) +static fd_t * +afr_selfheal_data_opendir (xlator_t *this, inode_t *inode) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_self_heal_t *impunge_sh = NULL; - int ret = -1; - call_frame_t *impunge_frame = NULL; - afr_local_t *impunge_local = NULL; - int active_src = 0; - int op_errno = 0; - int op_ret = -1; - - local = frame->local; - sh = &local->self_heal; - - active_src = sh->active_source; - sh->impunge_done = afr_sh_entry_impunge_entry_done; - - if (can_skip_entry_self_heal (entry->d_name, &local->loc)) { - op_ret = 0; - goto out; - } - - gf_log (this->name, GF_LOG_TRACE, - "inspecting existence of %s under %s", - entry->d_name, local->loc.path); - - ret = afr_impunge_frame_create (frame, this, active_src, - &impunge_frame); - if (ret) { - op_errno = -ret; - goto out; - } - - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - ret = afr_build_child_loc (this, &impunge_local->loc, &local->loc, - entry->d_name); - loc_copy (&impunge_sh->parent_loc, &local->loc); - if (ret != 0) { - op_errno = ENOMEM; - goto out; - } - - afr_sh_common_lookup (impunge_frame, this, &impunge_local->loc, - afr_sh_entry_common_lookup_done, NULL, - AFR_LOOKUP_FAIL_CONFLICTS, NULL); - - op_ret = 0; -out: - if (ret) { - if (impunge_frame) - AFR_STACK_DESTROY (impunge_frame); - sh->impunge_done (frame, this, op_ret, op_errno); - } + loc_t loc = {0,}; + int ret = 0; + fd_t *fd = NULL; - return 0; -} + fd = fd_create (inode, 0); + if (!fd) + return NULL; + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); -int -afr_sh_entry_impunge_readdir_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - gf_dirent_t *entries, dict_t *xdata) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - gf_dirent_t *entry = NULL; - off_t last_offset = 0; - int active_src = 0; - int entry_count = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - active_src = sh->active_source; - - if (op_ret <= 0) { - if (op_ret < 0) { - gf_log (this->name, GF_LOG_INFO, - "readdir of %s on subvolume %s failed (%s)", - local->loc.path, - priv->children[active_src]->name, - strerror (op_errno)); - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - } else { - gf_log (this->name, GF_LOG_TRACE, - "readdir of %s on subvolume %s complete", - local->loc.path, - priv->children[active_src]->name); - } - - afr_sh_entry_impunge_all (frame, this); - return 0; - } - - list_for_each_entry (entry, &entries->list, list) { - last_offset = entry->d_off; - entry_count++; - } - - gf_log (this->name, GF_LOG_DEBUG, - "readdir'ed %d entries from %s", - entry_count, priv->children[active_src]->name); - - sh->offset = last_offset; - local->call_count = entry_count; - - list_for_each_entry (entry, &entries->list, list) { - afr_sh_entry_impunge_entry (frame, this, entry); - } - - return 0; -} + ret = syncop_opendir (this, &loc, fd); + if (ret) { + fd_unref (fd); + fd = NULL; + } else { + fd_bind (fd); + } + loc_wipe (&loc); -int -afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int32_t active_src = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - active_src = sh->active_source; - gf_log (this->name, GF_LOG_DEBUG, "%s: readdir from offset %zd", - local->loc.path, sh->offset); - - STACK_WIND (frame, afr_sh_entry_impunge_readdir_cbk, - priv->children[active_src], - priv->children[active_src]->fops->readdirp, - sh->healing_fd, sh->block_size, sh->offset, NULL); - - return 0; + return fd; } -int -afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int active_src = -1; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - sh->offset = 0; - - active_src = next_active_source (frame, this, sh->active_source); - sh->active_source = active_src; - - if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { - afr_sh_entry_finish (frame, this); - return 0; - } - - if (active_src == -1) { - /* completed creating missing files on all subvolumes */ - afr_sh_entry_erase_pending (frame, this); - return 0; - } - - gf_log (this->name, GF_LOG_TRACE, - "impunging entries of %s on %s to other sinks", - local->loc.path, priv->children[active_src]->name); - - afr_sh_entry_impunge_subvol (frame, this); - - return 0; -} - int -afr_sh_entry_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) +afr_selfheal_entry (call_frame_t *frame, xlator_t *this, inode_t *inode) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int child_index = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - child_index = (long) cookie; - - /* TODO: some of the open's might fail. - In that case, modify cleanup fn to send flush on those - fd's which are already open */ - - LOCK (&frame->lock); - { - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "opendir of %s failed on child %s (%s)", - local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - } - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { - afr_sh_entry_finish (frame, this); - return 0; - } - gf_log (this->name, GF_LOG_TRACE, - "fd for %s opened, commencing sync", - local->loc.path); - - sh->active_source = -1; - afr_sh_entry_expunge_all (frame, this); - } - - return 0; -} - - -int -afr_sh_entry_open (call_frame_t *frame, xlator_t *this) -{ - int i = 0; - int call_count = 0; - - int source = -1; - int *sources = NULL; + afr_private_t *priv = NULL; + unsigned char *locked_on = NULL; + fd_t *fd = NULL; + int ret = 0; - fd_t *fd = NULL; + priv = this->private; - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - afr_self_heal_t *sh = NULL; + fd = afr_selfheal_data_opendir (this, inode); + if (!fd) + return -EIO; - local = frame->local; - sh = &local->self_heal; - priv = this->private; + locked_on = alloca0 (priv->child_count); - source = local->self_heal.source; - sources = local->self_heal.sources; + ret = afr_selfheal_tryentrylk (frame, this, inode, priv->sh_domain, NULL, + locked_on); + { + if (ret < 2) { + /* Either less than two subvols available, or another + selfheal (from another server) is in progress. Skip + for now in any case there isn't anything to do. + */ + ret = -ENOTCONN; + goto unlock; + } - sh->block_size = priv->sh_readdir_size; - sh->offset = 0; + ret = __afr_selfheal_entry (frame, this, fd, locked_on); + } +unlock: + afr_selfheal_unentrylk (frame, this, inode, priv->sh_domain, NULL, locked_on); - call_count = sh->active_sinks; - if (source != -1) - call_count++; + if (fd) + fd_unref (fd); - local->call_count = call_count; - - fd = fd_create (local->loc.inode, frame->root->pid); - sh->healing_fd = fd; - - if (source != -1) { - gf_log (this->name, GF_LOG_TRACE, - "opening directory %s on subvolume %s (source)", - local->loc.path, priv->children[source]->name); - - /* open source */ - STACK_WIND_COOKIE (frame, afr_sh_entry_opendir_cbk, - (void *) (long) source, - priv->children[source], - priv->children[source]->fops->opendir, - &local->loc, fd, NULL); - call_count--; - } - - /* open sinks */ - for (i = 0; i < priv->child_count; i++) { - if (sources[i] || !local->child_up[i]) - continue; - - gf_log (this->name, GF_LOG_TRACE, - "opening directory %s on subvolume %s (sink)", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_sh_entry_opendir_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->opendir, - &local->loc, fd, NULL); - - if (!--call_count) - break; - } - - return 0; -} - - -int -afr_sh_entry_sync_prepare (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int source = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - source = sh->source; - - afr_sh_mark_source_sinks (frame, this); - if (source != -1) - sh->success[source] = 1; - - if (sh->active_sinks == 0) { - gf_log (this->name, GF_LOG_TRACE, - "no active sinks for self-heal on dir %s", - local->loc.path); - afr_sh_entry_finish (frame, this); - return 0; - } - if (source == -1 && sh->active_sinks < 2) { - gf_log (this->name, GF_LOG_TRACE, - "cannot sync with 0 sources and 1 sink on dir %s", - local->loc.path); - afr_sh_entry_finish (frame, this); - return 0; - } - - if (source != -1) - gf_log (this->name, GF_LOG_DEBUG, - "self-healing directory %s from subvolume %s to " - "%d other", - local->loc.path, priv->children[source]->name, - sh->active_sinks); - else - gf_log (this->name, GF_LOG_DEBUG, - "no active sources for %s found. " - "merging all entries as a conservative decision", - local->loc.path); - - sh->actual_sh_started = _gf_true; - afr_set_self_heal_status (sh, AFR_SELF_HEAL_SYNC_BEGIN); - afr_sh_entry_open (frame, this); - - return 0; -} - - -void -afr_sh_entry_fix (call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int source = 0; - int nsources = 0; - int32_t subvol_status = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - if (op_ret < 0) { - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - afr_sh_set_error (sh, op_errno); - afr_sh_entry_finish (frame, this); - goto out; - } - - if (sh->forced_merge) { - sh->source = -1; - goto heal; - } - - nsources = afr_build_sources (this, sh->xattr, sh->buf, - sh->pending_matrix, sh->sources, - sh->success_children, - AFR_ENTRY_TRANSACTION, &subvol_status, - _gf_true); - if ((subvol_status & ALL_FOOLS) || - (subvol_status & SPLIT_BRAIN)) { - gf_log (this->name, GF_LOG_INFO, "%s: Performing conservative " - "merge", local->loc.path); - source = -1; - memset (sh->sources, 0, - sizeof (*sh->sources) * priv->child_count); - } else if (nsources == 0) { - gf_log (this->name, GF_LOG_TRACE, - "No self-heal needed for %s", - local->loc.path); - - afr_sh_entry_finish (frame, this); - return; - } else { - source = afr_sh_select_source (sh->sources, priv->child_count); - } - - sh->source = source; - - afr_reset_children (sh->fresh_children, priv->child_count); - afr_get_fresh_children (sh->success_children, sh->sources, - sh->fresh_children, priv->child_count); - if (sh->source >= 0) - afr_inode_set_read_ctx (this, sh->inode, sh->source, - sh->fresh_children); - -heal: - afr_sh_entry_sync_prepare (frame, this); -out: - return; -} - -int -afr_sh_post_nonblocking_entry_cbk (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - sh = &local->self_heal; - - if (int_lock->lock_op_ret < 0) { - gf_log (this->name, GF_LOG_ERROR, "Non Blocking entrylks " - "failed for %s.", local->loc.path); - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - afr_sh_entry_done (frame, this); - } else { - - gf_log (this->name, GF_LOG_DEBUG, "Non Blocking entrylks done " - "for %s. Proceeding to FOP", local->loc.path); - afr_sh_common_lookup (frame, this, &local->loc, - afr_sh_entry_fix, NULL, - AFR_LOOKUP_FAIL_CONFLICTS | - AFR_LOOKUP_FAIL_MISSING_GFIDS, - NULL); - } - - return 0; -} - -int -afr_self_heal_entry (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_self_heal_t *sh = NULL; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - sh->sh_type_in_action = AFR_SELF_HEAL_ENTRY; - - if (local->self_heal.do_entry_self_heal && priv->entry_self_heal) { - afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED); - afr_sh_entrylk (frame, this, &local->loc, NULL, - afr_sh_post_nonblocking_entry_cbk); - } else { - gf_log (this->name, GF_LOG_TRACE, - "proceeding to completion on %s", - local->loc.path); - afr_sh_entry_done (frame, this); - } - - return 0; + return ret; } diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c index fd5da6cfd33..b31a33237f5 100644 --- a/xlators/cluster/afr/src/afr-self-heal-metadata.c +++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c @@ -1,5 +1,5 @@ /* - Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. This file is licensed to you under your choice of the GNU Lesser @@ -8,763 +8,274 @@ cases as published by the Free Software Foundation. */ -#include <libgen.h> -#include <unistd.h> -#include <fnmatch.h> -#include <sys/time.h> -#include <stdlib.h> -#include <signal.h> #ifndef _CONFIG_H #define _CONFIG_H #include "config.h" #endif -#include "glusterfs.h" #include "afr.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" -#include "byte-order.h" - -#include "afr-transaction.h" #include "afr-self-heal.h" -#include "afr-self-heal-common.h" - - -int -afr_sh_metadata_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - - afr_sh_reset (frame, this); - if (IA_ISDIR (sh->type)) { - gf_log (this->name, GF_LOG_DEBUG, - "proceeding to entry check on %s", - local->loc.path); - afr_self_heal_entry (frame, this); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "proceeding to data check on %s", - local->loc.path); - afr_self_heal_data (frame, this); - } - - return 0; -} - -int -afr_sh_inode_unlock (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - - int_lock->lock_cbk = afr_sh_metadata_done; - afr_unlock (frame, this); - - return 0; -} - -int -afr_sh_metadata_finish (call_frame_t *frame, xlator_t *this) -{ - afr_sh_inode_unlock (frame, this); - - return 0; -} - -int -afr_sh_metadata_fail (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - afr_sh_metadata_finish (frame, this); - return 0; -} - -int -afr_sh_metadata_erase_pending_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, - int32_t op_errno, dict_t *xattr, dict_t *xdata) -{ - afr_local_t *local = NULL; - int call_count = 0; - long i = 0; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - - local = frame->local; - priv = this->private; - sh = &local->self_heal; - i = (long)cookie; - - if ((!IA_ISREG (sh->buf[sh->source].ia_type)) && - (!IA_ISDIR (sh->buf[sh->source].ia_type))) { - afr_children_add_child (sh->fresh_children, i, - priv->child_count); - } - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if ((!IA_ISREG (sh->buf[sh->source].ia_type)) && - (!IA_ISDIR (sh->buf[sh->source].ia_type))) { - afr_inode_set_read_ctx (this, sh->inode, sh->source, - sh->fresh_children); - } - afr_sh_metadata_finish (frame, this); - } - - return 0; -} - -int -afr_sh_metadata_erase_pending (call_frame_t *frame, xlator_t *this) -{ - afr_sh_erase_pending (frame, this, AFR_METADATA_TRANSACTION, - afr_sh_metadata_erase_pending_cbk, - afr_sh_metadata_finish); - return 0; -} - - -int -afr_sh_metadata_sync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int child_index = 0; - - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - gf_log (this->name, GF_LOG_INFO, - "setting attributes failed for %s on %s (%s)", - local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - - sh->success[child_index] = 0; - } - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if (local->xattr_req) { - dict_unref (local->xattr_req); - local->xattr_req = NULL; - } - afr_sh_metadata_erase_pending (frame, this); - } - - return 0; -} - - -int -afr_sh_metadata_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop, dict_t *xdata) -{ - afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno, xdata); - - return 0; -} - - -int -afr_sh_metadata_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno, xdata); - - return 0; -} - -int -afr_sh_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - dict_t *xdata) -{ - int i = 0; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - - priv = this->private; - local = frame->local; - - if (op_ret < 0) { - afr_sh_metadata_sync_cbk (frame, cookie, - this, -1, op_errno, xdata); - goto out; - } - - i = (long) cookie; - - STACK_WIND_COOKIE (frame, afr_sh_metadata_xattr_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->setxattr, - &local->loc, local->xattr_req, 0, NULL); - - out: - return 0; -} - -inline void -afr_prune_special_keys (dict_t *xattr_dict) -{ - dict_del (xattr_dict, GF_SELINUX_XATTR_KEY); -} - -inline void -afr_prune_pending_keys (dict_t *xattr_dict, afr_private_t *priv) -{ - int i = 0; - - for (; i < priv->child_count; i++) { - dict_del (xattr_dict, priv->pending_key[i]); - } -} - -int -afr_sh_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xattr, - dict_t *xdata) -{ - int i = 0; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - - priv = this->private; - local = frame->local; - - if (op_ret < 0) { - afr_sh_metadata_sync_cbk (frame, cookie, - this, -1, op_errno, xdata); - goto out; - } - - afr_prune_pending_keys (xattr, priv); - - afr_prune_special_keys (xattr); - - i = (long) cookie; +#include "byte-order.h" - /* send removexattr in bulk via xdata */ - STACK_WIND_COOKIE (frame, afr_sh_removexattr_cbk, - cookie, - priv->children[i], - priv->children[i]->fops->removexattr, - &local->loc, "", xattr); - out: - return 0; -} +#define AFR_HEAL_ATTR (GF_SET_ATTR_UID|GF_SET_ATTR_GID|GF_SET_ATTR_MODE) int -afr_sh_metadata_sync (call_frame_t *frame, xlator_t *this, dict_t *xattr) +afr_selfheal_metadata_do (call_frame_t *frame, xlator_t *this, inode_t *inode, + int source, unsigned char *healed_sinks, + struct afr_reply *locked_replies) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int source = 0; - int active_sinks = 0; - int call_count = 0; - int i = 0; - - struct iatt stbuf = {0,}; - int32_t valid = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - source = sh->source; - active_sinks = sh->active_sinks; - - /* - * 2 calls per sink - setattr, setxattr - */ - if (xattr) { - call_count = active_sinks * 2; - local->xattr_req = dict_ref (xattr); - } else - call_count = active_sinks; - - local->call_count = call_count; - - stbuf.ia_atime = sh->buf[source].ia_atime; - stbuf.ia_atime_nsec = sh->buf[source].ia_atime_nsec; - stbuf.ia_mtime = sh->buf[source].ia_mtime; - stbuf.ia_mtime_nsec = sh->buf[source].ia_mtime_nsec; - - stbuf.ia_uid = sh->buf[source].ia_uid; - stbuf.ia_gid = sh->buf[source].ia_gid; - - stbuf.ia_type = sh->buf[source].ia_type; - stbuf.ia_prot = sh->buf[source].ia_prot; - - valid = GF_SET_ATTR_MODE | - GF_SET_ATTR_UID | GF_SET_ATTR_GID | - GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; - - for (i = 0; i < priv->child_count; i++) { - if (call_count == 0) { - break; - } - if (sh->sources[i] || !local->child_up[i]) - continue; - - gf_log (this->name, GF_LOG_DEBUG, - "self-healing metadata of %s from %s to %s", - local->loc.path, priv->children[source]->name, - priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_sh_metadata_setattr_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->setattr, - &local->loc, &stbuf, valid, NULL); - - call_count--; - - if (!xattr) - continue; - - STACK_WIND_COOKIE (frame, afr_sh_getxattr_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->getxattr, - &local->loc, NULL, NULL); - call_count--; - } - - return 0; + int ret = -1; + loc_t loc = {0,}; + dict_t *xattr = NULL; + dict_t *old_xattr = NULL; + afr_private_t *priv = NULL; + int i = 0; + + priv = this->private; + + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); + + gf_log (this->name, GF_LOG_INFO, "performing metadata selfheal on %s", + uuid_utoa (inode->gfid)); + + ret = syncop_getxattr (priv->children[source], &loc, &xattr, NULL); + if (ret < 0) { + loc_wipe (&loc); + return -EIO; + } + + afr_filter_xattrs (xattr); + dict_del (xattr, GF_SELINUX_XATTR_KEY); + + for (i = 0; i < priv->child_count; i++) { + if (!healed_sinks[i]) + continue; + + ret = syncop_setattr (priv->children[i], &loc, + &locked_replies[source].poststat, + AFR_HEAL_ATTR, NULL, NULL); + if (ret) + healed_sinks[i] = 0; + + old_xattr = NULL; + ret = syncop_getxattr (priv->children[i], &loc, &old_xattr, 0); + if (old_xattr) { + dict_del (old_xattr, GF_SELINUX_XATTR_KEY); + afr_filter_xattrs (old_xattr); + ret = syncop_removexattr (priv->children[i], &loc, "", + old_xattr); + } + + ret = syncop_setxattr (priv->children[i], &loc, xattr, 0); + if (ret) + healed_sinks[i] = 0; + } + + loc_wipe (&loc); + if (xattr) + dict_unref (xattr); + + return 0; } -int -afr_sh_metadata_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xattr, - dict_t *xdata) +/* + * Look for mismatching uid/gid or mode even if xattrs don't say so, and + * pick one arbitrarily as winner. + */ + +static int +__afr_selfheal_metadata_finalize_source (xlator_t *this, unsigned char *sources, + unsigned char *sinks, + unsigned char *locked_on, + struct afr_reply *replies) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int source = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - source = sh->source; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "getxattr of %s failed on subvolume %s (%s). proceeding without xattr", - local->loc.path, priv->children[source]->name, - strerror (op_errno)); - - afr_sh_metadata_sync (frame, this, NULL); - } else { - afr_prune_pending_keys (xattr, priv); - afr_sh_metadata_sync (frame, this, xattr); - } - - return 0; + int i = 0; + afr_private_t *priv = NULL; + struct iatt first = {0, }; + int source = -1; + int locked_count = 0; + int sources_count = 0; + int sinks_count = 0; + + priv = this->private; + + locked_count = AFR_COUNT (locked_on, priv->child_count); + sources_count = AFR_COUNT (sources, priv->child_count); + sinks_count = AFR_COUNT (sinks, priv->child_count); + + if (locked_count == sinks_count || !sources_count) { + if (!priv->metadata_splitbrain_forced_heal) { + return -EIO; + } + /* Metadata split brain, select one subvol + arbitrarily */ + for (i = 0; i < priv->child_count; i++) { + if (locked_on[i] && sinks[i]) { + sources[i] = 1; + sinks[i] = 0; + break; + } + } + } + + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + if (source == -1) { + source = i; + first = replies[i].poststat; + } + } + + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + if (!IA_EQUAL (first, replies[i].poststat, type) || + !IA_EQUAL (first, replies[i].poststat, uid) || + !IA_EQUAL (first, replies[i].poststat, gid) || + !IA_EQUAL (first, replies[i].poststat, prot)) { + sources[i] = 0; + sinks[i] = 1; + } + } + + return source; } -static void -afr_set_metadata_sh_info_str (afr_local_t *local, afr_self_heal_t *sh, - xlator_t *this) -{ - afr_private_t *priv = NULL; - int i = 0; - char num[1024] = {0}; - size_t len = 0; - char *string = NULL; - size_t off = 0; - char *source_child = " from source %s to"; - char *format = " %s, "; - char *string_msg = " metadata self heal"; - char *pending_matrix_str = NULL; - int down_child_present = 0; - int unknown_child_present = 0; - char *down_subvol_1 = " down subvolume is "; - char *unknown_subvol_1 = " unknown subvolume is"; - char *down_subvol_2 = " down subvolumes are "; - char *unknown_subvol_2 = " unknown subvolumes are "; - int down_count = 0; - int unknown_count = 0; - - priv = this->private; - - pending_matrix_str = afr_get_pending_matrix_str (sh->pending_matrix, - this); - - if (!pending_matrix_str) - pending_matrix_str = ""; - - len += snprintf (num, sizeof (num), "%s", string_msg); - - for (i = 0; i < priv->child_count; i++) { - if ((sh->source == i) && (local->child_up[i] == 1)) { - len += snprintf (num, sizeof (num), source_child, - priv->children[i]->name); - } else if ((local->child_up[i] == 1) && (sh->sources[i] == 0)) { - len += snprintf (num, sizeof (num), format, - priv->children[i]->name); - } else if (local->child_up[i] == 0) { - len += snprintf (num, sizeof (num), format, - priv->children[i]->name); - if (!down_child_present) - down_child_present = 1; - down_count++; - } else if (local->child_up[i] == -1) { - len += snprintf (num, sizeof (num), format, - priv->children[i]->name); - if (!unknown_child_present) - unknown_child_present = 1; - unknown_count++; - } - } - - if (down_child_present) { - if (down_count > 1) { - len += snprintf (num, sizeof (num), "%s", - down_subvol_2); - } else { - len += snprintf (num, sizeof (num), "%s", - down_subvol_1); - } - } - if (unknown_child_present) { - if (unknown_count > 1) { - len += snprintf (num, sizeof (num), "%s", - unknown_subvol_2); - } else { - len += snprintf (num, sizeof (num), "%s", - unknown_subvol_1); - } - } - - len ++; - - string = GF_CALLOC (len, sizeof (char), gf_common_mt_char); - if (!string) - return; - - off += snprintf (string + off, len - off, "%s", string_msg); - for (i=0; i < priv->child_count; i++) { - if ((sh->source == i) && (local->child_up[i] == 1)) - off += snprintf (string + off, len - off, source_child, - priv->children[i]->name); - } - - for (i = 0; i < priv->child_count; i++) { - if ((local->child_up[i] == 1)&& (sh->sources[i] == 0)) - off += snprintf (string + off, len - off, format, - priv->children[i]->name); - } - - if (down_child_present) { - if (down_count > 1) { - off += snprintf (string + off, len - off, "%s", - down_subvol_2); - } else { - off += snprintf (string + off, len - off, "%s", - down_subvol_1); - } - } - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] == 0) - off += snprintf (string + off, len - off, format, - priv->children[i]->name); - } - - if (unknown_child_present) { - if (unknown_count > 1) { - off += snprintf (string + off, len - off, "%s", - unknown_subvol_2); - } else { - off += snprintf (string + off, len - off, "%s", - unknown_subvol_1); - } - } - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] == -1) - off += snprintf (string + off, len - off, format, - priv->children[i]->name); - } - - gf_asprintf (&sh->metadata_sh_info, "%s metadata %s,", string, - pending_matrix_str); - - if (pending_matrix_str && strcmp (pending_matrix_str, "")) - GF_FREE (pending_matrix_str); - - if (string && strcmp (string, "")) - GF_FREE (string); -} -int -afr_sh_metadata_sync_prepare (call_frame_t *frame, xlator_t *this) +static int +__afr_selfheal_metadata_prepare (call_frame_t *frame, xlator_t *this, inode_t *inode, + unsigned char *locked_on, unsigned char *sources, + unsigned char *sinks, unsigned char *healed_sinks, + struct afr_reply *replies) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int source = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - source = sh->source; - - afr_sh_mark_source_sinks (frame, this); - if (sh->active_sinks == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "no active sinks for performing self-heal on file %s", - local->loc.path); - afr_sh_metadata_finish (frame, this); - return 0; - } - - gf_log (this->name, GF_LOG_TRACE, - "syncing metadata of %s from subvolume %s to %d active sinks", - local->loc.path, priv->children[source]->name, - sh->active_sinks); - - sh->actual_sh_started = _gf_true; - afr_set_self_heal_status (sh, AFR_SELF_HEAL_SYNC_BEGIN); - afr_set_metadata_sh_info_str (local, sh, this); - STACK_WIND (frame, afr_sh_metadata_getxattr_cbk, - priv->children[source], - priv->children[source]->fops->getxattr, - &local->loc, NULL, NULL); - - return 0; + int ret = -1; + int source = -1; + afr_private_t *priv = NULL; + int i = 0; + + priv = this->private; + + ret = afr_selfheal_unlocked_discover (frame, inode, inode->gfid, + replies); + if (ret) + return ret; + + ret = afr_selfheal_find_direction (frame, this, replies, + AFR_METADATA_TRANSACTION, + locked_on, sources, sinks); + if (ret) + return ret; + + source = __afr_selfheal_metadata_finalize_source (this, sources, sinks, + locked_on, replies); + if (source < 0) + return -EIO; + + for (i = 0; i < priv->child_count; i++) + /* Initialize the healed_sinks[] array optimistically to + the intersection of to-be-healed (i.e sinks[]) and + the list of servers which are up (i.e locked_on[]). + + As we encounter failures in the healing process, we + will unmark the respective servers in the healed_sinks[] + array. + */ + healed_sinks[i] = sinks[i] && locked_on[i]; + + return source; } -void -afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno) +static int +__afr_selfheal_metadata (call_frame_t *frame, xlator_t *this, inode_t *inode, + unsigned char *locked_on) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int nsources = 0; - int source = 0; - int i = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - if (op_ret < 0) { - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - afr_sh_set_error (sh, op_errno); - afr_sh_metadata_finish (frame, this); - goto out; - } - nsources = afr_build_sources (this, sh->xattr, sh->buf, - sh->pending_matrix, sh->sources, - sh->success_children, - AFR_METADATA_TRANSACTION, NULL, _gf_false); - if ((nsources == -1) - && (priv->favorite_child != -1) - && (sh->child_errno[priv->favorite_child] == 0)) { - - gf_log (this->name, GF_LOG_WARNING, - "Picking favorite child %s as authentic source to resolve conflicting metadata of %s", - priv->children[priv->favorite_child]->name, - local->loc.path); - - sh->sources[priv->favorite_child] = 1; - - nsources = afr_sh_source_count (sh->sources, - priv->child_count); - } - - if (nsources == -1) { - afr_sh_print_split_brain_log (sh->pending_matrix, this, - local->loc.path); - afr_set_split_brain (this, sh->inode, SPB, DONT_KNOW); - afr_sh_metadata_fail (frame, this); - goto out; - } - - afr_set_split_brain (this, sh->inode, NO_SPB, DONT_KNOW); - if (nsources == 0) { - gf_log (this->name, GF_LOG_TRACE, - "No self-heal needed for %s", - local->loc.path); - - afr_sh_metadata_finish (frame, this); - goto out; - } - - source = afr_sh_select_source (sh->sources, priv->child_count); - - if (source == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "No active sources found."); - - afr_sh_metadata_finish (frame, this); - goto out; - } - - sh->source = source; - - /* detect changes not visible through pending flags -- JIC */ - for (i = 0; i < priv->child_count; i++) { - if (i == source || sh->child_errno[i]) - continue; - - if (PERMISSION_DIFFERS (&sh->buf[i], &sh->buf[source])) - sh->sources[i] = 0; - - if (OWNERSHIP_DIFFERS (&sh->buf[i], &sh->buf[source])) - sh->sources[i] = 0; - } - - if ((!IA_ISREG (sh->buf[source].ia_type)) && - (!IA_ISDIR (sh->buf[source].ia_type))) { - afr_reset_children (sh->fresh_children, priv->child_count); - afr_get_fresh_children (sh->success_children, sh->sources, - sh->fresh_children, priv->child_count); - afr_inode_set_read_ctx (this, sh->inode, sh->source, - sh->fresh_children); - } - - if (sh->do_metadata_self_heal && priv->metadata_self_heal) - afr_sh_metadata_sync_prepare (frame, this); - else - afr_sh_metadata_finish (frame, this); + afr_private_t *priv = NULL; + int ret = -1; + unsigned char *sources = NULL; + unsigned char *sinks = NULL; + unsigned char *data_lock = NULL; + unsigned char *healed_sinks = NULL; + struct afr_reply *locked_replies = NULL; + int source = -1; + + priv = this->private; + + sources = alloca0 (priv->child_count); + sinks = alloca0 (priv->child_count); + healed_sinks = alloca0 (priv->child_count); + data_lock = alloca0 (priv->child_count); + + locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count); + + ret = afr_selfheal_inodelk (frame, this, inode, this->name, + LLONG_MAX - 1, 0, data_lock); + { + if (ret < 2) { + ret = -ENOTCONN; + goto unlock; + } + + ret = __afr_selfheal_metadata_prepare (frame, this, inode, data_lock, + sources, sinks, healed_sinks, + locked_replies); + if (ret < 0) + goto unlock; + + source = ret; + ret = 0; + } +unlock: + afr_selfheal_uninodelk (frame, this, inode, this->name, + LLONG_MAX -1, 0, data_lock); + if (ret < 0) + goto out; + + ret = afr_selfheal_metadata_do (frame, this, inode, source, healed_sinks, + locked_replies); + if (ret) + goto out; + + ret = afr_selfheal_undo_pending (frame, this, inode, sources, sinks, + healed_sinks, AFR_METADATA_TRANSACTION, + locked_replies, data_lock); out: - return; -} - -int -afr_sh_metadata_post_nonblocking_inodelk_cbk (call_frame_t *frame, - xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - - if (int_lock->lock_op_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, "Non Blocking metadata " - "inodelks failed for %s.", local->loc.path); - gf_log (this->name, GF_LOG_DEBUG, "Metadata self-heal " - "failed for %s.", local->loc.path); - afr_sh_metadata_done (frame, this); - } else { - - gf_log (this->name, GF_LOG_DEBUG, "Non Blocking metadata " - "inodelks done for %s. Proceeding to FOP", - local->loc.path); - afr_sh_common_lookup (frame, this, &local->loc, - afr_sh_metadata_fix, NULL, - AFR_LOOKUP_FAIL_CONFLICTS | - AFR_LOOKUP_FAIL_MISSING_GFIDS, - NULL); - } - - return 0; + return ret; } -int -afr_sh_metadata_lock (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_inodelk_t *inodelk = NULL; - afr_local_t *local = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - - int_lock->domain = this->name; - inodelk = afr_get_inodelk (int_lock, int_lock->domain); - int_lock->transaction_lk_type = AFR_SELFHEAL_LK; - int_lock->selfheal_lk_type = AFR_METADATA_SELF_HEAL_LK; - - afr_set_lock_number (frame, this); - - inodelk->flock.l_start = LLONG_MAX - 1; - inodelk->flock.l_len = 0; - inodelk->flock.l_type = F_WRLCK; - int_lock->lock_cbk = afr_sh_metadata_post_nonblocking_inodelk_cbk; - - afr_nonblocking_inodelk (frame, this); - - return 0; -} - -gf_boolean_t -afr_can_start_metadata_self_heal (afr_self_heal_t *sh, afr_private_t *priv) -{ - if (sh->force_confirm_spb) - return _gf_true; - if (sh->do_metadata_self_heal && priv->metadata_self_heal) - return _gf_true; - return _gf_false; -} int -afr_self_heal_metadata (call_frame_t *frame, xlator_t *this) +afr_selfheal_metadata (call_frame_t *frame, xlator_t *this, inode_t *inode) { - afr_local_t *local = NULL; - afr_private_t *priv = this->private; - afr_self_heal_t *sh = &local->self_heal; - - local = frame->local; - sh = &local->self_heal; - sh->sh_type_in_action = AFR_SELF_HEAL_METADATA; - - if (afr_can_start_metadata_self_heal (sh, priv)) { - afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED); - afr_sh_metadata_lock (frame, this); - } else { - afr_sh_metadata_done (frame, this); - } - - return 0; + afr_private_t *priv = NULL; + unsigned char *locked_on = NULL; + int ret = 0; + + priv = this->private; + + locked_on = alloca0 (priv->child_count); + + ret = afr_selfheal_tryinodelk (frame, this, inode, priv->sh_domain, 0, 0, + locked_on); + { + if (ret < 2) { + /* Either less than two subvols available, or another + selfheal (from another server) is in progress. Skip + for now in any case there isn't anything to do. + */ + ret = -ENOTCONN; + goto unlock; + } + + ret = __afr_selfheal_metadata (frame, this, inode, locked_on); + } +unlock: + afr_selfheal_uninodelk (frame, this, inode, priv->sh_domain, 0, 0, locked_on); + + return ret; } diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c new file mode 100644 index 00000000000..ce80b8da393 --- /dev/null +++ b/xlators/cluster/afr/src/afr-self-heal-name.c @@ -0,0 +1,457 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "afr.h" +#include "afr-self-heal.h" + + +int +__afr_selfheal_assign_gfid (call_frame_t *frame, xlator_t *this, inode_t *parent, + uuid_t pargfid, const char *bname, inode_t *inode, + struct afr_reply *replies, int gfid_idx) +{ + int i = 0; + afr_private_t *priv = NULL; + dict_t *xdata = NULL; + int ret = 0; + loc_t loc = {0, }; + + priv = this->private; + + uuid_copy (parent->gfid, pargfid); + + xdata = dict_new (); + if (!xdata) { + return -ENOMEM; + } + + ret = dict_set_static_bin (xdata, "gfid-req", + replies[gfid_idx].poststat.ia_gfid, 16); + if (ret) { + dict_destroy (xdata); + return -ENOMEM; + } + + loc.parent = inode_ref (parent); + loc.inode = inode_ref (inode); + uuid_copy (loc.pargfid, pargfid); + loc.name = bname; + + for (i = 0; i < priv->child_count; i++) { + if (replies[i].op_ret == 0 || replies[i].op_errno != ENODATA) + continue; + + ret = syncop_lookup (priv->children[i], &loc, xdata, 0, 0, 0); + } + + loc_wipe (&loc); + dict_unref (xdata); + + return ret; +} + + +int +__afr_selfheal_name_impunge (call_frame_t *frame, xlator_t *this, inode_t *parent, + uuid_t pargfid, const char *bname, inode_t *inode, + struct afr_reply *replies, int gfid_idx) +{ + int i = 0; + afr_private_t *priv = NULL; + int ret = 0; + + priv = this->private; + + uuid_copy (parent->gfid, pargfid); + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + + if (uuid_compare (replies[i].poststat.ia_gfid, + replies[gfid_idx].poststat.ia_gfid) == 0) + continue; + + ret |= afr_selfheal_recreate_entry (frame, this, i, gfid_idx, + parent, bname, inode, replies); + } + + return ret; +} + + +int +__afr_selfheal_name_expunge (call_frame_t *frame, xlator_t *this, inode_t *parent, + uuid_t pargfid, const char *bname, inode_t *inode, + struct afr_reply *replies) +{ + loc_t loc = {0, }; + int i = 0; + afr_private_t *priv = NULL; + char g[64]; + int ret = 0; + + priv = this->private; + + loc.parent = inode_ref (parent); + uuid_copy (loc.pargfid, pargfid); + loc.name = bname; + loc.inode = inode_ref (inode); + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + + if (replies[i].op_ret) + continue; + + switch (replies[i].poststat.ia_type) { + case IA_IFDIR: + gf_log (this->name, GF_LOG_WARNING, + "expunging dir %s/%s (%s) on %s", + uuid_utoa (pargfid), bname, + uuid_utoa_r (replies[i].poststat.ia_gfid, g), + priv->children[i]->name); + ret |= syncop_rmdir (priv->children[i], &loc, 1); + break; + default: + gf_log (this->name, GF_LOG_WARNING, + "expunging file %s/%s (%s) on %s", + uuid_utoa (pargfid), bname, + uuid_utoa_r (replies[i].poststat.ia_gfid, g), + priv->children[i]->name); + ret |= syncop_unlink (priv->children[i], &loc); + break; + } + } + + loc_wipe (&loc); + + return ret; + +} + + +int +__afr_selfheal_name_do (call_frame_t *frame, xlator_t *this, inode_t *parent, + uuid_t pargfid, const char *bname, inode_t *inode, + unsigned char *sources, unsigned char *sinks, + unsigned char *healed_sinks, int source, + unsigned char *locked_on, struct afr_reply *replies) +{ + int i = 0; + afr_private_t *priv = NULL; + uuid_t gfid = {0, }; + int gfid_idx = -1; + gf_boolean_t source_is_empty = _gf_true; + gf_boolean_t need_heal = _gf_false; + int first_idx = -1; + char g1[64],g2[64]; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + + if (first_idx == -1) { + first_idx = i; + continue; + } + + if (replies[i].op_ret != replies[first_idx].op_ret) + need_heal = _gf_true; + + if (uuid_compare (replies[i].poststat.ia_gfid, + replies[first_idx].poststat.ia_gfid)) + need_heal = _gf_true; + } + + if (!need_heal) + return 0; + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + + if (!replies[i].op_ret && (source == -1 || sources[i])) { + source_is_empty = _gf_false; + break; + } + } + + if (source_is_empty) { + return __afr_selfheal_name_expunge (frame, this, parent, pargfid, + bname, inode, replies); + } + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + + if (uuid_is_null (replies[i].poststat.ia_gfid)) + continue; + + if (uuid_is_null (gfid)) { + uuid_copy (gfid, replies[i].poststat.ia_gfid); + gfid_idx = i; + continue; + } + + if (sources[i] || source == -1) { + if (gfid_idx != -1 && + (sources[gfid_idx] || source == -1) && + uuid_compare (gfid, replies[i].poststat.ia_gfid)) { + gf_log (this->name, GF_LOG_WARNING, + "GFID mismatch for <gfid:%s>/%s " + "%s on %s and %s on %s", + uuid_utoa (pargfid), bname, + uuid_utoa_r (replies[i].poststat.ia_gfid, g1), + priv->children[i]->name, + uuid_utoa_r (replies[gfid_idx].poststat.ia_gfid, g2), + priv->children[gfid_idx]->name); + return -1; + } + + uuid_copy (gfid, replies[i].poststat.ia_gfid); + gfid_idx = i; + continue; + } + } + + if (gfid_idx == -1) + return -1; + + __afr_selfheal_assign_gfid (frame, this, parent, pargfid, bname, inode, + replies, gfid_idx); + + return __afr_selfheal_name_impunge (frame, this, parent, pargfid, + bname, inode, replies, gfid_idx); +} + + +int +__afr_selfheal_name_finalize_source (xlator_t *this, unsigned char *sources, + unsigned char *sinks, unsigned char *locked_on, + struct afr_reply *replies) +{ + int i = 0; + afr_private_t *priv = NULL; + int source = -1; + int locked_count = 0; + int sources_count = 0; + int sinks_count = 0; + + priv = this->private; + + locked_count = AFR_COUNT (locked_on, priv->child_count); + sources_count = AFR_COUNT (sources, priv->child_count); + sinks_count = AFR_COUNT (sinks, priv->child_count); + + if (locked_count == sinks_count || !sources_count) { + return -1; + } + + for (i = 0; i < priv->child_count; i++) { + if (sources[i]) { + source = i; + break; + } + } + + return source; +} + + +int +__afr_selfheal_name_prepare (call_frame_t *frame, xlator_t *this, inode_t *parent, + uuid_t pargfid, unsigned char *locked_on, + unsigned char *sources, unsigned char *sinks, + unsigned char *healed_sinks, struct afr_reply *replies, + int *source_p) +{ + int ret = -1; + int source = -1; + afr_private_t *priv = NULL; + int i = 0; + + priv = this->private; + + ret = afr_selfheal_unlocked_discover (frame, parent, pargfid, replies); + if (ret) + return ret; + + ret = afr_selfheal_find_direction (frame, this, replies, + AFR_ENTRY_TRANSACTION, + locked_on, sources, sinks); + if (ret) + return ret; + + source = __afr_selfheal_name_finalize_source (this, sources, sinks, + locked_on, replies); + if (source < 0) { + /* If source is < 0 (typically split-brain), we perform a + conservative merge of entries rather than erroring out */ + } + *source_p = source; + + for (i = 0; i < priv->child_count; i++) + /* Initialize the healed_sinks[] array optimistically to + the intersection of to-be-healed (i.e sinks[]) and + the list of servers which are up (i.e locked_on[]). + + As we encounter failures in the healing process, we + will unmark the respective servers in the healed_sinks[] + array. + */ + healed_sinks[i] = sinks[i] && locked_on[i]; + + return ret; +} + + +int +afr_selfheal_name_do (call_frame_t *frame, xlator_t *this, inode_t *parent, + uuid_t pargfid, const char *bname) +{ + afr_private_t *priv = NULL; + unsigned char *sources = NULL; + unsigned char *sinks = NULL; + unsigned char *healed_sinks = NULL; + unsigned char *locked_on = NULL; + int source = -1; + struct afr_reply *replies = NULL; + int ret = -1; + inode_t *inode = NULL; + + priv = this->private; + + locked_on = alloca0 (priv->child_count); + sources = alloca0 (priv->child_count); + sinks = alloca0 (priv->child_count); + healed_sinks = alloca0 (priv->child_count); + + replies = alloca0 (priv->child_count * sizeof(*replies)); + + ret = afr_selfheal_entrylk (frame, this, parent, this->name, bname, + locked_on); + { + if (ret < 2) { + ret = -ENOTCONN; + goto unlock; + } + + ret = __afr_selfheal_name_prepare (frame, this, parent, pargfid, + locked_on, sources, sinks, + healed_sinks, replies, + &source); + if (ret) + goto unlock; + + inode = afr_selfheal_unlocked_lookup_on (frame, parent, bname, + replies, locked_on); + if (!inode) { + ret = -ENOMEM; + goto unlock; + } + + ret = __afr_selfheal_name_do (frame, this, parent, pargfid, bname, + inode, sources, sinks, healed_sinks, + source, locked_on, replies); + } +unlock: + afr_selfheal_unentrylk (frame, this, parent, this->name, bname, + locked_on); + if (inode) + inode_unref (inode); + + return ret; +} + + +int +afr_selfheal_name_unlocked_inspect (call_frame_t *frame, xlator_t *this, + inode_t *parent, uuid_t pargfid, + const char *bname, gf_boolean_t *need_heal) +{ + afr_private_t *priv = NULL; + int i = 0; + struct afr_reply *replies = NULL; + inode_t *inode = NULL; + int first_idx = -1; + + priv = this->private; + + replies = alloca0 (sizeof (*replies) * priv->child_count); + + inode = afr_selfheal_unlocked_lookup_on (frame, parent, bname, + replies, priv->child_up); + if (!inode) + return -ENOMEM; + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + + if (first_idx == -1) { + first_idx = i; + continue; + } + + if (replies[i].op_ret != replies[first_idx].op_ret) + *need_heal = _gf_true; + + if (uuid_compare (replies[i].poststat.ia_gfid, + replies[first_idx].poststat.ia_gfid)) + *need_heal = _gf_true; + } + + if (inode) + inode_unref (inode); + return 0; +} + +int +afr_selfheal_name (xlator_t *this, uuid_t pargfid, const char *bname) +{ + inode_t *parent = NULL; + call_frame_t *frame = NULL; + int ret = -1; + gf_boolean_t need_heal = _gf_false; + + parent = afr_inode_find (this, pargfid); + if (!parent) + goto out; + + frame = afr_frame_create (this); + if (!frame) + goto out; + + ret = afr_selfheal_name_unlocked_inspect (frame, this, parent, pargfid, + bname, &need_heal); + if (ret) + goto out; + + if (need_heal) + afr_selfheal_name_do (frame, this, parent, pargfid, bname); +out: + if (parent) + inode_unref (parent); + if (frame) + AFR_STACK_DESTROY (frame); + + return ret; +} diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h index 7c9bc81119c..a1b972ac35d 100644 --- a/xlators/cluster/afr/src/afr-self-heal.h +++ b/xlators/cluster/afr/src/afr-self-heal.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. This file is licensed to you under your choice of the GNU Lesser @@ -8,36 +8,160 @@ cases as published by the Free Software Foundation. */ -#ifndef __AFR_SELF_HEAL_H__ -#define __AFR_SELF_HEAL_H__ -#include <sys/stat.h> +#ifndef _AFR_SELFHEAL_H +#define _AFR_SELFHEAL_H -#define FILETYPE_DIFFERS(buf1,buf2) ((buf1)->ia_type != (buf2)->ia_type) -#define PERMISSION_DIFFERS(buf1,buf2) (st_mode_from_ia ((buf1)->ia_prot, (buf1)->ia_type) != st_mode_from_ia ((buf2)->ia_prot, (buf2)->ia_type)) -#define OWNERSHIP_DIFFERS(buf1,buf2) (((buf1)->ia_uid != (buf2)->ia_uid) || ((buf1)->ia_gid != (buf2)->ia_gid)) -#define SIZE_DIFFERS(buf1,buf2) ((buf1)->ia_size != (buf2)->ia_size) -#define SIZE_GREATER(buf1,buf2) ((buf1)->ia_size > (buf2)->ia_size) +/* Perform fop on all UP subvolumes and wait for all callbacks to return */ + +#define AFR_ONALL(frame, rfn, fop, args ...) do { \ + afr_local_t *__local = frame->local; \ + afr_private_t *__priv = frame->this->private; \ + int __i = 0, __count = 0; \ + \ + afr_replies_wipe (__local, __priv); \ + \ + for (__i = 0; __i < __priv->child_count; __i++) { \ + if (!__priv->child_up[__i]) continue; \ + STACK_WIND_COOKIE (frame, rfn, (void *)(long) __i, \ + __priv->children[__i], \ + __priv->children[__i]->fops->fop, args); \ + __count++; \ + } \ + syncbarrier_wait (&__local->barrier, __count); \ + } while (0) + + +/* Perform fop on all subvolumes represented by list[] array and wait + for all callbacks to return */ + +#define AFR_ONLIST(list, frame, rfn, fop, args ...) do { \ + afr_local_t *__local = frame->local; \ + afr_private_t *__priv = frame->this->private; \ + int __i = 0, __count = 0; \ + \ + afr_replies_wipe (__local, __priv); \ + \ + for (__i = 0; __i < __priv->child_count; __i++) { \ + if (!list[__i]) continue; \ + STACK_WIND_COOKIE (frame, rfn, (void *)(long) __i, \ + __priv->children[__i], \ + __priv->children[__i]->fops->fop, args); \ + __count++; \ + } \ + syncbarrier_wait (&__local->barrier, __count); \ + } while (0) + + +#define AFR_SEQ(frame, rfn, fop, args ...) do { \ + afr_local_t *__local = frame->local; \ + afr_private_t *__priv = frame->this->private; \ + int __i = 0; \ + \ + afr_replies_wipe (__local, __priv); \ + \ + for (__i = 0; __i < __priv->child_count; __i++) { \ + if (!__priv->child_up[__i]) continue; \ + STACK_WIND_COOKIE (frame, rfn, (void *)(long) __i, \ + __priv->children[__i], \ + __priv->children[__i]->fops->fop, args); \ + syncbarrier_wait (&__local->barrier, 1); \ + } \ + } while (0) + + +#define ALLOC_MATRIX(n, type) ({type **__ptr = NULL; \ + int __i; \ + __ptr = alloca0 (n * sizeof(type *)); \ + for (__i = 0; __i < n; __i++) __ptr[__i] = alloca0 (n * sizeof(type)); \ + __ptr;}) + + +#define IA_EQUAL(f,s,field) (memcmp (&(f.ia_##field), &(s.ia_##field), sizeof (s.ia_##field)) == 0) + + +int +afr_selfheal (xlator_t *this, uuid_t gfid); + +int +afr_selfheal_name (xlator_t *this, uuid_t gfid, const char *name); + +int +afr_selfheal_data (call_frame_t *frame, xlator_t *this, inode_t *inode); + +int +afr_selfheal_metadata (call_frame_t *frame, xlator_t *this, inode_t *inode); int -afr_self_heal_entry (call_frame_t *frame, xlator_t *this); +afr_selfheal_entry (call_frame_t *frame, xlator_t *this, inode_t *inode); + + +int +afr_selfheal_inodelk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, off_t off, size_t size, + unsigned char *locked_on); + +int +afr_selfheal_tryinodelk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, off_t off, size_t size, + unsigned char *locked_on); int -afr_self_heal_data (call_frame_t *frame, xlator_t *this); +afr_selfheal_uninodelk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, off_t off, size_t size, + const unsigned char *locked_on); int -afr_self_heal_metadata (call_frame_t *frame, xlator_t *this); +afr_selfheal_entrylk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, const char *name, unsigned char *locked_on); int -afr_self_heal_get_source (xlator_t *this, afr_local_t *local, dict_t **xattr); +afr_selfheal_tryentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, const char *name, unsigned char *locked_on); + +int +afr_selfheal_unentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, const char *name, unsigned char *locked_on); + +int +afr_selfheal_unlocked_discover (call_frame_t *frame, inode_t *inode, + uuid_t gfid, struct afr_reply *replies); + +inode_t * +afr_selfheal_unlocked_lookup_on (call_frame_t *frame, inode_t *parent, + const char *name, struct afr_reply *replies, + unsigned char *lookup_on); int -afr_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode); +afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this, + struct afr_reply *replies, + afr_transaction_type type, unsigned char *locked_on, + unsigned char *sources, unsigned char *sinks); int -afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local, - dict_t **xattr, - afr_transaction_type txn_type, - uuid_t gfid); -#endif /* __AFR_SELF_HEAL_H__ */ +afr_selfheal_extract_xattr (xlator_t *this, struct afr_reply *replies, + afr_transaction_type type, int *dirty, int **matrix); + +int +afr_selfheal_undo_pending (call_frame_t *frame, xlator_t *this, inode_t *inode, + unsigned char *sources, unsigned char *sinks, + unsigned char *healed_sinks, afr_transaction_type type, + struct afr_reply *replies, unsigned char *locked_on); + +int +afr_selfheal_recreate_entry (call_frame_t *frame, xlator_t *this, int dst, + int source, inode_t *dir, const char *name, + inode_t *inode, struct afr_reply *replies); + +int +afr_selfheal_post_op (call_frame_t *frame, xlator_t *this, inode_t *inode, + int subvol, dict_t *xattr); + +call_frame_t * +afr_frame_create (xlator_t *this); + +inode_t * +afr_inode_find (xlator_t *this, uuid_t gfid); + +#endif /* !_AFR_SELFHEAL_H */ diff --git a/xlators/cluster/afr/src/afr-self-heald.c b/xlators/cluster/afr/src/afr-self-heald.c index 9e5c1b3e79f..4bfe909bcb9 100644 --- a/xlators/cluster/afr/src/afr-self-heald.c +++ b/xlators/cluster/afr/src/afr-self-heald.c @@ -1,5 +1,5 @@ /* - Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. This file is licensed to you under your choice of the GNU Lesser @@ -8,1828 +8,1249 @@ cases as published by the Free Software Foundation. */ + #ifndef _CONFIG_H #define _CONFIG_H #include "config.h" #endif + #include "afr.h" -#include "syncop.h" +#include "afr-self-heal.h" #include "afr-self-heald.h" -#include "afr-self-heal-common.h" #include "protocol-common.h" -#include "event-history.h" - -typedef enum { - STOP_CRAWL_ON_SINGLE_SUBVOL = 1, - STOP_INDEX_CRAWL_ON_PENDING_FULL_CRAWL = 2 -} afr_crawl_flags_t; - -typedef enum { - HEAL = 1, - INFO, - STATISTICS_TO_BE_HEALED, -} shd_crawl_op; - -typedef struct shd_dump { - dict_t *dict; - xlator_t *this; - int child; -} shd_dump_t; - -typedef struct shd_event_ { - int child; - char *path; -} shd_event_t; - -typedef struct shd_pos_ { - int child; - xlator_t *this; - afr_child_pos_t pos; -} shd_pos_t; - -typedef int -(*afr_crawl_done_cbk_t) (int ret, call_frame_t *sync_frame, void *crawl_data); -void -afr_start_crawl (xlator_t *this, int idx, afr_crawl_type_t crawl, - process_entry_cbk_t process_entry, void *op_data, - gf_boolean_t exclusive, int crawl_flags, - afr_crawl_done_cbk_t crawl_done); +#define SHD_INODE_LRU_LIMIT 2048 +#define AFR_EH_HEALED_LIMIT 1024 +#define AFR_EH_HEAL_FAIL_LIMIT 1024 +#define AFR_EH_SPLIT_BRAIN_LIMIT 1024 +#define AFR_STATISTICS_HISTORY_SIZE 50 -static int -_crawl_directory (fd_t *fd, loc_t *loc, afr_crawl_data_t *crawl_data); -/* For calling straight through (e.g. already in a synctask). */ -int -afr_find_child_position (xlator_t *this, int child, afr_child_pos_t *pos); +#define ASSERT_LOCAL(this, healer) \ + if (!afr_shd_is_subvol_local(this, healer->subvol)) { \ + healer->local = _gf_false; \ + if (safe_break (healer)) { \ + break; \ + } else { \ + continue; \ + } \ + } else { \ + healer->local = _gf_true; \ + } -/* For deferring through a new synctask. */ -int -afr_syncop_find_child_position (void *data); -static int -_loc_assign_gfid_path (loc_t *loc) -{ - int ret = -1; - char gfid_path[64] = {0}; - - if (loc->inode && !uuid_is_null (loc->inode->gfid)) { - ret = inode_path (loc->inode, NULL, (char**)&loc->path); - } else if (!uuid_is_null (loc->gfid)) { - snprintf (gfid_path, sizeof (gfid_path), "<gfid:%s>", - uuid_utoa (loc->gfid)); - loc->path = gf_strdup (gfid_path); - if (loc->path) - ret = 0; - } - return ret; -} +#define NTH_INDEX_HEALER(this, n) &((((afr_private_t *)this->private))->shd.index_healers[n]) +#define NTH_FULL_HEALER(this, n) &((((afr_private_t *)this->private))->shd.full_healers[n]) -void -_destroy_crawl_event_data (void *data) -{ - shd_crawl_event_t *crawl_event = NULL; +int afr_shd_gfid_to_path (xlator_t *this, xlator_t *subvol, uuid_t gfid, char **path_p); - if (!data) - goto out; +char * +afr_subvol_name (xlator_t *this, int subvol) +{ + afr_private_t *priv = NULL; - crawl_event = (shd_crawl_event_t *)data; - GF_FREE (crawl_event->start_time_str); - GF_FREE (crawl_event->end_time_str); + priv = this->private; + if (subvol < 0 || subvol > priv->child_count) + return NULL; -out: - return; + return priv->children[subvol]->name; } + void -_destroy_shd_event_data (void *data) +afr_destroy_crawl_event_data (void *data) { - shd_event_t *event = NULL; - if (!data) - goto out; - event = (shd_event_t*)data; - GF_FREE (event->path); -out: return; } + + void -shd_cleanup_event (void *event) +afr_destroy_shd_event_data (void *data) { - shd_event_t *shd_event = event; + shd_event_t *shd_event = data; + + if (!shd_event) + return; + GF_FREE (shd_event->path); - if (!shd_event) - goto out; - GF_FREE (shd_event->path); - GF_FREE (shd_event); -out: return; } -int -afr_get_local_child (afr_self_heald_t *shd, unsigned int child_count) -{ - int i = 0; - int ret = -1; - for (i = 0; i < child_count; i++) { - if (shd->pos[i] == AFR_POS_LOCAL) { - ret = i; - break; - } - } - return ret; -} -static int -_build_index_loc (xlator_t *this, loc_t *loc, char *name, loc_t *parent) +gf_boolean_t +afr_shd_is_subvol_local (xlator_t *this, int subvol) { - int ret = 0; + char *pathinfo = NULL; + afr_private_t *priv = NULL; + dict_t *xattr = NULL; + int ret = 0; + gf_boolean_t is_local = _gf_false; + loc_t loc = {0, }; - uuid_copy (loc->pargfid, parent->inode->gfid); - loc->path = ""; - loc->name = name; - loc->parent = inode_ref (parent->inode); - if (!loc->parent) { - loc->path = NULL; - loc_wipe (loc); - ret = -1; - } - return ret; -} + priv = this->private; -int -_add_crawl_stats_to_dict (xlator_t *this, dict_t *output, int child, - shd_crawl_event_t *shd_event, struct timeval *tv) -{ - int ret = 0; - uint64_t count = 0; - char key[256] = {0}; - int xl_id = 0; - uint64_t healed_count = 0; - uint64_t split_brain_count = 0; - uint64_t heal_failed_count = 0; - char *start_time_str = NULL; - char *end_time_str = NULL; - char *crawl_type = NULL; - int progress = -1; + loc.inode = this->itable->root; + uuid_copy (loc.gfid, loc.inode->gfid); - healed_count = shd_event->healed_count; - split_brain_count = shd_event->split_brain_count; - heal_failed_count = shd_event->heal_failed_count; - start_time_str = shd_event->start_time_str; - end_time_str = shd_event->end_time_str; - crawl_type = shd_event->crawl_type; + ret = syncop_getxattr (priv->children[subvol], &loc, &xattr, + GF_XATTR_PATHINFO_KEY); + if (ret) + return _gf_false; + if (!xattr) + return _gf_false; - if (!start_time_str) { - ret = -1; - goto out; - } + ret = dict_get_str (xattr, GF_XATTR_PATHINFO_KEY, &pathinfo); + if (ret) + return _gf_false; + afr_local_pathinfo (pathinfo, &is_local); - ret = dict_get_int32 (output, this->name, &xl_id); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "xl does not have id"); - goto out; - } + gf_log (this->name, GF_LOG_DEBUG, "subvol %s is %slocal", + priv->children[subvol]->name, is_local? "" : "not "); - snprintf (key, sizeof (key), "statistics-%d-%d-count", xl_id, child); - ret = dict_get_uint64 (output, key, &count); + return is_local; +} - snprintf (key, sizeof (key), "statistics_healed_cnt-%d-%d-%"PRIu64, - xl_id, child, count); - ret = dict_set_uint64(output, key, healed_count); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" - "healed_count to outout"); - goto out; - } - snprintf (key, sizeof (key), "statistics_sb_cnt-%d-%d-%"PRIu64, - xl_id, child, count); - ret = dict_set_uint64 (output, key, split_brain_count); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" - "split_brain_count to outout"); - goto out; - } - snprintf (key, sizeof (key), "statistics_crawl_type-%d-%d-%"PRIu64, - xl_id, child, count); - ret = dict_set_dynstr (output, key, gf_strdup (crawl_type)); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" - "crawl_type to output"); - goto out; - } - snprintf (key, sizeof (key), "statistics_heal_failed_cnt-%d-%d-%"PRIu64, - xl_id, child, count); - ret = dict_set_uint64 (output, key, heal_failed_count); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" - "healed_failed_count to outout"); - goto out; - } - snprintf (key, sizeof (key), "statistics_strt_time-%d-%d-%"PRIu64, - xl_id, child, count); - ret = dict_set_dynstr (output, key, gf_strdup(start_time_str)); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" - "crawl_start_time to outout"); - goto out; - } +int +__afr_shd_healer_wait (struct subvol_healer *healer) +{ + afr_private_t *priv = NULL; + struct timespec wait_till = {0, }; + int ret = 0; - snprintf (key, sizeof (key), "statistics_end_time-%d-%d-%"PRIu64, - xl_id, child, count); + priv = healer->this->private; - if (!end_time_str) - end_time_str = "Could not determine the end time"; - ret = dict_set_dynstr (output, key, gf_strdup(end_time_str)); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" - "crawl_end_time to outout"); - goto out; - } - snprintf (key, sizeof (key), "statistics_inprogress-%d-%d-%"PRIu64, - xl_id, child, count); +disabled_loop: + wait_till.tv_sec = time (NULL) + 60; - if (shd_event->crawl_inprogress == _gf_true) - progress = 1; - else - progress = 0; + while (!healer->rerun) { + ret = pthread_cond_timedwait (&healer->cond, + &healer->mutex, + &wait_till); + if (ret == ETIMEDOUT) + break; + } - ret = dict_set_int32 (output, key, progress); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" - "inprogress to outout"); - goto out; - } + ret = healer->rerun; + healer->rerun = 0; - snprintf (key, sizeof (key), "statistics-%d-%d-count",xl_id, child); - ret = dict_set_uint64 (output, key, count + 1); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Could not increment the " - "counter."); - goto out; - } -out: - return ret; + if (!priv->shd.enabled) + goto disabled_loop; + + return ret; } + int -_add_path_to_dict (xlator_t *this, dict_t *output, int child, char *path, - struct timeval *tv, gf_boolean_t dyn) +afr_shd_healer_wait (struct subvol_healer *healer) { - //subkey not used for now - int ret = -1; - uint64_t count = 0; - char key[256] = {0}; - int xl_id = 0; + int ret = 0; - ret = dict_get_int32 (output, this->name, &xl_id); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "xl does not have id"); - goto out; - } + pthread_mutex_lock (&healer->mutex); + { + ret = __afr_shd_healer_wait (healer); + } + pthread_mutex_unlock (&healer->mutex); - snprintf (key, sizeof (key), "%d-%d-count", xl_id, child); - ret = dict_get_uint64 (output, key, &count); + return ret; +} - snprintf (key, sizeof (key), "%d-%d-%"PRIu64, xl_id, child, count); - if (dyn) - ret = dict_set_dynstr (output, key, path); - else - ret = dict_set_str (output, key, path); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "%s: Could not add to output", - path); - goto out; - } - if (!tv) - goto inc_count; - snprintf (key, sizeof (key), "%d-%d-%"PRIu64"-time", xl_id, - child, count); - ret = dict_set_uint32 (output, key, tv->tv_sec); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "%s: Could not set time", - path); - goto out; - } +gf_boolean_t +safe_break (struct subvol_healer *healer) +{ + gf_boolean_t ret = _gf_false; -inc_count: - snprintf (key, sizeof (key), "%d-%d-count", xl_id, child); - ret = dict_set_uint64 (output, key, count + 1); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Could not increment count"); - goto out; - } - ret = 0; -out: - return ret; -} + pthread_mutex_lock (&healer->mutex); + { + if (healer->rerun) + goto unlock; -int -_get_path_from_gfid_loc (xlator_t *this, xlator_t *readdir_xl, loc_t *child, - char **fpath, gf_boolean_t *missing) -{ - dict_t *xattr = NULL; - char *path = NULL; - int ret = -1; + healer->running = _gf_false; + ret = _gf_true; + } +unlock: + pthread_mutex_unlock (&healer->mutex); - ret = syncop_getxattr (readdir_xl, child, &xattr, GFID_TO_PATH_KEY); - if (ret < 0) { - if ((-ret == ENOENT || -ret == ESTALE) && missing) - *missing = _gf_true; - ret = -1; - goto out; - } - ret = dict_get_str (xattr, GFID_TO_PATH_KEY, &path); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Failed to get path for " - "gfid %s", uuid_utoa (child->gfid)); - goto out; - } - path = gf_strdup (path); - if (!path) { - ret = -1; - goto out; - } - ret = 0; -out: - if (!ret) - *fpath = path; - if (xattr) - dict_unref (xattr); - return ret; + return ret; } -int -_add_event_to_dict (circular_buffer_t *cb, void *data) -{ - int ret = 0; - shd_dump_t *dump_data = NULL; - shd_event_t *shd_event = NULL; - dump_data = data; - shd_event = cb->data; - if (shd_event->child != dump_data->child) - goto out; - ret = _add_path_to_dict (dump_data->this, dump_data->dict, - dump_data->child, shd_event->path, &cb->tv, - _gf_false); +inode_t * +afr_shd_inode_find (xlator_t *this, xlator_t *subvol, uuid_t gfid) +{ + inode_t *inode = NULL; + int ret = 0; + loc_t loc = {0, }; + struct iatt iatt = {0, }; + + inode = inode_find (this->itable, gfid); + if (inode) + goto out; + + loc.inode = inode_new (this->itable); + if (!loc.inode) + goto out; + uuid_copy (loc.gfid, gfid); + + ret = syncop_lookup (subvol, &loc, NULL, &iatt, NULL, NULL); + if (ret < 0) + goto out; + + inode = inode_link (loc.inode, NULL, NULL, &iatt); + if (inode) + inode_lookup (inode); out: - return ret; + loc_wipe (&loc); + return inode; } -int -_add_crawl_event_statistics_to_dict (circular_buffer_t *cb, void *data) -{ - int ret = 0; - shd_dump_t *dump_data = NULL; - shd_crawl_event_t *shd_event = NULL; - - dump_data = data; - shd_event = cb->data; - ret = _add_crawl_stats_to_dict (dump_data->this, dump_data->dict, - dump_data->child, shd_event, &cb->tv); - return ret; -} -int -_add_eh_to_dict (xlator_t *this, eh_t *eh, dict_t *dict, int child) +fd_t * +afr_shd_index_opendir (xlator_t *this, int child) { - shd_dump_t dump_data = {0}; - - dump_data.this = this; - dump_data.dict = dict; - dump_data.child = child; - eh_dump (eh, &dump_data, _add_event_to_dict); - return 0; + fd_t *fd = NULL; + afr_private_t *priv = NULL; + xlator_t *subvol = NULL; + loc_t rootloc = {0, }; + inode_t *inode = NULL; + int ret = 0; + dict_t *xattr = NULL; + void *index_gfid = NULL; + + priv = this->private; + subvol = priv->children[child]; + + rootloc.inode = inode_ref (this->itable->root); + uuid_copy (rootloc.gfid, rootloc.inode->gfid); + + ret = syncop_getxattr (subvol, &rootloc, &xattr, + GF_XATTROP_INDEX_GFID); + if (ret || !xattr) { + errno = -ret; + goto out; + } + + ret = dict_get_ptr (xattr, GF_XATTROP_INDEX_GFID, &index_gfid); + if (ret) + goto out; + + gf_log (this->name, GF_LOG_DEBUG, "index-dir gfid for %s: %s", + subvol->name, uuid_utoa (index_gfid)); + + inode = afr_shd_inode_find (this, subvol, index_gfid); + if (!inode) + goto out; + fd = fd_anonymous (inode); +out: + loc_wipe (&rootloc); + if (xattr) + dict_unref (xattr); + return fd; } int -_add_statistics_to_dict (xlator_t *this, dict_t *dict, int child) +afr_shd_index_purge (xlator_t *subvol, inode_t *inode, char *name) { - shd_dump_t dump_data = {0}; - afr_private_t *priv = NULL; - afr_self_heald_t *shd = NULL; + loc_t loc = {0, }; + int ret = 0; - priv = this->private; - shd = &priv->shd; + loc.parent = inode_ref (inode); + loc.name = name; - dump_data.this = this; - dump_data.dict = dict; - dump_data.child = child; - eh_dump (shd->statistics[child], &dump_data, - _add_crawl_event_statistics_to_dict); - return 0; + ret = syncop_unlink (subvol, &loc); + loc_wipe (&loc); + return ret; } -void -_remove_stale_index (xlator_t *this, xlator_t *readdir_xl, - loc_t *parent, char *fname) -{ - int ret = 0; - loc_t index_loc = {0}; - - ret = _build_index_loc (this, &index_loc, fname, parent); - if (ret) - goto out; - gf_log (this->name, GF_LOG_DEBUG, "Removing stale index " - "for %s on %s", index_loc.name, readdir_xl->name); - ret = syncop_unlink (readdir_xl, &index_loc); - if((ret < 0) && (-ret != ENOENT)) { - gf_log(this->name, GF_LOG_ERROR, "%s: Failed to remove index " - "on %s - %s",index_loc.name, readdir_xl->name, - strerror (-ret)); - } - index_loc.path = NULL; - loc_wipe (&index_loc); -out: - return; -} int -_count_hard_links_under_base_indices_dir (xlator_t *this, - afr_crawl_data_t *crawl_data, - gf_dirent_t *entry, loc_t *childloc, - loc_t *parentloc, struct iatt *iattr) +afr_shd_selfheal_name (struct subvol_healer *healer, int child, uuid_t parent, + const char *bname) { - xlator_t *readdir_xl = crawl_data->readdir_xl; - struct iatt parent = {0}; - int ret = 0; - dict_t *output = NULL; - int xl_id = 0; - char key[256] = {0}; - int child = -1; - uint64_t hardlinks = 0; - - output = crawl_data->op_data; - child = crawl_data->child; - - ret = syncop_lookup (readdir_xl, childloc, NULL, iattr, NULL, &parent); - if (ret) { - ret = -1; - goto out; - } - - ret = dict_get_int32 (output, this->name, &xl_id); - if (ret) - goto out; + int ret = -1; - snprintf (key, sizeof (key), "%d-%d-hardlinks", xl_id, child); - ret = dict_get_uint64 (output, key, &hardlinks); + ret = afr_selfheal_name (THIS, parent, bname); - /*Removing the count of base_entry under indices/base_indicies and - * entry under indices/xattrop */ - hardlinks = hardlinks + iattr->ia_nlink - 2; - ret = dict_set_uint64 (output, key, hardlinks); - if (ret) - goto out; - -out: - return ret; + return ret; } int -_add_summary_to_dict (xlator_t *this, afr_crawl_data_t *crawl_data, - gf_dirent_t *entry, - loc_t *childloc, loc_t *parentloc, struct iatt *iattr) +afr_shd_selfheal (struct subvol_healer *healer, int child, uuid_t gfid) { - dict_t *output = NULL; - xlator_t *readdir_xl = NULL; - int ret = -1; - char *path = NULL; - gf_boolean_t missing = _gf_false; - char gfid_str[64] = {0}; + int ret = 0; + eh_t *eh = NULL; + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + shd_event_t *shd_event = NULL; + char *path = NULL; + xlator_t *subvol = NULL; + xlator_t *this = NULL; + crawl_event_t *crawl_event = NULL; + + this = healer->this; + priv = this->private; + shd = &priv->shd; + crawl_event = &healer->crawl_event; + + subvol = priv->children[child]; + + ret = afr_selfheal (this, gfid); + + if (ret == -EIO) { + eh = shd->split_brain; + crawl_event->split_brain_count++; + } else if (ret < 0) { + eh = shd->heal_failed; + crawl_event->heal_failed_count++; + } else if (ret == 0) { + eh = shd->healed; + crawl_event->healed_count++; + } + + afr_shd_gfid_to_path (this, subvol, gfid, &path); + if (!path) + return ret; + + if (eh) { + shd_event = GF_CALLOC (1, sizeof(*shd_event), + gf_afr_mt_shd_event_t); + if (!shd_event) { + GF_FREE (path); + return ret; + } + + shd_event->child = child; + shd_event->path = path; + + if (eh_save_history (eh, shd_event) < 0) { + GF_FREE (shd_event); + GF_FREE (path); + } + } + return ret; +} - if (uuid_is_null (childloc->gfid)) - goto out; - output = crawl_data->op_data; - readdir_xl = crawl_data->readdir_xl; - - ret = _get_path_from_gfid_loc (this, readdir_xl, childloc, &path, - &missing); - if (ret == 0) { - ret = _add_path_to_dict (this, output, crawl_data->child, path, - NULL, _gf_true); - } else if (missing) { - _remove_stale_index (this, readdir_xl, parentloc, - uuid_utoa_r (childloc->gfid, gfid_str)); - } +void +afr_shd_sweep_prepare (struct subvol_healer *healer) +{ + crawl_event_t *event = NULL; -out: - if (ret && path) - GF_FREE (path); - return ret; + event = &healer->crawl_event; + + event->healed_count = 0; + event->split_brain_count = 0; + event->heal_failed_count = 0; + + time (&event->start_time); + event->end_time = 0; } + void -_crawl_post_sh_action (xlator_t *this, loc_t *parent, loc_t *child, - int32_t op_ret, int32_t op_errno, dict_t *xattr_rsp, - afr_crawl_data_t *crawl_data) +afr_shd_sweep_done (struct subvol_healer *healer) { - int ret = 0; - afr_private_t *priv = NULL; - afr_self_heald_t *shd = NULL; - eh_t *eh = NULL; - char *path = NULL; - char gfid_str[64] = {0}; - shd_event_t *event = NULL; - int32_t sh_failed = 0; - gf_boolean_t split_brain = 0; - int32_t actual_sh_done = 0; - shd_crawl_event_t **shd_crawl_event = NULL; - - priv = this->private; - shd = &priv->shd; - if (crawl_data->crawl == INDEX) { - if ((op_ret < 0) && (op_errno == ENOENT)) { - _remove_stale_index (this, crawl_data->readdir_xl, - parent, uuid_utoa_r (child->gfid, - gfid_str)); - goto out; - } - ret = _get_path_from_gfid_loc (this, crawl_data->readdir_xl, - child, &path, NULL); - if (ret) - goto out; - } else { - path = gf_strdup (child->path); - if (!path) { - ret = -1; - goto out; - } - } + crawl_event_t *event = NULL; + crawl_event_t *history = NULL; + afr_self_heald_t *shd = NULL; - if (xattr_rsp) { - ret = dict_get_int32 (xattr_rsp, "sh-failed", &sh_failed); - ret = dict_get_int32 (xattr_rsp, "actual-sh-done", &actual_sh_done); - } + event = &healer->crawl_event; + shd = &(((afr_private_t *)healer->this->private)->shd); - shd_crawl_event = (shd_crawl_event_t**)(shd->crawl_events); - - split_brain = afr_is_split_brain (this, child->inode); - if ((op_ret < 0 && op_errno == EIO) || split_brain) { - eh = shd->split_brain; - shd_crawl_event[crawl_data->child]->split_brain_count += 1; - } else if ((op_ret < 0) || sh_failed) { - eh = shd->heal_failed; - shd_crawl_event[crawl_data->child]->heal_failed_count += 1; - } else if (actual_sh_done == 1) { - eh = shd->healed; - shd_crawl_event[crawl_data->child]->healed_count += 1; - } - ret = -1; + time (&event->end_time); + history = memdup (event, sizeof (*event)); + event->start_time = 0; - if (eh != NULL) { - event = GF_CALLOC (1, sizeof (*event), gf_afr_mt_shd_event_t); - if (!event) - goto out; - event->child = crawl_data->child; - event->path = path; + if (!history) + return; - ret = eh_save_history (eh, event); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, "%s:Failed to save " - "to event history, (%d, %s)", path, op_ret, - strerror (op_errno)); + if (eh_save_history (shd->statistics[healer->subvol], history) < 0) + GF_FREE (history); +} - goto out; - } - } else { - gf_log (this->name, GF_LOG_DEBUG, "%s:Self heal already done ", - path); - } - ret = 0; -out: - if (ret && path) - GF_FREE (path); - return; +int +afr_shd_index_sweep (struct subvol_healer *healer) +{ + xlator_t *this = NULL; + int child = -1; + fd_t *fd = NULL; + xlator_t *subvol = NULL; + afr_private_t *priv = NULL; + off_t offset = 0; + gf_dirent_t entries; + gf_dirent_t *entry = NULL; + uuid_t gfid; + int ret = 0; + int count = 0; + + this = healer->this; + child = healer->subvol; + priv = this->private; + subvol = priv->children[child]; + + fd = afr_shd_index_opendir (this, child); + if (!fd) { + gf_log (this->name, GF_LOG_WARNING, + "unable to opendir index-dir on %s", subvol->name); + return -errno; + } + + INIT_LIST_HEAD (&entries.list); + + while ((ret = syncop_readdir (subvol, fd, 131072, offset, &entries))) { + if (ret > 0) + ret = 0; + list_for_each_entry (entry, &entries.list, list) { + offset = entry->d_off; + + if (!priv->shd.enabled) { + ret = -EBUSY; + break; + } + + if (!strcmp (entry->d_name, ".") || + !strcmp (entry->d_name, "..")) + continue; + + gf_log (this->name, GF_LOG_DEBUG, "got entry: %s", + entry->d_name); + + ret = uuid_parse (entry->d_name, gfid); + if (ret) + continue; + + ret = afr_shd_selfheal (healer, child, gfid); + if (ret == 0) + count++; + + if (ret == -ENOENT || ret == -ESTALE) { + afr_shd_index_purge (subvol, fd->inode, + entry->d_name); + ret = 0; + } + } + + gf_dirent_free (&entries); + if (ret) + break; + } + + if (fd) + fd_unref (fd); + if (!ret) + ret = count; + return ret; } + int -_link_inode_update_loc (xlator_t *this, loc_t *loc, struct iatt *iattr) +afr_shd_full_sweep (struct subvol_healer *healer, inode_t *inode) { - inode_t *link_inode = NULL; - int ret = -1; + fd_t *fd = NULL; + xlator_t *this = NULL; + xlator_t *subvol = NULL; + afr_private_t *priv = NULL; + off_t offset = 0; + gf_dirent_t entries; + gf_dirent_t *entry = NULL; + int ret = 0; + + this = healer->this; + priv = this->private; + subvol = priv->children[healer->subvol]; + + fd = fd_anonymous (inode); + if (!fd) + return -errno; + + INIT_LIST_HEAD (&entries.list); + + while ((ret = syncop_readdirp (subvol, fd, 131072, offset, 0, &entries))) { + if (ret < 0) + break; + + ret = gf_link_inodes_from_dirent (this, fd->inode, &entries); + if (ret) + break; + + list_for_each_entry (entry, &entries.list, list) { + offset = entry->d_off; + + if (!priv->shd.enabled) { + ret = -EBUSY; + break; + } + + if (!strcmp (entry->d_name, ".") || + !strcmp (entry->d_name, "..")) + continue; + + afr_shd_selfheal_name (healer, healer->subvol, + inode->gfid, entry->d_name); + + afr_shd_selfheal (healer, healer->subvol, + entry->d_stat.ia_gfid); + + if (entry->d_stat.ia_type == IA_IFDIR) { + ret = afr_shd_full_sweep (healer, entry->inode); + if (ret) + break; + } + } + + gf_dirent_free (&entries); + if (ret) + break; + } + + if (fd) + fd_unref (fd); + return ret; +} - link_inode = inode_link (loc->inode, NULL, NULL, iattr); - if (link_inode == NULL) { - gf_log (this->name, GF_LOG_ERROR, "inode link failed " - "on the inode (%s)", uuid_utoa (iattr->ia_gfid)); - goto out; - } - inode_unref (loc->inode); - loc->inode = link_inode; - ret = 0; -out: - return ret; + +void * +afr_shd_index_healer (void *data) +{ + struct subvol_healer *healer = NULL; + xlator_t *this = NULL; + int ret = 0; + + healer = data; + THIS = this = healer->this; + + for (;;) { + afr_shd_healer_wait (healer); + + ASSERT_LOCAL(this, healer); + + do { + gf_log (this->name, GF_LOG_DEBUG, + "starting index sweep on subvol %s", + afr_subvol_name (this, healer->subvol)); + + afr_shd_sweep_prepare (healer); + + ret = afr_shd_index_sweep (healer); + + afr_shd_sweep_done (healer); + /* + As long as at least one gfid was + healed, keep retrying. We may have + just healed a directory and thereby + created entries for other gfids which + could not be healed thus far. + */ + + gf_log (this->name, GF_LOG_DEBUG, + "finished index sweep on subvol %s", + afr_subvol_name (this, healer->subvol)); + /* + Give a pause before retrying to avoid a busy loop + in case the only entry in index is because of + an ongoing I/O. + */ + sleep (1); + } while (ret > 0); + } + + return NULL; } -int -_self_heal_entry (xlator_t *this, afr_crawl_data_t *crawl_data, gf_dirent_t *entry, - loc_t *child, loc_t *parent, struct iatt *iattr) + +void * +afr_shd_full_healer (void *data) { - struct iatt parentbuf = {0}; - int ret = 0; - dict_t *xattr_rsp = NULL; - dict_t *xattr_req = NULL; + struct subvol_healer *healer = NULL; + xlator_t *this = NULL; + int run = 0; - xattr_req = dict_new (); - if (!xattr_req) { - errno = ENOMEM; - ret = -1; - goto out; - } + healer = data; + THIS = this = healer->this; - ret = dict_set_int32 (xattr_req, "attempt-self-heal", 1); + for (;;) { + pthread_mutex_lock (&healer->mutex); + { + run = __afr_shd_healer_wait (healer); + if (!run) + healer->running = _gf_false; + } + pthread_mutex_unlock (&healer->mutex); - gf_log (this->name, GF_LOG_DEBUG, "lookup %s", child->path); + if (!run) + break; - ret = syncop_lookup (this, child, xattr_req, - iattr, &xattr_rsp, &parentbuf); - _crawl_post_sh_action (this, parent, child, ret, -ret, xattr_rsp, - crawl_data); - if (ret < 0) - ret = -1; - if (xattr_rsp) - dict_unref (xattr_rsp); - if (ret == 0) - ret = _link_inode_update_loc (this, child, iattr); + ASSERT_LOCAL(this, healer); -out: - if (xattr_req) - dict_unref(xattr_req); - return ret; -} + gf_log (this->name, GF_LOG_INFO, + "starting full sweep on subvol %s", + afr_subvol_name (this, healer->subvol)); -static int -afr_crawl_done (int ret, call_frame_t *sync_frame, void *data) -{ - GF_FREE (data); - STACK_DESTROY (sync_frame->root); - return 0; -} + afr_shd_sweep_prepare (healer); -int -_get_heal_op_flags (shd_crawl_op op, afr_crawl_type_t crawl) -{ - int crawl_flags = 0; + afr_shd_full_sweep (healer, this->itable->root); - if (HEAL == op) { - crawl_flags |= STOP_CRAWL_ON_SINGLE_SUBVOL; + afr_shd_sweep_done (healer); - if (crawl == INDEX) - crawl_flags |= STOP_INDEX_CRAWL_ON_PENDING_FULL_CRAWL; - } + gf_log (this->name, GF_LOG_INFO, + "finished full sweep on subvol %s", + afr_subvol_name (this, healer->subvol)); + } - return crawl_flags; + return NULL; } -void -_do_self_heal_on_subvol (xlator_t *this, int child, afr_crawl_type_t crawl) -{ - afr_start_crawl (this, child, crawl, _self_heal_entry, - NULL, _gf_true, _get_heal_op_flags (HEAL, crawl), - afr_crawl_done); -} -gf_boolean_t -_crawl_proceed (xlator_t *this, int child, int crawl_flags, char **reason) +int +afr_shd_healer_init (xlator_t *this, struct subvol_healer *healer) { - afr_private_t *priv = NULL; - afr_self_heald_t *shd = NULL; - gf_boolean_t proceed = _gf_false; - char *msg = NULL; - - priv = this->private; - shd = &priv->shd; - if (!shd->enabled) { - msg = "Self-heal daemon is not enabled"; - gf_log (this->name, GF_LOG_DEBUG, "%s", msg); - goto out; - } + int ret = 0; - if (!priv->child_up[child]) { - gf_log (this->name, GF_LOG_DEBUG, "Stopping crawl for %s , " - "subvol went down", priv->children[child]->name); - msg = "Brick is Not connected"; - goto out; - } - - if (crawl_flags & STOP_CRAWL_ON_SINGLE_SUBVOL) { - if (afr_up_children_count (priv->child_up, - priv->child_count) < 2) { - gf_log (this->name, GF_LOG_DEBUG, "Stopping crawl as " - "< 2 children are up"); - msg = "< 2 bricks in replica are running"; - goto out; - } - } + ret = pthread_mutex_init (&healer->mutex, NULL); + if (ret) + goto out; - if (crawl_flags & STOP_INDEX_CRAWL_ON_PENDING_FULL_CRAWL) { - if (shd->pending[child] == FULL) { - gf_log (this->name, GF_LOG_INFO, "Stopping index " - "self-heal as Full self-heal is pending on %s", - priv->children[child]->name); - msg = "Full crawl is pending"; - goto out; - } - } + ret = pthread_cond_init (&healer->cond, NULL); + if (ret) + goto out; - proceed = _gf_true; + healer->this = this; + healer->running = _gf_false; + healer->rerun = _gf_false; + healer->local = _gf_false; out: - if (reason) - *reason = msg; - return proceed; + return ret; } -int -_do_crawl_op_on_local_subvols (xlator_t *this, afr_crawl_type_t crawl, - shd_crawl_op op, dict_t *output) -{ - afr_private_t *priv = NULL; - char *status = NULL; - char *subkey = NULL; - char key[256] = {0}; - shd_pos_t pos_data = {0}; - int op_ret = -1; - int xl_id = -1; - int i = 0; - int ret = 0; - int crawl_flags = 0; - - priv = this->private; - crawl_flags = _get_heal_op_flags (op, crawl); - - if (output) { - ret = dict_get_int32 (output, this->name, &xl_id); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Invalid input, " - "translator-id is not available"); - goto out; - } - } - pos_data.this = this; - subkey = "status"; - for (i = 0; i < priv->child_count; i++) { - if (_crawl_proceed (this, i, crawl_flags, &status)) { - pos_data.child = i; - /* - * We're already in a synctask in this case, so we - * don't need to defer through a second (and in fact - * that can cause deadlock). Just call straight - * through instead. - */ - ret = afr_find_child_position(pos_data.this, - pos_data.child, - &pos_data.pos); - if (ret) { - status = "Not able to find brick location"; - } else if (pos_data.pos == AFR_POS_REMOTE) { - status = "brick is remote"; - } else { - op_ret = 0; - if (op == HEAL) { - status = "Started self-heal"; - _do_self_heal_on_subvol (this, i, - crawl); - } else if (output && (op == INFO)) { - status = ""; - afr_start_crawl (this, i, INDEX, - _add_summary_to_dict, - output, _gf_false, 0, - NULL); - } else if (output && - (op == STATISTICS_TO_BE_HEALED)) { - status = ""; - afr_start_crawl (this, i, - INDEX_TO_BE_HEALED, - _count_hard_links_under_base_indices_dir, - output, _gf_false, - 0, NULL); - } - } - if (output) { - snprintf (key, sizeof (key), "%d-%d-%s", xl_id, - i, subkey); - ret = dict_set_str (output, key, status); - } - if (!op_ret && (crawl == FULL)) - break; - } - if (output) { - snprintf (key, sizeof (key), "%d-%d-%s", xl_id, i, - subkey); - ret = dict_set_str (output, key, status); - } - } -out: - return op_ret; -} int -_do_self_heal_on_local_subvols (xlator_t *this, afr_crawl_type_t crawl, - dict_t *output) +afr_shd_healer_spawn (xlator_t *this, struct subvol_healer *healer, + void *(threadfn)(void *)) { - return _do_crawl_op_on_local_subvols (this, crawl, HEAL, output); + int ret = 0; + + pthread_mutex_lock (&healer->mutex); + { + if (healer->running) { + pthread_cond_signal (&healer->cond); + } else { + ret = gf_thread_create (&healer->thread, NULL, + threadfn, healer); + if (ret) + goto unlock; + healer->running = 1; + } + + healer->rerun = 1; + } +unlock: + pthread_mutex_unlock (&healer->mutex); + + return ret; } + int -_get_index_summary_on_local_subvols (xlator_t *this, dict_t *output) +afr_shd_full_healer_spawn (xlator_t *this, int subvol) { - return _do_crawl_op_on_local_subvols (this, INDEX, INFO, output); + return afr_shd_healer_spawn (this, NTH_FULL_HEALER (this, subvol), + afr_shd_full_healer); } -void -afr_fill_completed_crawl_statistics_to_dict (xlator_t *this, dict_t *dict) -{ - afr_private_t *priv = NULL; - afr_self_heald_t *shd = NULL; - int i = 0; - priv = this->private; - shd= &priv->shd; - for (i = 0; i < priv->child_count; i++) { - if (shd->pos[i] != AFR_POS_LOCAL) - continue; - _add_statistics_to_dict (this, dict, i); - } - - return ; -} -static void -reset_crawl_event (shd_crawl_event_t *crawl_event) +int +afr_shd_index_healer_spawn (xlator_t *this, int subvol) { - crawl_event->healed_count = 0; - crawl_event->split_brain_count = 0; - crawl_event->heal_failed_count = 0; - GF_FREE (crawl_event->start_time_str); - crawl_event->start_time_str = NULL; - crawl_event->end_time_str = NULL; - crawl_event->crawl_type = NULL; - crawl_event->crawl_inprogress = _gf_false; - return; + return afr_shd_healer_spawn (this, NTH_INDEX_HEALER (this, subvol), + afr_shd_index_healer); } -static void -afr_copy_crawl_event_struct (shd_crawl_event_t *src, shd_crawl_event_t *dst) -{ - dst->healed_count = src->healed_count; - dst->split_brain_count = src->split_brain_count; - dst->heal_failed_count = src->heal_failed_count; - dst->start_time_str = gf_strdup (src->start_time_str); - dst->end_time_str = "Crawl is already in progress"; - dst->crawl_type = src->crawl_type; - dst->crawl_inprogress = _gf_true; - return; -} -static int -afr_fill_crawl_statistics_of_running_crawl(xlator_t *this, dict_t *dict) +int +afr_shd_dict_add_crawl_event (xlator_t *this, dict_t *output, + crawl_event_t *crawl_event) { - shd_crawl_event_t *evnt = NULL; - int ret = 0; - afr_private_t *priv = NULL; - afr_self_heald_t *shd = NULL; - int i = 0; - priv = this->private; - shd = &priv->shd; - - evnt = GF_CALLOC (1, sizeof (shd_crawl_event_t), - gf_afr_mt_shd_crawl_event_t); - if (!evnt) { - ret = -1; - goto out; - } - LOCK (&priv->lock); - { - for (i = 0; i < priv->child_count; i++) { - if (shd->pos[i] != AFR_POS_LOCAL) - continue; - - reset_crawl_event (evnt); - - if (!shd->crawl_events[i]) { - continue; - } - - afr_copy_crawl_event_struct (shd->crawl_events[i], - evnt); - _add_crawl_stats_to_dict (this, dict, i, evnt, NULL); + int ret = 0; + uint64_t count = 0; + char key[256] = {0}; + int xl_id = 0; + uint64_t healed_count = 0; + uint64_t split_brain_count = 0; + uint64_t heal_failed_count = 0; + char *start_time_str = 0; + char *end_time_str = NULL; + char *crawl_type = NULL; + int progress = -1; + int child = -1; - } - } - UNLOCK (&priv->lock); - reset_crawl_event (evnt); - GF_FREE (evnt); + child = crawl_event->child; + healed_count = crawl_event->healed_count; + split_brain_count = crawl_event->split_brain_count; + heal_failed_count = crawl_event->heal_failed_count; + crawl_type = crawl_event->crawl_type; -out: - return ret; -} + if (!crawl_event->start_time) + goto out; -static int -_add_local_subvols_crawl_statistics_to_dict (xlator_t *this, dict_t *dict) -{ - int ret = 0; - afr_fill_completed_crawl_statistics_to_dict (this, dict); - ret = afr_fill_crawl_statistics_of_running_crawl (this, dict); - return ret; -} -int -_add_local_subvols_eh_to_dict (xlator_t *this, eh_t *eh, dict_t *dict) -{ - afr_private_t *priv = NULL; - afr_self_heald_t *shd = NULL; - int i = 0; + start_time_str = gf_strdup (ctime (&crawl_event->start_time)); - priv = this->private; - shd = &priv->shd; + if (crawl_event->end_time) + end_time_str = gf_strdup (ctime (&crawl_event->end_time)); - for (i = 0; i < priv->child_count; i++) { - if (shd->pos[i] != AFR_POS_LOCAL) - continue; - _add_eh_to_dict (this, eh, dict, i); + ret = dict_get_int32 (output, this->name, &xl_id); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "xl does not have id"); + goto out; } - return 0; -} -int -afr_xl_op (xlator_t *this, dict_t *input, dict_t *output) -{ - gf_xl_afr_op_t op = GF_AFR_OP_INVALID; - int ret = 0; - afr_private_t *priv = NULL; - afr_self_heald_t *shd = NULL; - int xl_id = 0; + snprintf (key, sizeof (key), "statistics-%d-%d-count", xl_id, child); + ret = dict_get_uint64 (output, key, &count); - priv = this->private; - shd = &priv->shd; - ret = dict_get_int32 (input, "xl-op", (int32_t*)&op); - if (ret) - goto out; - ret = dict_get_int32 (input, this->name, &xl_id); - if (ret) + snprintf (key, sizeof (key), "statistics_healed_cnt-%d-%d-%"PRIu64, + xl_id, child, count); + ret = dict_set_uint64(output, key, healed_count); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not add statistics_healed_count to outout"); goto out; - ret = dict_set_int32 (output, this->name, xl_id); - if (ret) + } + + snprintf (key, sizeof (key), "statistics_sb_cnt-%d-%d-%"PRIu64, + xl_id, child, count); + ret = dict_set_uint64 (output, key, split_brain_count); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not add statistics_split_brain_count to outout"); goto out; - switch (op) { - case GF_AFR_OP_HEAL_INDEX: - ret = _do_self_heal_on_local_subvols (this, INDEX, output); - break; - case GF_AFR_OP_HEAL_FULL: - ret = _do_self_heal_on_local_subvols (this, FULL, output); - break; - case GF_AFR_OP_INDEX_SUMMARY: - (void)_get_index_summary_on_local_subvols (this, output); - ret = 0; - break; - case GF_AFR_OP_HEALED_FILES: - ret = _add_local_subvols_eh_to_dict (this, shd->healed, output); - break; - case GF_AFR_OP_HEAL_FAILED_FILES: - ret = _add_local_subvols_eh_to_dict (this, shd->heal_failed, - output); - break; - case GF_AFR_OP_SPLIT_BRAIN_FILES: - ret = _add_local_subvols_eh_to_dict (this, shd->split_brain, - output); - break; - case GF_AFR_OP_STATISTICS: - ret = _add_local_subvols_crawl_statistics_to_dict (this, output); - break; - case GF_AFR_OP_STATISTICS_HEAL_COUNT: - case GF_AFR_OP_STATISTICS_HEAL_COUNT_PER_REPLICA: - ret = _do_crawl_op_on_local_subvols (this, INDEX_TO_BE_HEALED, - STATISTICS_TO_BE_HEALED, - output); - break; - default: - gf_log (this->name, GF_LOG_ERROR, "Unknown set op %d", op); - break; } -out: - dict_del (output, this->name); - return ret; -} -void -afr_poll_self_heal (void *data) -{ - afr_private_t *priv = NULL; - afr_self_heald_t *shd = NULL; - struct timespec timeout = {0}; - xlator_t *this = NULL; - long child = (long)data; - gf_timer_t *old_timer = NULL; - gf_timer_t *new_timer = NULL; - shd_pos_t pos_data = {0}; - int ret = 0; - - this = THIS; - priv = this->private; - shd = &priv->shd; - - if (shd->pos[child] == AFR_POS_UNKNOWN) { - pos_data.this = this; - pos_data.child = child; - ret = synctask_new (this->ctx->env, - afr_syncop_find_child_position, - NULL, NULL, &pos_data); - if (!ret) - shd->pos[child] = pos_data.pos; - } - if (shd->enabled && (shd->pos[child] == AFR_POS_LOCAL)) - _do_self_heal_on_subvol (this, child, INDEX); - timeout.tv_sec = shd->timeout; - timeout.tv_nsec = 0; - //notify and previous timer should be synchronized. - LOCK (&priv->lock); - { - old_timer = shd->timer[child]; - if (shd->pos[child] == AFR_POS_REMOTE) - goto unlock; - shd->timer[child] = gf_timer_call_after (this->ctx, timeout, - afr_poll_self_heal, - data); - new_timer = shd->timer[child]; - } -unlock: - UNLOCK (&priv->lock); - - if (old_timer) - gf_timer_call_cancel (this->ctx, old_timer); - if (!new_timer && (shd->pos[child] != AFR_POS_REMOTE)) { - gf_log (this->name, GF_LOG_WARNING, - "Could not create self-heal polling timer for %s", - priv->children[child]->name); + snprintf (key, sizeof (key), "statistics_crawl_type-%d-%d-%"PRIu64, + xl_id, child, count); + ret = dict_set_str (output, key, crawl_type); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not add statistics_crawl_type to output"); + goto out; } - return; -} - -static int -afr_handle_child_up (int ret, call_frame_t *sync_frame, void *data) -{ - afr_self_heald_t *shd = NULL; - shd_pos_t *pos_data = data; - afr_private_t *priv = NULL; - if (ret) + snprintf (key, sizeof (key), "statistics_heal_failed_cnt-%d-%d-%"PRIu64, + xl_id, child, count); + ret = dict_set_uint64 (output, key, heal_failed_count); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not add statistics_healed_failed_count to outout"); goto out; + } - priv = pos_data->this->private; - shd = &priv->shd; - shd->pos[pos_data->child] = pos_data->pos; - if (pos_data->pos != AFR_POS_REMOTE) - afr_poll_self_heal ((void*)(long)pos_data->child); - _do_self_heal_on_local_subvols (THIS, INDEX, NULL); -out: - GF_FREE (data); - return 0; -} - -void -afr_proactive_self_heal (void *data) -{ - xlator_t *this = NULL; - long child = (long)data; - shd_pos_t *pos_data = NULL; - int ret = 0; + snprintf (key, sizeof (key), "statistics_strt_time-%d-%d-%"PRIu64, + xl_id, child, count); + ret = dict_set_dynstr (output, key, start_time_str); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not add statistics_crawl_start_time to outout"); + goto out; + } else { + start_time_str = NULL; + } - this = THIS; + if (!end_time_str) + progress = 1; + else + progress = 0; - //Position of brick could have changed and it could be local now. - //Compute the position again - pos_data = GF_CALLOC (1, sizeof (*pos_data), gf_afr_mt_pos_data_t); - if (!pos_data) - goto out; - pos_data->this = this; - pos_data->child = child; - ret = synctask_new (this->ctx->env, afr_syncop_find_child_position, - afr_handle_child_up, NULL, pos_data); - if (ret) + snprintf (key, sizeof (key), "statistics_end_time-%d-%d-%"PRIu64, + xl_id, child, count); + if (!end_time_str) + end_time_str = gf_strdup ("Could not determine the end time"); + ret = dict_set_dynstr (output, key, end_time_str); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not add statistics_crawl_end_time to outout"); goto out; -out: - return; -} + } else { + end_time_str = NULL; + } -static int -get_pathinfo_host (char *pathinfo, char *hostname, size_t size) -{ - char *start = NULL; - char *end = NULL; - int ret = -1; - int i = 0; + snprintf (key, sizeof (key), "statistics_inprogress-%d-%d-%"PRIu64, + xl_id, child, count); - if (!pathinfo) + ret = dict_set_int32 (output, key, progress); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not add statistics_inprogress to outout"); goto out; + } - start = strchr (pathinfo, ':'); - if (!start) + snprintf (key, sizeof (key), "statistics-%d-%d-count", xl_id, child); + ret = dict_set_uint64 (output, key, count + 1); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not increment the counter."); goto out; - end = strrchr (pathinfo, ':'); - if (start == end) - goto out; - - memset (hostname, 0, size); - i = 0; - while (++start != end) - hostname[i++] = *start; - ret = 0; + } out: + GF_FREE (start_time_str); + GF_FREE (end_time_str); return ret; } + int -afr_local_pathinfo (char *pathinfo, gf_boolean_t *local) +afr_shd_dict_add_path (xlator_t *this, dict_t *output, int child, char *path, + struct timeval *tv) { - int ret = 0; - char pathinfohost[1024] = {0}; - char localhost[1024] = {0}; - xlator_t *this = THIS; + int ret = -1; + uint64_t count = 0; + char key[256] = {0}; + int xl_id = 0; - *local = _gf_false; - ret = get_pathinfo_host (pathinfo, pathinfohost, sizeof (pathinfohost)); + ret = dict_get_int32 (output, this->name, &xl_id); if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Invalid pathinfo: %s", - pathinfo); + gf_log (this->name, GF_LOG_ERROR, "xl does not have id"); goto out; } - ret = gethostname (localhost, sizeof (localhost)); + snprintf (key, sizeof (key), "%d-%d-count", xl_id, child); + ret = dict_get_uint64 (output, key, &count); + + snprintf (key, sizeof (key), "%d-%d-%"PRIu64, xl_id, child, count); + ret = dict_set_dynstr (output, key, path); + if (ret) { - gf_log (this->name, GF_LOG_ERROR, "gethostname() failed, " - "reason: %s", strerror (errno)); + gf_log (this->name, GF_LOG_ERROR, "%s: Could not add to output", + path); goto out; } - if (!strcmp (localhost, pathinfohost)) - *local = _gf_true; -out: - return ret; -} + if (tv) { + snprintf (key, sizeof (key), "%d-%d-%"PRIu64"-time", xl_id, + child, count); + ret = dict_set_uint32 (output, key, tv->tv_sec); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "%s: Could not set time", + path); + goto out; + } + } -int -afr_crawl_build_start_loc (xlator_t *this, afr_crawl_data_t *crawl_data, - loc_t *dirloc) -{ - afr_private_t *priv = NULL; - dict_t *xattr = NULL; - void *index_gfid = NULL; - void *base_indices_holder_vgfid = NULL; - loc_t rootloc = {0}; - struct iatt iattr = {0}; - struct iatt parent = {0}; - int ret = 0; - xlator_t *readdir_xl = crawl_data->readdir_xl; - - priv = this->private; - if (crawl_data->crawl == FULL) { - afr_build_root_loc (this, dirloc); - } else if (crawl_data->crawl == INDEX) { - afr_build_root_loc (this, &rootloc); - ret = syncop_getxattr (readdir_xl, &rootloc, &xattr, - GF_XATTROP_INDEX_GFID); - if (ret < 0) { - ret = -1; - goto out; - } - ret = dict_get_ptr (xattr, GF_XATTROP_INDEX_GFID, &index_gfid); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, "failed to get index " - "dir gfid on %s", readdir_xl->name); - goto out; - } - if (!index_gfid) { - gf_log (this->name, GF_LOG_ERROR, "index gfid empty " - "on %s", readdir_xl->name); - ret = -1; - goto out; - } - uuid_copy (dirloc->gfid, index_gfid); - dirloc->path = ""; - dirloc->inode = inode_new (priv->root_inode->table); - ret = syncop_lookup (readdir_xl, dirloc, NULL, - &iattr, NULL, &parent); - if (ret < 0) { - if (-ret != ENOENT) { - gf_log (this->name, GF_LOG_ERROR, "lookup " - "failed on index dir on %s - (%s)", - readdir_xl->name, strerror (-ret)); - } - ret = -1; - goto out; - } - ret = _link_inode_update_loc (this, dirloc, &iattr); - if (ret) - goto out; - } else if (crawl_data->crawl == INDEX_TO_BE_HEALED) { - afr_build_root_loc (this, &rootloc); - ret = syncop_getxattr (readdir_xl, &rootloc, &xattr, - GF_BASE_INDICES_HOLDER_GFID); - if (ret < 0) { - ret = -1; - goto out; - } - ret = dict_get_ptr (xattr, GF_BASE_INDICES_HOLDER_GFID, - &base_indices_holder_vgfid); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, "index gfid empty " - "on %s", readdir_xl->name); - ret = -1; - goto out; - } - if (!base_indices_holder_vgfid) { - gf_log (this->name, GF_LOG_ERROR, "Base indices holder" - "virtual gfid is null on %s", readdir_xl->name); - ret = -1; - goto out; - } - uuid_copy (dirloc->gfid, base_indices_holder_vgfid); - dirloc->path = ""; - dirloc->inode = inode_new (priv->root_inode->table); - ret = syncop_lookup (readdir_xl, dirloc, NULL, &iattr, NULL, - &parent); - if (ret < 0) { - if (-ret != ENOENT) { - gf_log (this->name, GF_LOG_ERROR, "lookup " - "failed for base_indices_holder dir" - " on %s - (%s)", readdir_xl->name, - strerror (-ret)); - - } else { - gf_log (this->name, GF_LOG_ERROR, "base_indices" - "_holder is not yet created."); - } - ret = -1; - goto out; - } - ret = _link_inode_update_loc (this, dirloc, &iattr); - if (ret) - goto out; + snprintf (key, sizeof (key), "%d-%d-count", xl_id, child); + + ret = dict_set_uint64 (output, key, count + 1); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not increment count"); + goto out; } + ret = 0; out: - if (xattr) - dict_unref (xattr); - loc_wipe (&rootloc); return ret; } + int -afr_crawl_opendir (xlator_t *this, afr_crawl_data_t *crawl_data, fd_t **dirfd, - loc_t *dirloc) +afr_shd_gfid_to_path (xlator_t *this, xlator_t *subvol, uuid_t gfid, char **path_p) { - fd_t *fd = NULL; - int ret = 0; - - if (crawl_data->crawl == FULL) { - fd = fd_create (dirloc->inode, crawl_data->pid); - if (!fd) { - gf_log (this->name, GF_LOG_ERROR, - "Failed to create fd for %s", dirloc->path); - ret = -1; - goto out; - } - - ret = syncop_opendir (crawl_data->readdir_xl, dirloc, fd); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "opendir failed on %s", dirloc->path); - ret = -1; - goto out; - } - } else { - fd = fd_anonymous (dirloc->inode); - } - ret = 0; -out: - if (!ret) - *dirfd = fd; - return ret; + loc_t loc = {0,}; + char *path = NULL; + dict_t *xattr = NULL; + int ret = 0; + + uuid_copy (loc.gfid, gfid); + loc.inode = inode_new (this->itable); + + ret = syncop_getxattr (subvol, &loc, &xattr, GFID_TO_PATH_KEY); + loc_wipe (&loc); + if (ret) + return ret; + + ret = dict_get_str (xattr, GFID_TO_PATH_KEY, &path); + if (ret || !path) + return -EINVAL; + + *path_p = gf_strdup (path); + if (!*path_p) + return -ENOMEM; + return 0; } -xlator_t* -afr_crawl_readdir_xl_get (xlator_t *this, afr_crawl_data_t *crawl_data) -{ - afr_private_t *priv = this->private; - - if (crawl_data->crawl == FULL) { - return this; - } else { - return priv->children[crawl_data->child]; - } - return NULL; -} int -afr_crawl_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, - gf_dirent_t *entry, afr_crawl_data_t *crawl_data) +afr_shd_gather_index_entries (xlator_t *this, int child, dict_t *output) { - int ret = -1; - afr_private_t *priv = NULL; - - priv = this->private; - if (crawl_data->crawl == FULL) { - ret = afr_build_child_loc (this, child, parent, entry->d_name); - } else if (crawl_data->crawl == INDEX_TO_BE_HEALED) { - ret = _build_index_loc (this, child, entry->d_name, parent); - if (ret) - goto out; - child->inode = inode_new (priv->root_inode->table); - if (!child->inode) { - ret = -1; - goto out; - } - child->path = NULL; - } else { - child->inode = inode_new (priv->root_inode->table); - if (!child->inode) - goto out; - uuid_parse (entry->d_name, child->gfid); - ret = _loc_assign_gfid_path (child); - } -out: - return ret; + fd_t *fd = NULL; + xlator_t *subvol = NULL; + afr_private_t *priv = NULL; + off_t offset = 0; + gf_dirent_t entries; + gf_dirent_t *entry = NULL; + uuid_t gfid; + int ret = 0; + int count = 0; + char *path = NULL; + + priv = this->private; + subvol = priv->children[child]; + + fd = afr_shd_index_opendir (this, child); + if (!fd) { + gf_log (this->name, GF_LOG_WARNING, + "unable to opendir index-dir on %s", subvol->name); + return -errno; + } + + INIT_LIST_HEAD (&entries.list); + + while ((ret = syncop_readdir (subvol, fd, 131072, offset, &entries))) { + if (ret > 0) + ret = 0; + list_for_each_entry (entry, &entries.list, list) { + offset = entry->d_off; + + if (!strcmp (entry->d_name, ".") || + !strcmp (entry->d_name, "..")) + continue; + + gf_log (this->name, GF_LOG_DEBUG, "got entry: %s", + entry->d_name); + + ret = uuid_parse (entry->d_name, gfid); + if (ret) + continue; + + path = NULL; + ret = afr_shd_gfid_to_path (this, subvol, gfid, &path); + + if (ret == -ENOENT || ret == -ESTALE) { + afr_shd_index_purge (subvol, fd->inode, + entry->d_name); + ret = 0; + continue; + } + + ret = afr_shd_dict_add_path (this, output, child, path, + NULL); + } + + gf_dirent_free (&entries); + if (ret) + break; + } + + if (fd) + fd_unref (fd); + if (!ret) + ret = count; + return ret; } -static int -_process_entries (xlator_t *this, loc_t *parentloc, gf_dirent_t *entries, - off_t *offset, afr_crawl_data_t *crawl_data) -{ - gf_dirent_t *entry = NULL; - gf_dirent_t *tmp = NULL; - int ret = 0; - loc_t entry_loc = {0}; - fd_t *fd = NULL; - struct iatt iattr = {0}; - - list_for_each_entry_safe (entry, tmp, &entries->list, list) { - if (!_crawl_proceed (this, crawl_data->child, - crawl_data->crawl_flags, NULL)) { - ret = -1; - goto out; - } - *offset = entry->d_off; - if (IS_ENTRY_CWD (entry->d_name) || - IS_ENTRY_PARENT (entry->d_name)) - continue; - if ((crawl_data->crawl == FULL) && - uuid_is_null (entry->d_stat.ia_gfid)) { - gf_log (this->name, GF_LOG_WARNING, "%s/%s: No " - "gfid present skipping", - parentloc->path, entry->d_name); - continue; - } - - loc_wipe (&entry_loc); - ret = afr_crawl_build_child_loc (this, &entry_loc, parentloc, - entry, crawl_data); - if (ret) - goto out; - ret = crawl_data->process_entry (this, crawl_data, entry, - &entry_loc, parentloc, &iattr); - - if (crawl_data->crawl == INDEX_TO_BE_HEALED && ret) { - goto out; - } else if (ret) { - continue; - } - - if ((crawl_data->crawl == INDEX) || - (crawl_data->crawl == INDEX_TO_BE_HEALED)) - continue; - - if (!IA_ISDIR (iattr.ia_type)) - continue; - fd = NULL; - ret = afr_crawl_opendir (this, crawl_data, &fd, &entry_loc); - if (ret) - continue; - ret = _crawl_directory (fd, &entry_loc, crawl_data); - if (fd) - fd_unref (fd); - } - ret = 0; -out: - if ((crawl_data->crawl == INDEX_TO_BE_HEALED) && ret) { - gf_log (this->name, GF_LOG_ERROR,"Failed to get the hardlink " - "count"); - } - loc_wipe (&entry_loc); - return ret; +int +afr_add_shd_event (circular_buffer_t *cb, void *data) +{ + dict_t *output = NULL; + xlator_t *this = THIS; + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + shd_event_t *shd_event = NULL; + char *path = NULL; + + output = data; + priv = this->private; + shd = &priv->shd; + shd_event = cb->data; + + if (!shd->index_healers[shd_event->child].local) + return 0; + + path = gf_strdup (shd_event->path); + if (!path) + return -ENOMEM; + + afr_shd_dict_add_path (this, output, shd_event->child, path, + &cb->tv); + return 0; } -static int -_crawl_directory (fd_t *fd, loc_t *loc, afr_crawl_data_t *crawl_data) +int +afr_add_crawl_event (circular_buffer_t *cb, void *data) { - xlator_t *this = NULL; - off_t offset = 0; - gf_dirent_t entries; - int ret = 0; - gf_boolean_t free_entries = _gf_false; - xlator_t *readdir_xl = crawl_data->readdir_xl; + dict_t *output = NULL; + xlator_t *this = THIS; + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + crawl_event_t *crawl_event = NULL; - INIT_LIST_HEAD (&entries.list); - this = THIS; + output = data; + priv = this->private; + shd = &priv->shd; + crawl_event = cb->data; - GF_ASSERT (loc->inode); + if (!shd->index_healers[crawl_event->child].local) + return 0; - if (crawl_data->crawl == FULL) - gf_log (this->name, GF_LOG_DEBUG, "crawling %s", loc->path); - else - gf_log (this->name, GF_LOG_DEBUG, "crawling INDEX %s", - uuid_utoa (loc->gfid)); - - while (1) { - if (crawl_data->crawl == FULL) - ret = syncop_readdirp (readdir_xl, fd, 131072, offset, - NULL, &entries); - else - ret = syncop_readdir (readdir_xl, fd, 131072, offset, - &entries); - if (ret < 0) { - ret = -1; - break; - } else if (ret == 0) { - break; - } - - ret = 0; - free_entries = _gf_true; - - if (!_crawl_proceed (this, crawl_data->child, - crawl_data->crawl_flags, NULL)) { - ret = -1; - goto out; - } - if (list_empty (&entries.list)) - goto out; + afr_shd_dict_add_crawl_event (this, output, crawl_event); - ret = _process_entries (this, loc, &entries, &offset, - crawl_data); - if ((ret < 0) && (crawl_data->crawl == INDEX_TO_BE_HEALED)) { - goto out; - } - gf_dirent_free (&entries); - free_entries = _gf_false; - } - ret = 0; -out: - if (free_entries) - gf_dirent_free (&entries); - return ret; + return 0; } -static char* -position_str_get (afr_child_pos_t pos) -{ - switch (pos) { - case AFR_POS_UNKNOWN: - return "unknown"; - case AFR_POS_LOCAL: - return "local"; - case AFR_POS_REMOTE: - return "remote"; - } - return NULL; -} int -afr_find_child_position (xlator_t *this, int child, afr_child_pos_t *pos) +afr_selfheal_daemon_init (xlator_t *this) { - afr_private_t *priv = NULL; - afr_self_heald_t *shd = NULL; - dict_t *xattr_rsp = NULL; - loc_t loc = {0}; - int ret = 0; - char *node_uuid = NULL; - - priv = this->private; - shd = &priv->shd; - - afr_build_root_loc (this, &loc); - - ret = syncop_getxattr (priv->children[child], &loc, &xattr_rsp, - GF_XATTR_NODE_UUID_KEY); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, "getxattr failed on %s - " - "(%s)", priv->children[child]->name, strerror (-ret)); - ret = -1; + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + int ret = -1; + int i = 0; + + priv = this->private; + shd = &priv->shd; + + this->itable = inode_table_new (SHD_INODE_LRU_LIMIT, this); + if (!this->itable) + goto out; + + shd->index_healers = GF_CALLOC (sizeof(*shd->index_healers), + priv->child_count, + gf_afr_mt_subvol_healer_t); + if (!shd->index_healers) + goto out; + + for (i = 0; i < priv->child_count; i++) { + shd->index_healers[i].subvol = i; + ret = afr_shd_healer_init (this, &shd->index_healers[i]); + if (ret) + goto out; + } + + shd->full_healers = GF_CALLOC (sizeof(*shd->full_healers), + priv->child_count, + gf_afr_mt_subvol_healer_t); + if (!shd->full_healers) + goto out; + for (i = 0; i < priv->child_count; i++) { + shd->full_healers[i].subvol = i; + ret = afr_shd_healer_init (this, &shd->full_healers[i]); + if (ret) + goto out; + } + + shd->healed = eh_new (AFR_EH_HEALED_LIMIT, _gf_false, + afr_destroy_shd_event_data); + if (!shd->healed) + goto out; + + shd->heal_failed = eh_new (AFR_EH_HEAL_FAIL_LIMIT, _gf_false, + afr_destroy_shd_event_data); + if (!shd->heal_failed) + goto out; + + shd->split_brain = eh_new (AFR_EH_SPLIT_BRAIN_LIMIT, _gf_false, + afr_destroy_shd_event_data); + if (!shd->split_brain) + goto out; + + shd->statistics = GF_CALLOC (sizeof(eh_t *), priv->child_count, + gf_common_mt_eh_t); + if (!shd->statistics) goto out; - } - ret = dict_get_str (xattr_rsp, GF_XATTR_NODE_UUID_KEY, &node_uuid); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "node-uuid key not found on " - "child %s", priv->children[child]->name); - goto out; + for (i = 0; i < priv->child_count ; i++) { + shd->statistics[i] = eh_new (AFR_STATISTICS_HISTORY_SIZE, + _gf_false, + afr_destroy_crawl_event_data); + if (!shd->statistics[i]) + goto out; + shd->full_healers[i].crawl_event.child = i; + shd->full_healers[i].crawl_event.crawl_type = "FULL"; + shd->index_healers[i].crawl_event.child = i; + shd->index_healers[i].crawl_event.crawl_type = "INDEX"; } - if (!strcmp (node_uuid, shd->node_uuid)) - *pos = AFR_POS_LOCAL; - else - *pos = AFR_POS_REMOTE; - - gf_log (this->name, GF_LOG_DEBUG, "child %s is %s", - priv->children[child]->name, position_str_get (*pos)); + ret = 0; out: - if (ret) - *pos = AFR_POS_UNKNOWN; - loc_wipe (&loc); - return ret; + return ret; } + int -afr_syncop_find_child_position (void *data) +afr_selfheal_childup (xlator_t *this, int subvol) { - shd_pos_t *pos_data = data; - int ret = 0; + afr_shd_index_healer_spawn (this, subvol); - ret = afr_find_child_position (pos_data->this, pos_data->child, - &pos_data->pos); - return ret; + return 0; } -static int -afr_dir_crawl (void *data) -{ - xlator_t *this = NULL; - int ret = -1; - xlator_t *readdir_xl = NULL; - fd_t *fd = NULL; - loc_t dirloc = {0}; - afr_crawl_data_t *crawl_data = data; - - this = THIS; - - if (!_crawl_proceed (this, crawl_data->child, crawl_data->crawl_flags, - NULL)) - goto out; - - readdir_xl = afr_crawl_readdir_xl_get (this, crawl_data); - if (!readdir_xl) - goto out; - crawl_data->readdir_xl = readdir_xl; - ret = afr_crawl_build_start_loc (this, crawl_data, &dirloc); - if (ret) - goto out; - - ret = afr_crawl_opendir (this, crawl_data, &fd, &dirloc); - if (ret) { - if (crawl_data->crawl == INDEX_TO_BE_HEALED) { - gf_log (this->name, GF_LOG_ERROR, "Failed to open base_" - "indices_holder"); - } - goto out; - } - - ret = _crawl_directory (fd, &dirloc, crawl_data); - if (ret) - gf_log (this->name, GF_LOG_ERROR, "Crawl failed on %s", - readdir_xl->name); - else - gf_log (this->name, GF_LOG_DEBUG, "Crawl completed " - "on %s", readdir_xl->name); - if (crawl_data->crawl == INDEX) - dirloc.path = NULL; -out: - if (fd) - fd_unref (fd); - if ((crawl_data->crawl == INDEX) || - (crawl_data->crawl == INDEX_TO_BE_HEALED )) - dirloc.path = NULL; - loc_wipe (&dirloc); - return ret; -} - -char * -get_crawl_type_in_string (afr_crawl_type_t crawl) +int64_t +afr_shd_get_index_count (xlator_t *this, int i) { - char *index = "INDEX"; - char *full = "FULL"; - char *crawl_type = NULL; - - if (crawl == INDEX){ - crawl_type = index; - } else if (crawl == FULL) { - crawl_type = full; - } - - return crawl_type; -} - -static int -afr_allocate_crawl_event (xlator_t *this, int child, afr_crawl_type_t crawl) -{ - afr_private_t *priv = NULL; - afr_self_heald_t *shd = NULL; - int ret = 0; - shd_crawl_event_t *crawl_event = NULL; - time_t get_time = 0; - - priv = this->private; - shd = &priv->shd; - - crawl_event = GF_CALLOC (sizeof (shd_crawl_event_t), 1, - gf_afr_mt_shd_crawl_event_t); - if (!crawl_event) { - ret = -1; - goto out; - } - - get_time = time(NULL); - if (get_time == ((time_t)-1)) { - ret = -1; - goto out; - } - - crawl_event->start_time_str = gf_strdup (ctime(&get_time)); - - crawl_event->crawl_type = get_crawl_type_in_string (crawl); - if (!crawl_event->crawl_type) { - ret = -1; - goto out; - } - LOCK (&priv->lock); - { - shd->crawl_events[child] = crawl_event; - } - UNLOCK (&priv->lock); - ret = 0; -out: - return ret; - + afr_private_t *priv = NULL; + xlator_t *subvol = NULL; + uint64_t count = 0; + loc_t rootloc = {0, }; + dict_t *xattr = NULL; + int ret = -1; + + priv = this->private; + subvol = priv->children[i]; + + rootloc.inode = inode_ref (this->itable->root); + uuid_copy (rootloc.gfid, rootloc.inode->gfid); + + ret = syncop_getxattr (subvol, &rootloc, &xattr, + GF_XATTROP_INDEX_COUNT); + loc_wipe (&rootloc); + + if (ret < 0) + return -1; + + ret = dict_get_uint64 (xattr, GF_XATTROP_INDEX_COUNT, &count); + if (ret) + return -1; + return count; } -static int -afr_put_crawl_event_in_eh (xlator_t *this, int child) -{ - afr_private_t *priv = NULL; - afr_self_heald_t *shd = NULL; - int ret = 0; - time_t get_time = 0; - shd_crawl_event_t **crawl_event = NULL; - - priv = this->private; - shd = &priv->shd; - - get_time = time(NULL); - if (get_time == ((time_t)-1)) { - ret = -1; - goto out; - } - crawl_event = (shd_crawl_event_t**)shd->crawl_events; - LOCK (&priv->lock); - { - crawl_event[child]->end_time_str = gf_strdup (ctime(&get_time)); - ret = eh_save_history (shd->statistics[child], - crawl_event[child]); - crawl_event[child] = NULL; - } - UNLOCK (&priv->lock); -out: - return ret; -} -static int -afr_dir_exclusive_crawl (void *data) +int +afr_xl_op (xlator_t *this, dict_t *input, dict_t *output) { - afr_private_t *priv = NULL; - afr_self_heald_t *shd = NULL; - gf_boolean_t crawl = _gf_false; + gf_xl_afr_op_t op = GF_AFR_OP_INVALID; int ret = 0; - int child = -1; - xlator_t *this = NULL; - afr_crawl_data_t *crawl_data = data; - - this = THIS; - priv = this->private; - shd = &priv->shd; - child = crawl_data->child; - - LOCK (&priv->lock); - { - if (shd->inprogress[child]) { - if (shd->pending[child] != FULL) - shd->pending[child] = crawl_data->crawl; - } else { - shd->inprogress[child] = _gf_true; - crawl = _gf_true; - } - } - UNLOCK (&priv->lock); - - if (!crawl) { - gf_log (this->name, GF_LOG_INFO, "Another crawl is in progress " - "for %s while attempting %s heal on %s", - priv->children[child]->name, - get_crawl_type_in_string (crawl_data->crawl), - priv->children[child]->name); - goto out; - } - - do { - ret = afr_allocate_crawl_event (this, child, crawl_data->crawl); - if (ret) - goto out; - afr_dir_crawl (data); - - ret = afr_put_crawl_event_in_eh (this, child); - if (ret < 0) - goto out; - - LOCK (&priv->lock); - { - if (shd->pending[child] != NONE) { - crawl_data->crawl = shd->pending[child]; - shd->pending[child] = NONE; - } else { - shd->inprogress[child] = _gf_false; - crawl = _gf_false; - } - } - UNLOCK (&priv->lock); - } while (crawl); -out: - return ret; -} + int xl_id = 0; + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + struct subvol_healer *healer = NULL; + int i = 0; + char key[64]; + int op_ret = 0; + int64_t cnt = 0; -void -afr_start_crawl (xlator_t *this, int idx, afr_crawl_type_t crawl, - process_entry_cbk_t process_entry, void *op_data, - gf_boolean_t exclusive, int crawl_flags, - afr_crawl_done_cbk_t crawl_done) -{ - afr_private_t *priv = NULL; - call_frame_t *frame = NULL; - afr_crawl_data_t *crawl_data = NULL; - int ret = 0; - int (*crawler) (void*) = NULL; + priv = this->private; + shd = &priv->shd; - priv = this->private; + for (i = 0; i < priv->child_count; i++) + if (priv->child_up[i] == -1) + goto out; - frame = create_frame (this, this->ctx->pool); - if (!frame) + ret = dict_get_int32 (input, "xl-op", (int32_t*)&op); + if (ret) goto out; - - afr_set_lk_owner (frame, this, frame->root); - afr_set_low_priority (frame); - crawl_data = GF_CALLOC (1, sizeof (*crawl_data), - gf_afr_mt_crawl_data_t); - if (!crawl_data) + ret = dict_get_int32 (input, this->name, &xl_id); + if (ret) goto out; - crawl_data->process_entry = process_entry; - crawl_data->child = idx; - crawl_data->pid = frame->root->pid; - crawl_data->crawl = crawl; - crawl_data->op_data = op_data; - crawl_data->crawl_flags = crawl_flags; - gf_log (this->name, GF_LOG_DEBUG, "starting crawl %d for %s", - crawl_data->crawl, priv->children[idx]->name); - - if (exclusive) - crawler = afr_dir_exclusive_crawl; - else - crawler = afr_dir_crawl; - ret = synctask_new (this->ctx->env, crawler, - crawl_done, frame, crawl_data); + ret = dict_set_int32 (output, this->name, xl_id); if (ret) - gf_log (this->name, GF_LOG_ERROR, "afr crawl failed for child" - " %d with ret %d", idx, ret); -out: - return; -} - -void -afr_build_root_loc (xlator_t *this, loc_t *loc) -{ - afr_private_t *priv = NULL; - - priv = this->private; - loc->path = gf_strdup ("/"); - loc->name = ""; - loc->inode = inode_ref (priv->root_inode); - uuid_copy (loc->gfid, loc->inode->gfid); -} - -int -afr_set_root_gfid (dict_t *dict) -{ - uuid_t gfid; - int ret = 0; - - memset (gfid, 0, 16); - gfid[15] = 1; - - ret = afr_set_dict_gfid (dict, gfid); + goto out; + switch (op) { + case GF_AFR_OP_HEAL_INDEX: + op_ret = -1; + + for (i = 0; i < priv->child_count; i++) { + healer = &shd->index_healers[i]; + snprintf (key, 64, "%d-%d-status", xl_id, i); + + if (!priv->child_up[i]) { + ret = dict_set_str (output, key, + "Brick is not connected"); + } else if (AFR_COUNT (priv->child_up, + priv->child_count) < 2) { + ret = dict_set_str (output, key, + "< 2 bricks in replica are up"); + } else if (!afr_shd_is_subvol_local (this, healer->subvol)) { + ret = dict_set_str (output, key, + "Brick is remote"); + } else { + ret = dict_set_str (output, key, + "Started self-heal"); + afr_shd_index_healer_spawn (this, i); + op_ret = 0; + } + } + break; + case GF_AFR_OP_HEAL_FULL: + op_ret = -1; + + for (i = 0; i < priv->child_count; i++) { + healer = &shd->full_healers[i]; + snprintf (key, 64, "%d-%d-status", xl_id, i); + + if (!priv->child_up[i]) { + ret = dict_set_str (output, key, + "Brick is not connected"); + } else if (AFR_COUNT (priv->child_up, + priv->child_count) < 2) { + ret = dict_set_str (output, key, + "< 2 bricks in replica are up"); + } else if (!afr_shd_is_subvol_local (this, healer->subvol)) { + ret = dict_set_str (output, key, + "Brick is remote"); + } else { + ret = dict_set_str (output, key, + "Started self-heal"); + afr_shd_full_healer_spawn (this, i); + op_ret = 0; + } + } + break; + case GF_AFR_OP_INDEX_SUMMARY: + for (i = 0; i < priv->child_count; i++) + if (shd->index_healers[i].local) + afr_shd_gather_index_entries (this, i, output); + break; + case GF_AFR_OP_HEALED_FILES: + eh_dump (shd->healed, output, afr_add_shd_event); + break; + case GF_AFR_OP_HEAL_FAILED_FILES: + eh_dump (shd->heal_failed, output, afr_add_shd_event); + break; + case GF_AFR_OP_SPLIT_BRAIN_FILES: + eh_dump (shd->split_brain, output, afr_add_shd_event); + break; + case GF_AFR_OP_STATISTICS: + for (i = 0; i < priv->child_count; i++) { + eh_dump (shd->statistics[i], output, + afr_add_crawl_event); + afr_shd_dict_add_crawl_event (this, output, + &shd->index_healers[i].crawl_event); + afr_shd_dict_add_crawl_event (this, output, + &shd->full_healers[i].crawl_event); + } + break; + case GF_AFR_OP_STATISTICS_HEAL_COUNT: + case GF_AFR_OP_STATISTICS_HEAL_COUNT_PER_REPLICA: + op_ret = -1; + + for (i = 0; i < priv->child_count; i++) { + if (!priv->child_up[i]) { + snprintf (key, 64, "%d-%d-status", xl_id, i); + ret = dict_set_str (output, key, + "Brick is not connected"); + } else { + snprintf (key, 64, "%d-%d-hardlinks", xl_id, i); + cnt = afr_shd_get_index_count (this, i); + if (cnt >= 0) { + ret = dict_set_uint64 (output, key, cnt); + } + op_ret = 0; + } + } + +// ret = _do_crawl_op_on_local_subvols (this, INDEX_TO_BE_HEALED, +// STATISTICS_TO_BE_HEALED, +// output); + break; - return ret; + default: + gf_log (this->name, GF_LOG_ERROR, "Unknown set op %d", op); + break; + } +out: + dict_del (output, this->name); + return op_ret; } diff --git a/xlators/cluster/afr/src/afr-self-heald.h b/xlators/cluster/afr/src/afr-self-heald.h index e0c083754e0..10e229ee7c2 100644 --- a/xlators/cluster/afr/src/afr-self-heald.h +++ b/xlators/cluster/afr/src/afr-self-heald.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. This file is licensed to you under your choice of the GNU Lesser @@ -8,58 +8,65 @@ cases as published by the Free Software Foundation. */ -#ifndef __AFR_SELF_HEALD_H__ -#define __AFR_SELF_HEALD_H__ -#include "xlator.h" - -#define IS_ROOT_PATH(path) (!strcmp (path, "/")) -#define IS_ENTRY_CWD(entry) (!strcmp (entry, ".")) -#define IS_ENTRY_PARENT(entry) (!strcmp (entry, "..")) -#define AFR_ALL_CHILDREN -1 - -typedef struct afr_crawl_data_ { - int child; - pid_t pid; - afr_crawl_type_t crawl; - xlator_t *readdir_xl; - void *op_data; - int crawl_flags; - int (*process_entry) (xlator_t *this, struct afr_crawl_data_ *crawl_data, - gf_dirent_t *entry, loc_t *child, loc_t *parent, - struct iatt *iattr); -} afr_crawl_data_t; - -typedef struct crawl_event_stats_ { - uint64_t healed_count; + +#ifndef _AFR_SELF_HEALD_H +#define _AFR_SELF_HEALD_H + +#include <pthread.h> + + +typedef struct { + int child; + char *path; +} shd_event_t; + +typedef struct { + int child; + uint64_t healed_count; uint64_t split_brain_count; uint64_t heal_failed_count; - char *start_time_str; - char *end_time_str; + + /* If start_time is 0, it means crawler is not in progress + and stats are not valid */ + time_t start_time; + /* If start_time is NOT 0 and end_time is 0, it means + cralwer is in progress */ + time_t end_time; char *crawl_type; - gf_boolean_t crawl_inprogress; -} shd_crawl_event_t; +} crawl_event_t; -void _destroy_crawl_event_data (void *data); -void _destroy_shd_event_data (void *data); +struct subvol_healer { + xlator_t *this; + int subvol; + gf_boolean_t local; + gf_boolean_t running; + gf_boolean_t rerun; + crawl_event_t crawl_event; + pthread_mutex_t mutex; + pthread_cond_t cond; + pthread_t thread; +}; -typedef int (*process_entry_cbk_t) (xlator_t *this, afr_crawl_data_t *crawl_data, - gf_dirent_t *entry, loc_t *child, loc_t *parent, - struct iatt *iattr); +typedef struct { + gf_boolean_t iamshd; + gf_boolean_t enabled; + struct subvol_healer *index_healers; + struct subvol_healer *full_healers; -void afr_build_root_loc (xlator_t *this, loc_t *loc); + eh_t *healed; + eh_t *heal_failed; + eh_t *split_brain; + eh_t **statistics; +} afr_self_heald_t; -int afr_set_root_gfid (dict_t *dict); -void -afr_proactive_self_heal (void *data); +int +afr_selfheal_childup (xlator_t *this, int subvol); int -afr_xl_op (xlator_t *this, dict_t *input, dict_t *output); +afr_selfheal_daemon_init (xlator_t *this); -/* - * In addition to its self-heal use, this is used to find a local default - * read_child. - */ int -afr_local_pathinfo (char *pathinfo, gf_boolean_t *local); -#endif /* __AFR_SELF_HEALD_H__ */ +afr_xl_op (xlator_t *this, dict_t *input, dict_t *output); + +#endif /* !_AFR_SELF_HEALD_H */ diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c index 20306e46924..f974fdb596b 100644 --- a/xlators/cluster/afr/src/afr-transaction.c +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -18,188 +18,130 @@ #include <signal.h> +gf_boolean_t +afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this); + +gf_boolean_t +afr_changelog_pre_op_update (call_frame_t *frame, xlator_t *this); + +int +afr_changelog_do (call_frame_t *frame, xlator_t *this, dict_t *xattr, + afr_changelog_resume_t changelog_resume); -#define LOCKED_NO 0x0 /* no lock held */ -#define LOCKED_YES 0x1 /* for DATA, METADATA, ENTRY and higher_path - of RENAME */ -#define LOCKED_LOWER 0x2 /* for lower_path of RENAME */ -afr_fd_ctx_t * -__afr_fd_ctx_get (fd_t *fd, xlator_t *this) +int +__afr_txn_write_fop (call_frame_t *frame, xlator_t *this) { - uint64_t ctx = 0; - int ret = 0; - afr_fd_ctx_t *fd_ctx = NULL; - int i = 0; + afr_local_t *local = NULL; afr_private_t *priv = NULL; + int call_count = -1; + int i = 0; + local = frame->local; priv = this->private; - ret = __fd_ctx_get (fd, this, &ctx); - - if (ret < 0 && fd_is_anonymous (fd)) { - ret = __afr_fd_ctx_set (this, fd); - if (ret < 0) - goto out; - - ret = __fd_ctx_get (fd, this, &ctx); - if (ret < 0) - goto out; + call_count = AFR_COUNT (local->transaction.pre_op, priv->child_count); - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - for (i = 0; i < priv->child_count; i++) - fd_ctx->opened_on[i] = AFR_FD_OPENED; + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; } - fd_ctx = (afr_fd_ctx_t *)(long) ctx; -out: - return fd_ctx; -} - + local->call_count = call_count; -afr_fd_ctx_t * -afr_fd_ctx_get (fd_t *fd, xlator_t *this) -{ - afr_fd_ctx_t *fd_ctx = NULL; + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i]) { + local->transaction.wind (frame, this, i); - LOCK(&fd->lock); - { - fd_ctx = __afr_fd_ctx_get (fd, this); + if (!--call_count) + break; + } } - UNLOCK(&fd->lock); - return fd_ctx; + return 0; } -static void -afr_save_lk_owner (call_frame_t *frame) +int +__afr_txn_write_done (call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; + afr_local_t *local = NULL; local = frame->local; - local->saved_lk_owner = frame->root->lk_owner; + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; } -static void -afr_restore_lk_owner (call_frame_t *frame) +call_frame_t* +afr_transaction_detach_fop_frame (call_frame_t *frame) { - afr_local_t * local = NULL; + afr_local_t * local = NULL; + call_frame_t *fop_frame = NULL; local = frame->local; - frame->root->lk_owner = local->saved_lk_owner; -} - -static void -__mark_all_pending (int32_t *pending[], int child_count, - afr_transaction_type type) -{ - int i = 0; - int j = 0; - - for (i = 0; i < child_count; i++) { - j = afr_index_for_transaction_type (type); - pending[i][j] = hton32 (1); + LOCK (&frame->lock); + { + fop_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; } + UNLOCK (&frame->lock); + + return fop_frame; } static void -__mark_child_dead (int32_t *pending[], int child_count, int child, - afr_transaction_type type) +afr_save_lk_owner (call_frame_t *frame) { - int j = 0; + afr_local_t * local = NULL; - j = afr_index_for_transaction_type (type); + local = frame->local; - pending[child][j] = 0; + local->saved_lk_owner = frame->root->lk_owner; } static void -__mark_pre_op_done_on_fd (call_frame_t *frame, xlator_t *this, int child_index) +afr_restore_lk_owner (call_frame_t *frame) { - afr_local_t *local = NULL; - afr_fd_ctx_t *fd_ctx = NULL; + afr_local_t * local = NULL; local = frame->local; - if (!local->fd) - return; - - fd_ctx = afr_fd_ctx_get (local->fd, this); - - if (!fd_ctx) - goto out; - - LOCK (&local->fd->lock); - { - if (local->transaction.type == AFR_DATA_TRANSACTION) - fd_ctx->pre_op_done[child_index]++; - } - UNLOCK (&local->fd->lock); -out: - return; -} - -static void -__mark_non_participant_children (int32_t *pending[], int child_count, - unsigned char *participants, - afr_transaction_type type) -{ - int i = 0; - int j = 0; - - j = afr_index_for_transaction_type (type); - for (i = 0; i < child_count; i++) { - if (!participants[i]) - pending[i][j] = 0; - } + frame->root->lk_owner = local->saved_lk_owner; } - void -__mark_all_success (int32_t *pending[], int child_count, - afr_transaction_type type) +__mark_all_success (call_frame_t *frame, xlator_t *this) { - int i; - int j; - - for (i = 0; i < child_count; i++) { - j = afr_index_for_transaction_type (type); - pending[i][j] = hton32 (-1); - } -} + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i; -void -_set_all_child_errno (int *child_errno, unsigned int child_count) -{ - int i = 0; + local = frame->local; + priv = this->private; - for (i = 0; i < child_count; i++) - if (child_errno[i] == 0) - child_errno[i] = ENOTCONN; + for (i = 0; i < priv->child_count; i++) { + local->transaction.failed_subvols[i] = 0; + } } -void + +int afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; - afr_private_t *priv = NULL; fd_t *fd = NULL; local = frame->local; - priv = this->private; fd = local->fd; - __mark_all_success (local->pending, priv->child_count, - local->transaction.type); - - _set_all_child_errno (local->child_errno, priv->child_count); - /* Perform fops with the lk-owner from top xlator. * Eg: lk-owner of posix-lk and flush should be same, * flush cant clear the posix-lks without that lk-owner. @@ -208,6 +150,10 @@ afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this) frame->root->lk_owner = local->transaction.main_frame->root->lk_owner; + if (local->pre_op_compat) + /* old mode, pre-op was done as afr_changelog_do() + just now, before OP */ + afr_changelog_pre_op_update (frame, this); /* The wake up needs to happen independent of what type of fop arrives here. If it was @@ -220,6 +166,8 @@ afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this) if (fd) afr_delayed_changelog_wake_up (this, fd); local->transaction.fop (frame, this); + + return 0; } @@ -285,39 +233,28 @@ __fop_changelog_needed (call_frame_t *frame, xlator_t *this) return op_ret; } + int -afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending, - int child, afr_xattrop_type_t op) +afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int **pending) { int i = 0; int ret = 0; + int pending_zero[AFR_NUM_CHANGE_LOGS] = {0, }; - if (op == LOCAL_FIRST) { - ret = dict_set_static_bin (xattr, priv->pending_key[child], - pending[child], - AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); - if (ret) - goto out; - } for (i = 0; i < priv->child_count; i++) { - if (i == child) - continue; + if (!memcmp (pending_zero, pending[i], sizeof (pending_zero))) + /* don't set xattrs for non-pending servers */ + continue; + ret = dict_set_static_bin (xattr, priv->pending_key[i], - pending[i], - AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); + pending[i], + AFR_NUM_CHANGE_LOGS * sizeof (int)); /* 3 = data+metadata+entry */ - if (ret < 0) - goto out; - } - if (op == LOCAL_LAST) { - ret = dict_set_static_bin (xattr, priv->pending_key[child], - pending[child], - AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); if (ret) - goto out; + break; } -out: + return ret; } @@ -346,102 +283,34 @@ afr_lock_server_count (afr_private_t *priv, afr_transaction_type type) /* {{{ pending */ -int32_t -afr_changelog_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xattr, - dict_t *xdata) + +int +afr_changelog_post_op_done (call_frame_t *frame, xlator_t *this) { + afr_local_t *local = NULL; + afr_private_t *priv = NULL; afr_internal_lock_t *int_lock = NULL; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int call_count = -1; - priv = this->private; - local = frame->local; + local = frame->local; + priv = this->private; int_lock = &local->internal_lock; - LOCK (&frame->lock); - { - call_count = --local->call_count; - } - UNLOCK (&frame->lock); - - if (call_count == 0) { - if (local->transaction.resume_stub) { - call_resume (local->transaction.resume_stub); - local->transaction.resume_stub = NULL; - } + if (local->transaction.resume_stub) { + call_resume (local->transaction.resume_stub); + local->transaction.resume_stub = NULL; + } - if (afr_lock_server_count (priv, local->transaction.type) == 0) { - local->transaction.done (frame, this); - } else { - int_lock->lock_cbk = local->transaction.done; - afr_unlock (frame, this); - } - } + if (afr_lock_server_count (priv, local->transaction.type) == 0) { + local->transaction.done (frame, this); + } else { + int_lock->lock_cbk = local->transaction.done; + afr_unlock (frame, this); + } - return 0; + return 0; } -void -afr_transaction_rm_stale_children (call_frame_t *frame, xlator_t *this, - inode_t *inode, afr_transaction_type type) -{ - int i = -1; - int count = 0; - int read_child = -1; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int **pending = NULL; - int idx = 0; - int32_t *stale_children = NULL; - int32_t *fresh_children = NULL; - gf_boolean_t rm_stale_children = _gf_false; - - idx = afr_index_for_transaction_type (type); - - priv = this->private; - local = frame->local; - pending = local->pending; - - if (local->op_ret < 0) - goto out; - fresh_children = local->fresh_children; - read_child = afr_inode_get_read_ctx (this, inode, fresh_children); - if (read_child < 0) { - gf_log (this->name, GF_LOG_DEBUG, "Possible split-brain " - "for %s", uuid_utoa (inode->gfid)); - goto out; - } - - for (i = 0; i < priv->child_count; i++) { - if (!afr_is_child_present (fresh_children, - priv->child_count, i)) - continue; - if (pending[i][idx]) - continue; - /* child is down or op failed on it */ - if (!stale_children) - stale_children = afr_children_create (priv->child_count); - if (!stale_children) - goto out; - - rm_stale_children = _gf_true; - stale_children[count++] = i; - gf_log (this->name, GF_LOG_DEBUG, "Removing stale child " - "%d for %s", i, uuid_utoa (inode->gfid)); - } - - if (!rm_stale_children) - goto out; - - afr_inode_rm_stale_children (this, inode, stale_children); -out: - GF_FREE (stale_children); - return; -} - afr_inodelk_t* afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom) { @@ -478,423 +347,468 @@ afr_locked_nodes_get (afr_transaction_type type, afr_internal_lock_t *int_lock) return locked_nodes; } + int -afr_changelog_pre_op_call_count (afr_transaction_type type, - afr_internal_lock_t *int_lock, - unsigned int child_count) +afr_changelog_call_count (afr_transaction_type type, + unsigned char *pre_op_subvols, + unsigned int child_count) { - int call_count = 0; - unsigned char *locked_nodes = NULL; + int call_count = 0; - locked_nodes = afr_locked_nodes_get (type, int_lock); - GF_ASSERT (locked_nodes); + call_count = AFR_COUNT(pre_op_subvols, child_count); - call_count = afr_locked_children_count (locked_nodes, child_count); if (type == AFR_ENTRY_RENAME_TRANSACTION) call_count *= 2; return call_count; } -int -afr_changelog_post_op_call_count (afr_transaction_type type, - unsigned char *pre_op, - unsigned int child_count) -{ - int call_count = 0; - call_count = afr_pre_op_done_children_count (pre_op, child_count); - if (type == AFR_ENTRY_RENAME_TRANSACTION) - call_count *= 2; +gf_boolean_t +afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; - return call_count; -} + local = frame->local; + priv = this->private; -void -afr_compute_txn_changelog (afr_local_t *local, afr_private_t *priv) -{ - int i = 0; - int index = 0; - int32_t postop = 0; - int32_t preop = 1; - int32_t **txn_changelog = NULL; - - txn_changelog = local->transaction.txn_changelog; - index = afr_index_for_transaction_type (local->transaction.type); for (i = 0; i < priv->child_count; i++) { - postop = ntoh32 (local->pending[i][index]); - txn_changelog[i][index] = hton32 (postop + preop); + if (local->transaction.failed_subvols[i]) + return _gf_false; } -} -afr_xattrop_type_t -afr_get_postop_xattrop_type (int32_t **pending, int optimized, int child, - afr_transaction_type type) -{ - int index = 0; - afr_xattrop_type_t op = LOCAL_LAST; - - index = afr_index_for_transaction_type (type); - if (optimized && !pending[child][index]) - op = LOCAL_FIRST; - return op; + return _gf_true; } + void -afr_set_postop_dict (afr_local_t *local, xlator_t *this, dict_t *xattr, - int optimized, int child) +afr_handle_symmetric_errors (call_frame_t *frame, xlator_t *this) { - int32_t **txn_changelog = NULL; - int32_t **changelog = NULL; - afr_private_t *priv = NULL; - int ret = 0; - afr_xattrop_type_t op = LOCAL_LAST; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int op_errno = 0; + int i_errno = 0; + gf_boolean_t matching_errors = _gf_true; + int i = 0; - priv = this->private; - txn_changelog = local->transaction.txn_changelog; - op = afr_get_postop_xattrop_type (local->pending, optimized, child, - local->transaction.type); - if (optimized) - changelog = txn_changelog; - else - changelog = local->pending; - ret = afr_set_pending_dict (priv, xattr, changelog, child, op); - if (ret < 0) - gf_log (this->name, GF_LOG_INFO, - "failed to set pending entry"); + priv = this->private; + local = frame->local; + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret != -1) { + /* Operation succeeded on at least on subvol, + so it is not a failed-everywhere situation. + */ + matching_errors = _gf_false; + break; + } + i_errno = local->replies[i].op_errno; + + if (i_errno == ENOTCONN) { + /* ENOTCONN is not a symmetric error. We do not + know if the operation was performed on the + backend or not. + */ + matching_errors = _gf_false; + break; + } + + if (!op_errno) { + op_errno = i_errno; + } else if (op_errno != i_errno) { + /* Mismatching op_errno's */ + matching_errors = _gf_false; + break; + } + } + + if (matching_errors) + __mark_all_success (frame, this); } -gf_boolean_t -afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this) +int +afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int index = -1; - int i = 0; + afr_private_t * priv = this->private; + int i = 0; + int ret = 0; + int idx = 0; + afr_local_t * local = NULL; + dict_t *xattr = NULL; + int nothing_failed = 1; + gf_boolean_t need_undirty = _gf_false; local = frame->local; - priv = this->private; + idx = afr_index_for_transaction_type (local->transaction.type); - index = afr_index_for_transaction_type (local->transaction.type); + nothing_failed = afr_txn_nothing_failed (frame, this); - for (i = 0; i < priv->child_count; i++) { - if (local->pending[i][index] == 0) - return _gf_false; - } + if (afr_changelog_pre_op_uninherit (frame, this)) + need_undirty = _gf_false; + else + need_undirty = _gf_true; - return _gf_true; -} + if (nothing_failed && !need_undirty) { + afr_changelog_post_op_done (frame, this); + goto out; + } -static void -afr_dir_fop_handle_all_fop_failures (call_frame_t *frame) -{ - xlator_t *this = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + xattr = dict_new (); + if (!xattr) { + local->op_ret = -1; + local->op_errno = ENOMEM; + afr_changelog_post_op_done (frame, this); + goto out; + } - this = frame->this; - local = frame->local; - priv = this->private; + if (need_undirty) { + local->dirty[idx] = hton32(-1); - if ((local->transaction.type != AFR_ENTRY_TRANSACTION) && - (local->transaction.type != AFR_ENTRY_RENAME_TRANSACTION)) - return; + ret = dict_set_static_bin (xattr, AFR_DIRTY, local->dirty, + sizeof(int) * AFR_NUM_CHANGE_LOGS); + if (ret) { + local->op_ret = -1; + local->op_errno = ENOMEM; + afr_changelog_post_op_done (frame, this); + goto out; + } - if (local->op_ret >= 0) - goto out; + } + + if (!nothing_failed) { + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.failed_subvols[i]) + local->pending[i][idx] = hton32(1); + } + ret = afr_set_pending_dict (priv, xattr, local->pending); + if (ret < 0) { + local->op_ret = -1; + local->op_errno = ENOMEM; + afr_changelog_post_op_done (frame, this); + goto out; + } + + } - __mark_all_success (local->pending, priv->child_count, - local->transaction.type); + afr_changelog_do (frame, this, xattr, afr_changelog_post_op_done); out: - return; + if (xattr) + dict_unref (xattr); + + return 0; } -static void -afr_data_handle_quota_errors (call_frame_t *frame, xlator_t *this) + +gf_boolean_t +afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this) { - int i = 0; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - gf_boolean_t all_quota_failures = _gf_false; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + fd_t *fd = NULL; + int i = 0; + gf_boolean_t ret = _gf_false; + afr_fd_ctx_t *fd_ctx = NULL; + int type = 0; - local = frame->local; - priv = this->private; - if (local->transaction.type != AFR_DATA_TRANSACTION) - return; - /* - * Idea is to not leave the file in FOOL-FOOL scenario in case on - * all the bricks data transaction failed with EDQUOT to avoid - * increasing un-necessary load of self-heals in the system. - */ - all_quota_failures = _gf_true; - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i] && - (local->child_errno[i] != EDQUOT)) { - all_quota_failures = _gf_false; - break; - } - } - if (all_quota_failures) - __mark_all_success (local->pending, priv->child_count, - local->transaction.type); + local = frame->local; + priv = this->private; + fd = local->fd; + + type = afr_index_for_transaction_type (local->transaction.type); + if (type != AFR_DATA_TRANSACTION) + return !local->transaction.dirtied; + + if (!fd) + return !local->transaction.dirtied; + + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) + return _gf_false; + + if (local->transaction.no_uninherit) + return _gf_false; + + /* This function must be idempotent. So check if we + were called before and return the same answer again. + + It is important to keep this function idempotent for + the call in afr_changelog_post_op_safe() to not have + side effects on the call from afr_changelog_post_op_now() + */ + if (local->transaction.uninherit_done) + return local->transaction.uninherit_value; + + LOCK(&fd->lock); + { + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i] != + fd_ctx->pre_op_done[type][i]) { + ret = !local->transaction.dirtied; + goto unlock; + } + } + + if (fd_ctx->inherited[type]) { + ret = _gf_true; + fd_ctx->inherited[type]--; + } else if (fd_ctx->on_disk[type]) { + ret = _gf_false; + fd_ctx->on_disk[type]--; + } else { + /* ASSERT */ + ret = _gf_false; + } + + if (!fd_ctx->inherited[type] && !fd_ctx->on_disk[type]) { + for (i = 0; i < priv->child_count; i++) + fd_ctx->pre_op_done[type][i] = 0; + } + } +unlock: + UNLOCK(&fd->lock); + + local->transaction.uninherit_done = _gf_true; + local->transaction.uninherit_value = ret; + + return ret; } -int -afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this) + +gf_boolean_t +afr_changelog_pre_op_inherit (call_frame_t *frame, xlator_t *this) { - afr_private_t * priv = this->private; - afr_internal_lock_t *int_lock = NULL; - int i = 0; - int call_count = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + fd_t *fd = NULL; + int i = 0; + gf_boolean_t ret = _gf_false; + afr_fd_ctx_t *fd_ctx = NULL; + int type = 0; - afr_local_t * local = NULL; - afr_fd_ctx_t *fdctx = NULL; - dict_t **xattr = NULL; - int piggyback = 0; - int nothing_failed = 1; + local = frame->local; + priv = this->private; + fd = local->fd; - local = frame->local; - int_lock = &local->internal_lock; + if (local->transaction.type != AFR_DATA_TRANSACTION) + return _gf_false; - __mark_non_participant_children (local->pending, priv->child_count, - local->transaction.pre_op, - local->transaction.type); + type = afr_index_for_transaction_type (local->transaction.type); - afr_data_handle_quota_errors (frame, this); - afr_dir_fop_handle_all_fop_failures (frame); + if (!fd) + return _gf_false; - if (local->fd) - afr_transaction_rm_stale_children (frame, this, - local->fd->inode, - local->transaction.type); + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) + return _gf_false; - xattr = alloca (priv->child_count * sizeof (*xattr)); - memset (xattr, 0, (priv->child_count * sizeof (*xattr))); - for (i = 0; i < priv->child_count; i++) { - xattr[i] = dict_new (); - } + LOCK(&fd->lock); + { + if (!fd_ctx->on_disk[type]) { + /* nothing to inherit yet */ + ret = _gf_false; + goto unlock; + } - call_count = afr_changelog_post_op_call_count (local->transaction.type, - local->transaction.pre_op, - priv->child_count); - local->call_count = call_count; + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i] != + fd_ctx->pre_op_done[type][i]) { + /* either inherit exactly, or don't */ + ret = _gf_false; + goto unlock; + } + } - if (local->fd) - fdctx = afr_fd_ctx_get (local->fd, this); + fd_ctx->inherited[type]++; - if (call_count == 0) { - /* no child is up */ - int_lock->lock_cbk = local->transaction.done; - afr_unlock (frame, this); - goto out; - } + ret = _gf_true; - nothing_failed = afr_txn_nothing_failed (frame, this); + local->transaction.inherited = _gf_true; + } +unlock: + UNLOCK(&fd->lock); - afr_compute_txn_changelog (local , priv); + return ret; +} - for (i = 0; i < priv->child_count; i++) { - if (!local->transaction.pre_op[i]) - continue; - if (local->transaction.type != AFR_DATA_TRANSACTION) - afr_set_postop_dict (local, this, xattr[i], - local->optimistic_change_log, i); - switch (local->transaction.type) { - case AFR_DATA_TRANSACTION: - { - if (!fdctx) { - afr_set_postop_dict (local, this, xattr[i], - 0, i); - STACK_WIND (frame, afr_changelog_post_op_cbk, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->loc, - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - break; - } +gf_boolean_t +afr_changelog_pre_op_update (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + fd_t *fd = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + int i = 0; + gf_boolean_t ret = _gf_false; + int type = 0; - /* local->transaction.postop_piggybacked[] was - precomputed in is_piggyback_postop() when called from - afr_changelog_post_op_safe() - */ + local = frame->local; + priv = this->private; + fd = local->fd; - piggyback = 0; - if (local->transaction.postop_piggybacked[i]) - piggyback = 1; + if (!fd) + return _gf_false; - afr_set_postop_dict (local, this, xattr[i], - piggyback, i); + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) + return _gf_false; - if (nothing_failed && piggyback) { - afr_changelog_post_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i], NULL); - } else { - STACK_WIND_COOKIE (frame, - afr_changelog_post_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fxattrop, - local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - } - } - break; - case AFR_METADATA_TRANSACTION: - { - if (nothing_failed && local->optimistic_change_log) { - afr_changelog_post_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i], - NULL); - break; - } + if (local->transaction.inherited) + /* was already inherited in afr_changelog_pre_op */ + return _gf_false; - if (local->fd) - STACK_WIND (frame, afr_changelog_post_op_cbk, - priv->children[i], - priv->children[i]->fops->fxattrop, - local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - else - STACK_WIND (frame, afr_changelog_post_op_cbk, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->loc, - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - } - break; + if (!local->transaction.dirtied) + return _gf_false; - case AFR_ENTRY_RENAME_TRANSACTION: - { - if (nothing_failed && local->optimistic_change_log) { - afr_changelog_post_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i], - NULL); - } else { - STACK_WIND_COOKIE (frame, afr_changelog_post_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->transaction.new_parent_loc, - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - } - call_count--; - } + if (!afr_txn_nothing_failed (frame, this)) + return _gf_false; - /* - set it again because previous stack_wind - might have already returned (think of case - where subvolume is posix) and would have - used the dict as placeholder for return - value - */ + type = afr_index_for_transaction_type (local->transaction.type); - afr_set_postop_dict (local, this, xattr[i], - local->optimistic_change_log, i); + ret = _gf_false; - /* fall through */ + LOCK(&fd->lock); + { + if (!fd_ctx->on_disk[type]) { + for (i = 0; i < priv->child_count; i++) + fd_ctx->pre_op_done[type][i] = + local->transaction.pre_op[i]; + } else { + for (i = 0; i < priv->child_count; i++) + if (fd_ctx->pre_op_done[type][i] != + local->transaction.pre_op[i]) { + local->transaction.no_uninherit = 1; + goto unlock; + } + } + fd_ctx->on_disk[type]++; + + ret = _gf_true; + } +unlock: + UNLOCK(&fd->lock); - case AFR_ENTRY_TRANSACTION: - { - if (nothing_failed && local->optimistic_change_log) { - afr_changelog_post_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i], - NULL); - break; - } + return ret; +} - if (local->fd) - STACK_WIND (frame, afr_changelog_post_op_cbk, - priv->children[i], - priv->children[i]->fops->fxattrop, - local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - else - STACK_WIND (frame, afr_changelog_post_op_cbk, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->transaction.parent_loc, - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - } - break; - } - if (!--call_count) - break; - } +int +afr_changelog_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) +{ + afr_local_t *local = NULL; + int call_count = -1; -out: - for (i = 0; i < priv->child_count; i++) { - dict_unref (xattr[i]); - } + local = frame->local; + + if (op_ret == -1) + afr_transaction_fop_failed (frame, this, (long) cookie); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + local->transaction.changelog_resume (frame, this); return 0; } -int32_t -afr_changelog_pre_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xattr, - dict_t *xdata) +int +afr_changelog_do (call_frame_t *frame, xlator_t *this, dict_t *xattr, + afr_changelog_resume_t changelog_resume) { - afr_local_t * local = NULL; - afr_private_t * priv = this->private; - int call_count = -1; - int child_index = (long) cookie; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + int call_count = 0; - local = frame->local; + local = frame->local; + priv = this->private; - LOCK (&frame->lock); - { - switch (op_ret) { - case 0: - __mark_pre_op_done_on_fd (frame, this, child_index); - //fallthrough we need to mark the pre_op - case 1: - local->transaction.pre_op[child_index] = 1; - /* special op_ret for piggyback */ - break; - case -1: - if (op_errno == ENOTSUP) { - gf_log (this->name, GF_LOG_ERROR, - "xattrop not supported by %s", - priv->children[child_index]->name); - local->op_ret = -1; - - } else if (!child_went_down (op_ret, op_errno)) { - gf_log (this->name, GF_LOG_ERROR, - "xattrop failed on child %s: %s", - priv->children[child_index]->name, - strerror (op_errno)); + call_count = afr_changelog_call_count (local->transaction.type, + local->transaction.pre_op, + priv->child_count); + + if (call_count == 0) { + changelog_resume (frame, this); + return 0; + } + + local->call_count = call_count; + + local->transaction.changelog_resume = changelog_resume; + + for (i = 0; i < priv->child_count; i++) { + if (!local->transaction.pre_op[i]) + continue; + + switch (local->transaction.type) { + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: + if (!local->fd) { + STACK_WIND_COOKIE (frame, afr_changelog_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->loc, + GF_XATTROP_ADD_ARRAY, xattr, + NULL); + } else { + STACK_WIND_COOKIE (frame, afr_changelog_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->fxattrop, + local->fd, + GF_XATTROP_ADD_ARRAY, xattr, + NULL); } - local->op_errno = op_errno; - break; - } + break; + case AFR_ENTRY_RENAME_TRANSACTION: - call_count = --local->call_count; - } - UNLOCK (&frame->lock); + STACK_WIND_COOKIE (frame, afr_changelog_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->transaction.new_parent_loc, + GF_XATTROP_ADD_ARRAY, xattr, + NULL); + call_count--; - if (call_count == 0) { - if ((local->op_ret == -1) && - (local->op_errno == ENOTSUP)) { - local->transaction.resume (frame, this); - } else { - afr_transaction_perform_fop (frame, this); - } + /* fall through */ + + case AFR_ENTRY_TRANSACTION: + if (local->fd) + STACK_WIND_COOKIE (frame, afr_changelog_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->fxattrop, + local->fd, + GF_XATTROP_ADD_ARRAY, xattr, + NULL); + else + STACK_WIND_COOKIE (frame, afr_changelog_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->transaction.parent_loc, + GF_XATTROP_ADD_ARRAY, xattr, + NULL); + break; + } + + if (!--call_count) + break; } - return 0; + return 0; } + int afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) { @@ -902,206 +816,122 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) int i = 0; int ret = 0; int call_count = 0; - dict_t **xattr = NULL; - afr_fd_ctx_t *fdctx = NULL; + int op_errno = 0; afr_local_t *local = NULL; - int piggyback = 0; afr_internal_lock_t *int_lock = NULL; unsigned char *locked_nodes = NULL; + unsigned char *pending_subvols = NULL; + int idx = -1; + gf_boolean_t pre_nop = _gf_true; + dict_t *xdata_req = NULL; local = frame->local; int_lock = &local->internal_lock; - - xattr = alloca (priv->child_count * sizeof (*xattr)); - memset (xattr, 0, (priv->child_count * sizeof (*xattr))); - - for (i = 0; i < priv->child_count; i++) { - xattr[i] = dict_new (); - } - - call_count = afr_changelog_pre_op_call_count (local->transaction.type, - int_lock, - priv->child_count); - if (call_count == 0) { - local->internal_lock.lock_cbk = - local->transaction.done; - afr_unlock (frame, this); - goto out; - } - - local->call_count = call_count; - - __mark_all_pending (local->pending, priv->child_count, - local->transaction.type); - - if (local->fd) - fdctx = afr_fd_ctx_get (local->fd, this); + idx = afr_index_for_transaction_type (local->transaction.type); locked_nodes = afr_locked_nodes_get (local->transaction.type, int_lock); - for (i = 0; i < priv->child_count; i++) { - if (!locked_nodes[i]) - continue; - ret = afr_set_pending_dict (priv, xattr[i], local->pending, - i, LOCAL_FIRST); - - if (ret < 0) - gf_log (this->name, GF_LOG_INFO, - "failed to set pending entry"); + pending_subvols = alloca0 (priv->child_count); - switch (local->transaction.type) { - case AFR_DATA_TRANSACTION: - { - if (!fdctx) { - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &(local->loc), - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - break; - } + for (i = 0; i < priv->child_count; i++) { + if (locked_nodes[i]) { + local->transaction.pre_op[i] = 1; + call_count++; + } else { + pending_subvols[i] = 1; + } + } - LOCK (&local->fd->lock); - { - piggyback = 0; - if (fdctx->pre_op_done[i]) { - fdctx->pre_op_piggyback[i]++; - piggyback = 1; - fdctx->hit++; - } else { - fdctx->miss++; - } - } - UNLOCK (&local->fd->lock); + /* TBD: quorum check w/ call_count */ - afr_set_delayed_post_op (frame, this); + if (call_count == 0) { + op_errno = ENOTCONN; + goto err; + } - if (piggyback) - afr_changelog_pre_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i], - NULL); - else - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fxattrop, - local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - } - break; - case AFR_METADATA_TRANSACTION: - { - if (local->optimistic_change_log) { - afr_changelog_pre_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i], - NULL); - break; - } + xdata_req = dict_new(); + if (!xdata_req) { + op_errno = ENOMEM; + goto err; + } - if (local->fd) - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fxattrop, - local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - else - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &(local->loc), - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - } - break; + pre_nop = _gf_true; + + if (afr_changelog_pre_op_inherit (frame, this)) + goto next; + + if (call_count < priv->child_count) { + /* For subvols we are not performing operation on, + mark them as pending up-front along with the FOP + so that we can safely defer unmarking dirty until + later. + */ + for (i = 0; i < priv->child_count; i++) { + if (pending_subvols[i]) + local->pending[i][idx] = hton32(1); + } + ret = afr_set_pending_dict (priv, xdata_req, + local->pending); + if (ret < 0) { + op_errno = ENOMEM; + goto err; + } + pre_nop = _gf_false; + } - case AFR_ENTRY_RENAME_TRANSACTION: - { - if (local->optimistic_change_log) { - afr_changelog_pre_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i], - NULL); - } else { - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->transaction.new_parent_loc, - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - } + if (call_count > 1 && + (local->transaction.type == AFR_DATA_TRANSACTION || + !local->optimistic_change_log)) { + + /* If we are performing change on only one subvol, no + need to mark dirty, because we are setting the pending + counts already anyways + */ + local->dirty[idx] = hton32(1); + + ret = dict_set_static_bin (xdata_req, AFR_DIRTY, local->dirty, + sizeof(int) * AFR_NUM_CHANGE_LOGS); + if (ret) { + op_errno = ENOMEM; + goto err; + } + + pre_nop = _gf_false; + local->transaction.dirtied = 1; + } - call_count--; - } + if (pre_nop) + goto next; + if (!local->pre_op_compat) { + dict_copy (xdata_req, local->xdata_req); + goto next; + } - /* - set it again because previous stack_wind - might have already returned (think of case - where subvolume is posix) and would have - used the dict as placeholder for return - value - */ + afr_changelog_do (frame, this, xdata_req, afr_transaction_perform_fop); - ret = afr_set_pending_dict (priv, xattr[i], local->pending, - i, LOCAL_FIRST); + if (xdata_req) + dict_unref (xdata_req); - if (ret < 0) - gf_log (this->name, GF_LOG_INFO, - "failed to set pending entry"); + return 0; +next: + afr_transaction_perform_fop (frame, this); - /* fall through */ + if (xdata_req) + dict_unref (xdata_req); - case AFR_ENTRY_TRANSACTION: - { - if (local->optimistic_change_log) { - afr_changelog_pre_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i], - NULL); - break; - } + return 0; +err: + local->internal_lock.lock_cbk = local->transaction.done; + local->op_ret = -1; + local->op_errno = op_errno; - if (local->fd) - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fxattrop, - local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - else - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->transaction.parent_loc, - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - } - break; - } + afr_unlock (frame, this); - if (!--call_count) - break; - } -out: - for (i = 0; i < priv->child_count; i++) { - dict_unref (xattr[i]); - } + if (xdata_req) + dict_unref (xdata_req); - return 0; + return 0; } @@ -1365,15 +1195,15 @@ afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this) } gf_boolean_t -afr_are_multiple_fds_opened (inode_t *inode, xlator_t *this) +afr_are_multiple_fds_opened (fd_t *fd, xlator_t *this) { - afr_inode_ctx_t *ictx = NULL; + afr_fd_ctx_t *fd_ctx = NULL; - if (!inode) { + if (!fd) { /* If false is returned, it may keep on taking eager-lock * which may lead to starvation, so return true to avoid that. */ - gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid inode"); + gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid fd"); return _gf_true; } /* Lets say mount1 has eager-lock(full-lock) and after the eager-lock @@ -1383,32 +1213,22 @@ afr_are_multiple_fds_opened (inode_t *inode, xlator_t *this) * if open-fd-count is > 1 */ - ictx = afr_inode_ctx_get (inode, this); - if (!ictx) + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) return _gf_true; - if (ictx->open_fd_count > 1) + if (fd_ctx->open_fd_count > 1) return _gf_true; return _gf_false; } -gf_boolean_t -afr_any_fops_failed (afr_local_t *local, afr_private_t *priv) -{ - if (local->success_count != priv->child_count) - return _gf_true; - return _gf_false; -} gf_boolean_t is_afr_delayed_changelog_post_op_needed (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; gf_boolean_t res = _gf_false; - afr_private_t *priv = NULL; - - priv = this->private; local = frame->local; if (!local) @@ -1418,10 +1238,10 @@ is_afr_delayed_changelog_post_op_needed (call_frame_t *frame, xlator_t *this) goto out; //Mark pending changelog ASAP - if (afr_any_fops_failed (local, priv)) + if (!afr_txn_nothing_failed (frame, this)) goto out; - if (local->fd && afr_are_multiple_fds_opened (local->fd->inode, this)) + if (local->fd && afr_are_multiple_fds_opened (local->fd, this)) goto out; res = _gf_true; @@ -1445,58 +1265,6 @@ afr_delayed_changelog_wake_up_cbk (void *data) } -/* - Check if the frame is destined to get optimized away - with changelog piggybacking -*/ -static gf_boolean_t -is_piggyback_post_op (call_frame_t *frame, fd_t *fd) -{ - afr_fd_ctx_t *fdctx = NULL; - afr_local_t *local = NULL; - gf_boolean_t piggyback = _gf_true; - afr_private_t *priv = NULL; - int i = 0; - - priv = frame->this->private; - local = frame->local; - fdctx = afr_fd_ctx_get (fd, frame->this); - - LOCK(&fd->lock); - { - piggyback = _gf_true; - - for (i = 0; i < priv->child_count; i++) { - if (!local->transaction.pre_op[i]) - continue; - if (fdctx->pre_op_piggyback[i]) { - fdctx->pre_op_piggyback[i]--; - local->transaction.postop_piggybacked[i] = 1; - } else { - /* For at least _one_ subvolume we cannot - piggyback on the changelog, and have to - perform a hard POST-OP and therefore fsync - if necesssary - */ - piggyback = _gf_false; - GF_ASSERT (fdctx->pre_op_done[i]); - fdctx->pre_op_done[i]--; - } - } - } - UNLOCK(&fd->lock); - - if (!afr_txn_nothing_failed (frame, frame->this)) { - /* something failed in this transaction, - we will be performing a hard post-op - */ - return _gf_false; - } - - return piggyback; -} - - /* SET operation */ int afr_fd_report_unstable_write (xlator_t *this, fd_t *fd) @@ -1521,7 +1289,7 @@ afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd) afr_fd_ctx_t *fdctx = NULL; gf_boolean_t witness = _gf_false; - fdctx = afr_fd_ctx_get (fd, this); + fdctx = afr_fd_ctx_get (fd, this); if (!fdctx) return _gf_true; @@ -1551,10 +1319,10 @@ afr_changelog_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, priv = this->private; local = frame->local; - if (afr_fop_failed (op_ret, op_errno)) { + if (op_ret != 0) { /* Failure of fsync() is as good as failure of previous write(). So treat it like one. - */ + */ gf_log (this->name, GF_LOG_WARNING, "fsync(%s) failed on subvolume %s. Transaction was %s", uuid_utoa (local->fd->inode->gfid), @@ -1562,14 +1330,14 @@ afr_changelog_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, gf_fop_list[local->op]); afr_transaction_fop_failed (frame, this, child_index); - } + } - call_count = afr_frame_return (frame); + call_count = afr_frame_return (frame); - if (call_count == 0) - afr_changelog_post_op_now (frame, this); + if (call_count == 0) + afr_changelog_post_op_now (frame, this); - return 0; + return 0; } @@ -1580,14 +1348,13 @@ afr_changelog_fsync (call_frame_t *frame, xlator_t *this) int i = 0; int call_count = 0; afr_private_t *priv = NULL; - dict_t *xdata = NULL; - GF_UNUSED int ret = -1; + dict_t *xdata = NULL; + GF_UNUSED int ret = -1; local = frame->local; priv = this->private; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); + call_count = AFR_COUNT (local->transaction.pre_op, priv->child_count); if (!call_count) { /* will go straight to unlock */ @@ -1597,30 +1364,30 @@ afr_changelog_fsync (call_frame_t *frame, xlator_t *this) local->call_count = call_count; - xdata = dict_new(); - if (xdata) - ret = dict_set_int32 (xdata, "batch-fsync", 1); + xdata = dict_new(); + if (xdata) + ret = dict_set_int32 (xdata, "batch-fsync", 1); for (i = 0; i < priv->child_count; i++) { if (!local->transaction.pre_op[i]) continue; STACK_WIND_COOKIE (frame, afr_changelog_fsync_cbk, - (void *) (long) i, priv->children[i], - priv->children[i]->fops->fsync, local->fd, - 1, xdata); + (void *) (long) i, priv->children[i], + priv->children[i]->fops->fsync, local->fd, + 1, xdata); if (!--call_count) break; } - if (xdata) - dict_unref (xdata); + if (xdata) + dict_unref (xdata); return 0; } - int +int afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; @@ -1634,7 +1401,8 @@ afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this) return 0; } - if (is_piggyback_post_op (frame, local->fd)) { + if (afr_changelog_pre_op_uninherit (frame, this) && + afr_txn_nothing_failed (frame, this)) { /* just detected that this post-op is about to be optimized away as a new write() has already piggybacked on this frame's changelog. @@ -1733,7 +1501,7 @@ out: if (prev_frame) { local = prev_frame->local; local->transaction.resume_stub = stub; - afr_changelog_post_op_safe (prev_frame, this); + afr_changelog_post_op_now (prev_frame, this); } else if (stub) { call_resume (stub); } @@ -1779,13 +1547,9 @@ afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd) int afr_transaction_resume (call_frame_t *frame, xlator_t *this) { - afr_internal_lock_t *int_lock = NULL; afr_local_t *local = NULL; - afr_private_t *priv = NULL; local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; if (local->transaction.eager_lock_on) { /* We don't need to retain "local" in the @@ -1800,15 +1564,17 @@ afr_transaction_resume (call_frame_t *frame, xlator_t *this) afr_restore_lk_owner (frame); + afr_handle_symmetric_errors (frame, this); + + if (!local->pre_op_compat) + /* new mode, pre-op was done along + with OP */ + afr_changelog_pre_op_update (frame, this); + if (__fop_changelog_needed (frame, this)) { afr_changelog_post_op (frame, this); } else { - if (afr_lock_server_count (priv, local->transaction.type) == 0) { - local->transaction.done (frame, this); - } else { - int_lock->lock_cbk = local->transaction.done; - afr_unlock (frame, this); - } + afr_changelog_post_op_done (frame, this); } return 0; @@ -1824,13 +1590,10 @@ afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, int child_index) { afr_local_t * local = NULL; - afr_private_t * priv = NULL; local = frame->local; - priv = this->private; - __mark_child_dead (local->pending, priv->child_count, - child_index, local->transaction.type); + local->transaction.failed_subvols[child_index] = 1; } @@ -1878,7 +1641,7 @@ afr_transaction_eager_lock_init (afr_local_t *local, xlator_t *this) if (!fdctx) return; - if (afr_are_multiple_fds_opened (local->fd->inode, this)) + if (afr_are_multiple_fds_opened (local->fd, this)) return; /* * Once full file lock is acquired in eager-lock phase, overlapping diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h index fa626fd0d6e..77cc8eed019 100644 --- a/xlators/cluster/afr/src/afr-transaction.h +++ b/xlators/cluster/afr/src/afr-transaction.h @@ -11,10 +11,7 @@ #ifndef __TRANSACTION_H__ #define __TRANSACTION_H__ -typedef enum { - LOCAL_FIRST = 1, - LOCAL_LAST = 2 -} afr_xattrop_type_t; +#include "afr.h" void afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, @@ -29,11 +26,9 @@ afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom); int32_t afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type); -afr_fd_ctx_t * -afr_fd_ctx_get (fd_t *fd, xlator_t *this); int -afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending, - int child, afr_xattrop_type_t op); +afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending); + void afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this); @@ -41,11 +36,18 @@ void afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd); void -__mark_all_success (int32_t *pending[], int child_count, - afr_transaction_type type); -gf_boolean_t -afr_any_fops_failed (afr_local_t *local, afr_private_t *priv); +__mark_all_success (call_frame_t *frame, xlator_t *this); gf_boolean_t afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this); + +int afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode, + afr_read_txn_wind_t readfn, afr_transaction_type type); + +int afr_read_txn_continue (call_frame_t *frame, xlator_t *this, int subvol); + +int __afr_txn_write_fop (call_frame_t *frame, xlator_t *this); +int __afr_txn_write_done (call_frame_t *frame, xlator_t *this); +call_frame_t *afr_transaction_detach_fop_frame (call_frame_t *frame); + #endif /* __TRANSACTION_H__ */ diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index c264538073e..5e12910b7c6 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -21,11 +21,6 @@ #endif #include "afr-common.c" -#define SHD_INODE_LRU_LIMIT 2048 -#define AFR_EH_HEALED_LIMIT 1024 -#define AFR_EH_HEAL_FAIL_LIMIT 1024 -#define AFR_EH_SPLIT_BRAIN_LIMIT 1024 - struct volume_options options[]; int32_t @@ -114,6 +109,14 @@ reconfigure (xlator_t *this, dict_t *options) priv = this->private; + GF_OPTION_RECONF ("afr-dirty-xattr", + priv->afr_dirty, options, str, + out); + + GF_OPTION_RECONF ("metadata-splitbrain-forced-heal", + priv->metadata_splitbrain_forced_heal, options, bool, + out); + GF_OPTION_RECONF ("background-self-heal-count", priv->background_self_heal_count, options, uint32, out); @@ -127,9 +130,6 @@ reconfigure (xlator_t *this, dict_t *options) GF_OPTION_RECONF ("entry-self-heal", priv->entry_self_heal, options, bool, out); - GF_OPTION_RECONF ("strict-readdir", priv->strict_readdir, options, bool, - out); - GF_OPTION_RECONF ("data-self-heal-window-size", priv->data_self_heal_window_size, options, uint32, out); @@ -146,8 +146,6 @@ reconfigure (xlator_t *this, dict_t *options) GF_OPTION_RECONF ("data-self-heal-algorithm", priv->data_self_heal_algorithm, options, str, out); - GF_OPTION_RECONF ("self-heal-daemon", priv->shd.enabled, options, bool, out); - GF_OPTION_RECONF ("read-subvolume", read_subvol, options, xlator, out); GF_OPTION_RECONF ("read-hash-mode", priv->hash_mode, @@ -175,13 +173,13 @@ reconfigure (xlator_t *this, dict_t *options) priv->read_child = index; } + GF_OPTION_RECONF ("pre-op-compat", priv->pre_op_compat, options, bool, out); + GF_OPTION_RECONF ("eager-lock", priv->eager_lock, options, bool, out); GF_OPTION_RECONF ("quorum-type", qtype, options, str, out); GF_OPTION_RECONF ("quorum-count", priv->quorum_count, options, uint32, out); fix_quorum_options(this,priv,qtype); - GF_OPTION_RECONF ("heal-timeout", priv->shd.timeout, options, - int32, out); GF_OPTION_RECONF ("post-op-delay-secs", priv->post_op_delay_secs, options, uint32, out); @@ -189,10 +187,15 @@ reconfigure (xlator_t *this, dict_t *options) GF_OPTION_RECONF (AFR_SH_READDIR_SIZE_KEY, priv->sh_readdir_size, options, size, out); /* Reset this so we re-discover in case the topology changed. */ - GF_OPTION_RECONF ("readdir-failover", priv->readdir_failover, options, - bool, out); GF_OPTION_RECONF ("ensure-durability", priv->ensure_durability, options, bool, out); + + GF_OPTION_RECONF ("self-heal-daemon", priv->shd.enabled, options, + bool, out); + + GF_OPTION_RECONF ("iam-self-heal-daemon", priv->shd.iamshd, options, + bool, out); + priv->did_discovery = _gf_false; ret = 0; @@ -244,10 +247,6 @@ init (xlator_t *this) priv = this->private; LOCK_INIT (&priv->lock); - LOCK_INIT (&priv->read_child_lock); - //lock recovery is not done in afr - pthread_mutex_init (&priv->mutex, NULL); - INIT_LIST_HEAD (&priv->saved_fds); child_count = xlator_subvolume_count (this); @@ -255,6 +254,11 @@ init (xlator_t *this) priv->read_child = -1; + GF_OPTION_INIT ("afr-dirty-xattr", priv->afr_dirty, str, out); + + GF_OPTION_INIT ("metadata-splitbrain-forced-heal", + priv->metadata_splitbrain_forced_heal, bool, out); + GF_OPTION_INIT ("read-subvolume", read_subvol, xlator, out); if (read_subvol) { priv->read_child = xlator_subvolume_index (this, read_subvol); @@ -308,10 +312,6 @@ init (xlator_t *this) GF_OPTION_INIT ("entry-self-heal", priv->entry_self_heal, bool, out); - GF_OPTION_INIT ("self-heal-daemon", priv->shd.enabled, bool, out); - - GF_OPTION_INIT ("iam-self-heal-daemon", priv->shd.iamshd, bool, out); - GF_OPTION_INIT ("data-change-log", priv->data_change_log, bool, out); GF_OPTION_INIT ("metadata-change-log", priv->metadata_change_log, bool, @@ -326,7 +326,7 @@ init (xlator_t *this) GF_OPTION_INIT ("entrylk-trace", priv->entrylk_trace, bool, out); - GF_OPTION_INIT ("strict-readdir", priv->strict_readdir, bool, out); + GF_OPTION_INIT ("pre-op-compat", priv->pre_op_compat, bool, out); GF_OPTION_INIT ("eager-lock", priv->eager_lock, bool, out); GF_OPTION_INIT ("quorum-type", qtype, str, out); @@ -336,10 +336,13 @@ init (xlator_t *this) fix_quorum_options(this,priv,qtype); GF_OPTION_INIT ("post-op-delay-secs", priv->post_op_delay_secs, uint32, out); - GF_OPTION_INIT ("readdir-failover", priv->readdir_failover, bool, out); GF_OPTION_INIT ("ensure-durability", priv->ensure_durability, bool, out); + GF_OPTION_INIT ("self-heal-daemon", priv->shd.enabled, bool, out); + + GF_OPTION_INIT ("iam-self-heal-daemon", priv->shd.iamshd, bool, out); + priv->wait_count = 1; priv->child_up = GF_CALLOC (sizeof (unsigned char), child_count, @@ -402,6 +405,12 @@ init (xlator_t *this) goto out; } + ret = afr_selfheal_daemon_init (this); + if (ret) { + ret = -ENOMEM; + goto out; + } + /* keep more local here as we may need them for self-heal etc */ this->local_pool = mem_pool_new (afr_local_t, 512); if (!this->local_pool) { @@ -411,58 +420,8 @@ init (xlator_t *this) goto out; } - priv->first_lookup = 1; priv->root_inode = NULL; - if (!priv->shd.iamshd) { - ret = 0; - goto out; - } - - ret = -ENOMEM; - priv->shd.pos = GF_CALLOC (sizeof (*priv->shd.pos), child_count, - gf_afr_mt_brick_pos_t); - if (!priv->shd.pos) - goto out; - - priv->shd.pending = GF_CALLOC (sizeof (*priv->shd.pending), child_count, - gf_afr_mt_int32_t); - if (!priv->shd.pending) - goto out; - - priv->shd.inprogress = GF_CALLOC (sizeof (*priv->shd.inprogress), - child_count, gf_afr_mt_shd_bool_t); - if (!priv->shd.inprogress) - goto out; - priv->shd.timer = GF_CALLOC (sizeof (*priv->shd.timer), child_count, - gf_afr_mt_shd_timer_t); - if (!priv->shd.timer) - goto out; - - priv->shd.healed = eh_new (AFR_EH_HEALED_LIMIT, _gf_false, - _destroy_shd_event_data); - if (!priv->shd.healed) - goto out; - - priv->shd.heal_failed = eh_new (AFR_EH_HEAL_FAIL_LIMIT, _gf_false, - _destroy_shd_event_data); - if (!priv->shd.heal_failed) - goto out; - - priv->shd.split_brain = eh_new (AFR_EH_SPLIT_BRAIN_LIMIT, _gf_false, - _destroy_shd_event_data); - if (!priv->shd.split_brain) - goto out; - - this->itable = inode_table_new (SHD_INODE_LRU_LIMIT, this); - if (!this->itable) - goto out; - priv->root_inode = inode_ref (this->itable->root); - GF_OPTION_INIT ("node-uuid", priv->shd.node_uuid, str, out); - GF_OPTION_INIT ("heal-timeout", priv->shd.timeout, int32, out); - ret = afr_initialise_statistics (this); - if (ret) - goto out; ret = 0; out: return ret; @@ -572,11 +531,11 @@ struct volume_options options[] = { .type = GF_OPTION_TYPE_INT, .min = 0, .max = 2, - .default_value = "0", + .default_value = "1", .description = "inode-read fops happen only on one of the bricks in " "replicate. AFR will prefer the one computed using " "the method specified using this option" - "0 = first responder, " + "0 = first up server, " "1 = hash by GFID of file (all clients use " "same subvolume), " "2 = hash by GFID of file and client PID", @@ -585,7 +544,7 @@ struct volume_options options[] = { .type = GF_OPTION_TYPE_BOOL, .default_value = "true", .description = "Choose a local subvolume (i.e. Brick) to read from" - " if read-subvolume is not explicitly set.", + " if read-subvolume is not explicitly set.", }, { .key = {"favorite-child"}, .type = GF_OPTION_TYPE_XLATOR, @@ -675,10 +634,6 @@ struct volume_options options[] = { "pre fop changelog operations in afr transaction " "if this option is enabled." }, - { .key = {"strict-readdir"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "off", - }, { .key = {"inodelk-trace"}, .type = GF_OPTION_TYPE_BOOL, .default_value = "off", @@ -689,6 +644,12 @@ struct volume_options options[] = { .default_value = "off", .description = "Enabling this option logs entry lock/unlocks" }, + { .key = {"pre-op-compat"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = "Use separate pre-op xattrop() FOP rather than " + "overloading xdata of the OP" + }, { .key = {"eager-lock"}, .type = GF_OPTION_TYPE_BOOL, .default_value = "on", @@ -753,14 +714,6 @@ struct volume_options options[] = { "self-heal-daemon so that it can crawl only on " "local index directories.", }, - { .key = {"heal-timeout"}, - .type = GF_OPTION_TYPE_INT, - .min = 60, - .max = INT_MAX, - .default_value = "600", - .description = "time interval for checking the need to self-heal " - "in self-heal-daemon" - }, { .key = {"post-op-delay-secs"}, .type = GF_OPTION_TYPE_INT, .min = 0, @@ -777,11 +730,6 @@ struct volume_options options[] = { .max = 131072, .default_value = "1KB", }, - { .key = {"readdir-failover"}, - .type = GF_OPTION_TYPE_BOOL, - .description = "readdir(p) will not failover if this option is off", - .default_value = "on", - }, { .key = {"ensure-durability"}, .type = GF_OPTION_TYPE_BOOL, .description = "Afr performs fsyncs for transactions if this " @@ -789,5 +737,13 @@ struct volume_options options[] = { "written to the disk", .default_value = "on", }, + { .key = {"afr-dirty-xattr"}, + .type = GF_OPTION_TYPE_STR, + .default_value = AFR_DIRTY_DEFAULT, + }, + { .key = {"metadata-splitbrain-forced-heal"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + }, { .key = {NULL} }, }; diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 9196a1f271d..2e1b78d1c9f 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -20,112 +20,42 @@ #include "call-stub.h" #include "compat-errno.h" #include "afr-mem-types.h" -#include "afr-self-heal-algorithm.h" #include "libxlator.h" #include "timer.h" +#include "syncop.h" + +#include "afr-self-heald.h" #define AFR_XATTR_PREFIX "trusted.afr" #define AFR_PATHINFO_HEADER "REPLICATE:" #define AFR_SH_READDIR_SIZE_KEY "self-heal-readdir-size" #define AFR_SH_DATA_DOMAIN_FMT "%s:self-heal" +#define AFR_DIRTY_DEFAULT AFR_XATTR_PREFIX ".dirty" +#define AFR_DIRTY (((afr_private_t *) (THIS->private))->afr_dirty) #define AFR_LOCKEE_COUNT_MAX 3 #define AFR_DOM_COUNT_MAX 3 - -#define afr_inode_missing(op_errno) (op_errno == ENOENT || op_errno == ESTALE) - -struct _pump_private; - -typedef int (*afr_expunge_done_cbk_t) (call_frame_t *frame, xlator_t *this, - int child, int32_t op_error, - int32_t op_errno); - -typedef int (*afr_impunge_done_cbk_t) (call_frame_t *frame, xlator_t *this, - int32_t op_error, int32_t op_errno); -typedef int (*afr_post_remove_call_t) (call_frame_t *frame, xlator_t *this); +#define AFR_NUM_CHANGE_LOGS 3 /*data + metadata + entry*/ typedef int (*afr_lock_cbk_t) (call_frame_t *frame, xlator_t *this); -typedef void (*afr_lookup_done_cbk_t) (call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno); -typedef enum { - AFR_POS_UNKNOWN, - AFR_POS_LOCAL, - AFR_POS_REMOTE -} afr_child_pos_t; +typedef int (*afr_read_txn_wind_t) (call_frame_t *frame, xlator_t *this, int subvol); -typedef enum { - SPLIT_BRAIN = 1, - ALL_FOOLS = 2 -} afr_subvol_status_t; +typedef int (*afr_inode_refresh_cbk_t) (call_frame_t *frame, xlator_t *this, int err); -typedef enum { - AFR_INODE_SET_READ_CTX = 1, - AFR_INODE_RM_STALE_CHILDREN, - AFR_INODE_SET_OPENDIR_DONE, - AFR_INODE_GET_READ_CTX, - AFR_INODE_GET_OPENDIR_DONE, -} afr_inode_op_t; - -typedef struct afr_inode_params_ { - afr_inode_op_t op; - union { - gf_boolean_t value; - struct { - int32_t read_child; - int32_t *children; - } read_ctx; - } u; -} afr_inode_params_t; - -typedef enum afr_spb_state { - DONT_KNOW, - SPB, - NO_SPB -} afr_spb_state_t; - -typedef struct afr_inode_ctx_ { - uint64_t masks; - int32_t *fresh_children;//increasing order of latency - afr_spb_state_t mdata_spb; - afr_spb_state_t data_spb; - uint32_t open_fd_count; -} afr_inode_ctx_t; +typedef int (*afr_changelog_resume_t) (call_frame_t *frame, xlator_t *this); -typedef enum { - NONE, - INDEX, - INDEX_TO_BE_HEALED, - FULL, -} afr_crawl_type_t; - -typedef struct afr_self_heald_ { - gf_boolean_t enabled; - gf_boolean_t iamshd; - afr_crawl_type_t *pending; - gf_boolean_t *inprogress; - afr_child_pos_t *pos; - gf_timer_t **timer; - eh_t *healed; - eh_t *heal_failed; - eh_t *split_brain; - eh_t **statistics; - void **crawl_events; - char *node_uuid; - int timeout; -} afr_self_heald_t; +#define alloca0(size) ({void *__ptr; __ptr = alloca(size); memset(__ptr, 0, size); __ptr;}) +#define AFR_COUNT(array,max) ({int __i; int __res = 0; for (__i = 0; __i < max; __i++) if (array[__i]) __res++; __res;}) +#define AFR_INTERSECT(dst,src1,src2,max) ({int __i; for (__i = 0; __i < max; __i++) dst[__i] = src1[__i] && src2[__i];}) typedef struct _afr_private { gf_lock_t lock; /* to guard access to child_count, etc */ unsigned int child_count; /* total number of children */ - unsigned int read_child_rr; /* round-robin index of the read_child */ - gf_lock_t read_child_lock; /* lock to protect above */ - xlator_t **children; - int first_lookup; inode_t *root_inode; unsigned char *child_up; @@ -146,6 +76,7 @@ typedef struct _afr_private { gf_boolean_t metadata_change_log; /* on/off */ gf_boolean_t entry_change_log; /* on/off */ + gf_boolean_t metadata_splitbrain_forced_heal; /* on/off */ int read_child; /* read-subvolume */ unsigned int hash_mode; /* for when read_child is not set */ int favorite_child; /* subvolume to be preferred in resolving @@ -154,178 +85,45 @@ typedef struct _afr_private { gf_boolean_t inodelk_trace; gf_boolean_t entrylk_trace; - gf_boolean_t strict_readdir; - unsigned int wait_count; /* # of servers to wait for success */ uint64_t up_count; /* number of CHILD_UPs we have seen */ uint64_t down_count; /* number of CHILD_DOWNs we have seen */ - struct _pump_private *pump_private; /* Set if we are loaded as pump */ - int use_afr_in_pump; - - pthread_mutex_t mutex; - struct list_head saved_fds; /* list of fds on which locks have succeeded */ gf_boolean_t optimistic_change_log; gf_boolean_t eager_lock; + gf_boolean_t pre_op_compat; /* on/off */ uint32_t post_op_delay_secs; unsigned int quorum_count; char vol_uuid[UUID_SIZE + 1]; int32_t *last_event; - afr_self_heald_t shd; + + /* @event_generation: Keeps count of number of events received which can + potentially impact consistency decisions. The events are CHILD_UP + and CHILD_DOWN, when we have to recalculate the freshness/staleness + of copies to detect if changes had happened while the other server + was down. CHILD_DOWN and CHILD_UP can also be received on network + disconnect/reconnects and not necessarily server going down/up. + Recalculating freshness/staleness on network events is equally + important as we might have had a network split brain. + */ + uint32_t event_generation; + gf_boolean_t choose_local; gf_boolean_t did_discovery; - gf_boolean_t readdir_failover; uint64_t sh_readdir_size; gf_boolean_t ensure_durability; char *sh_domain; -} afr_private_t; - -typedef enum { - AFR_SELF_HEAL_NOT_ATTEMPTED, - AFR_SELF_HEAL_STARTED, - AFR_SELF_HEAL_FAILED, - AFR_SELF_HEAL_SYNC_BEGIN, -} afr_self_heal_status; - -typedef struct { - afr_self_heal_status gfid_or_missing_entry_self_heal; - afr_self_heal_status metadata_self_heal; - afr_self_heal_status data_self_heal; - afr_self_heal_status entry_self_heal; -} afr_sh_status_for_all_type; - -typedef enum { - AFR_SELF_HEAL_ENTRY, - AFR_SELF_HEAL_METADATA, - AFR_SELF_HEAL_DATA, - AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY, - AFR_SELF_HEAL_INVALID = -1, -} afr_self_heal_type; - -typedef enum { - AFR_CHECK_ALL, - AFR_CHECK_SPECIFIC, -} afr_sh_fail_check_type; - -struct afr_self_heal_ { - /* External interface: These are variables (some optional) that - are set by whoever has triggered self-heal */ - - gf_boolean_t do_data_self_heal; - gf_boolean_t do_metadata_self_heal; - gf_boolean_t do_entry_self_heal; - gf_boolean_t do_gfid_self_heal; - gf_boolean_t do_missing_entry_self_heal; - gf_boolean_t force_confirm_spb; /* Check for split-brains even when - self-heal is turned off */ - - gf_boolean_t forced_merge; /* Is this a self-heal triggered to - forcibly merge the directories? */ - - gf_boolean_t background; /* do self-heal in background - if possible */ - ia_type_t type; /* st_mode of the entry we're doing - self-heal on */ - inode_t *inode; /* inode on which the self-heal is - performed on */ - uuid_t sh_gfid_req; /* gfid self-heal needs to be done - with this gfid if it is not null */ - - /* Function to call to unwind. If self-heal is being done in the - background, this function will be called as soon as possible. */ - - int (*unwind) (call_frame_t *frame, xlator_t *this, int32_t op_ret, - int32_t op_errno, int32_t sh_failed); - - /* End of external interface members */ - - - /* array of stat's, one for each child */ - struct iatt *buf; - struct iatt *parentbufs; - struct iatt parentbuf; - struct iatt entrybuf; - - afr_expunge_done_cbk_t expunge_done; - afr_impunge_done_cbk_t impunge_done; - - /* array of xattr's, one for each child */ - dict_t **xattr; - - /* array containing if the lookups succeeded in the order of response - */ - int32_t *success_children; - int success_count; - /* array containing the fresh children found in the self-heal process */ - int32_t *fresh_children; - /* array containing the fresh children found in the parent lookup */ - int32_t *fresh_parent_dirs; - /* array of errno's, one for each child */ - int *child_errno; - /*loc used for lookup*/ - loc_t lookup_loc; - int32_t lookup_flags; - afr_lookup_done_cbk_t lookup_done; - - int32_t **pending_matrix; - int32_t **delta_matrix; + char *afr_dirty; - int32_t op_ret; - int32_t op_errno; + afr_self_heald_t shd; - int *sources; - int source; - int active_source; - int active_sinks; - unsigned char *success; - unsigned char *locked_nodes; - int lock_count; - - const char *linkname; - gf_boolean_t entries_skipped; - - gf_boolean_t actual_sh_started; - gf_boolean_t sync_done; - gf_boolean_t data_lock_held; - gf_boolean_t sh_dom_lock_held; - gf_boolean_t eof_reached; - fd_t *healing_fd; - int file_has_holes; - blksize_t block_size; - off_t file_size; - off_t offset; - unsigned char *write_needed; - uint8_t *checksum; - afr_post_remove_call_t post_remove_call; - - char *data_sh_info; - char *metadata_sh_info; - - loc_t parent_loc; - call_frame_t *orig_frame; - call_frame_t *old_loop_frame; - gf_boolean_t unwound; - - afr_sh_algo_private_t *private; - afr_sh_status_for_all_type afr_all_sh_status; - afr_self_heal_type sh_type_in_action; - - struct afr_sh_algorithm *algo; - afr_lock_cbk_t data_lock_success_handler; - afr_lock_cbk_t data_lock_failure_handler; - gf_boolean_t data_lock_block; - int (*completion_cbk) (call_frame_t *frame, xlator_t *this); - int (*sh_data_algo_start) (call_frame_t *frame, xlator_t *this); - int (*algo_completion_cbk) (call_frame_t *frame, xlator_t *this); - int (*algo_abort_cbk) (call_frame_t *frame, xlator_t *this); - void (*gfid_sh_success_cbk) (call_frame_t *sh_frame, xlator_t *this); - - call_frame_t *sh_frame; -}; + /* pump dependencies */ + void *pump_private; + gf_boolean_t use_afr_in_pump; +} afr_private_t; -typedef struct afr_self_heal_ afr_self_heal_t; typedef enum { AFR_DATA_TRANSACTION, /* truncate, write, ... */ @@ -438,32 +236,72 @@ typedef struct { char *domain; /* Domain on which inode/entry lock/unlock in progress.*/ } afr_internal_lock_t; -typedef struct _afr_locked_fd { - fd_t *fd; - struct list_head list; -} afr_locked_fd_t; - struct afr_reply { int valid; int32_t op_ret; int32_t op_errno; + dict_t *xdata; + struct iatt poststat; + struct iatt postparent; + struct iatt prestat; + struct iatt preparent; + struct iatt preparent2; + struct iatt postparent2; + uint8_t checksum[MD5_DIGEST_LENGTH]; }; +typedef enum { + AFR_FD_NOT_OPENED, + AFR_FD_OPENED, + AFR_FD_OPENING +} afr_fd_open_status_t; + +typedef struct { + unsigned int *pre_op_done[AFR_NUM_CHANGE_LOGS]; + int inherited[AFR_NUM_CHANGE_LOGS]; + int on_disk[AFR_NUM_CHANGE_LOGS]; + afr_fd_open_status_t *opened_on; /* which subvolumes the fd is open on */ + + unsigned int *lock_piggyback; + unsigned int *lock_acquired; + + int flags; + + /* used for delayed-post-op optimization */ + pthread_mutex_t delay_lock; + gf_timer_t *delay_timer; + call_frame_t *delay_frame; + + /* set if any write on this fd was a non stable write + (i.e, without O_SYNC or O_DSYNC) + */ + gf_boolean_t witnessed_unstable_write; + + /* @open_fd_count: + Number of open FDs queried from the server, as queried through + xdata in FOPs. Currently, used to decide if eager-locking must be + temporarily disabled. + */ + uint32_t open_fd_count; + + + /* list of frames currently in progress */ + struct list_head eager_locked; +} afr_fd_ctx_t; + + typedef struct _afr_local { - int uid; - int gid; + glusterfs_fop_t op; unsigned int call_count; - unsigned int success_count; - unsigned int enoent_count; - uint32_t open_fd_count; - gf_boolean_t update_open_fd_count; + /* @event_generation: copy of priv->event_generation taken at the + time of starting the transaction. The copy is made so that we + have a stable value through the various phases of the transaction. + */ + unsigned int event_generation; - unsigned int unhealable; - - unsigned int read_child_index; - unsigned char read_child_returned; - unsigned int first_up_child; + uint32_t open_fd_count; + gf_boolean_t update_open_fd_count; gf_lkowner_t saved_lk_owner; @@ -472,54 +310,111 @@ typedef struct _afr_local { int32_t **pending; + int dirty[AFR_NUM_CHANGE_LOGS]; + loc_t loc; loc_t newloc; fd_t *fd; + afr_fd_ctx_t *fd_ctx; - glusterfs_fop_t fop; - + /* @child_up: copy of priv->child_up taken at the time of transaction + start. The copy is taken so that we have a stable child_up array + through the phases of the transaction as priv->child_up[i] can keep + changing through time. + */ unsigned char *child_up; - int32_t *fresh_children; //in the order of response - int32_t *child_errno; + /* @read_attempted: + array of flags representing subvolumes where read operations of + the read transaction have already been attempted. The array is + first pre-filled with down subvolumes, and as reads are performed + on other subvolumes, those are set as well. This way if the read + operation fails we do not retry on that subvolume again. + */ + unsigned char *read_attempted; + + /* @readfn: - dict_t *xattr_req; + pointer to function which will perform the read operation on a given + subvolume. Used in read transactions. + */ - int32_t inodelk_count; - int32_t entrylk_count; + afr_read_txn_wind_t readfn; - afr_internal_lock_t internal_lock; + /* @refreshed: - afr_locked_fd_t *locked_fd; - int32_t source_child; - int32_t lock_recovery_child; + the inode was "refreshed" (i.e, pending xattrs from all subvols + freshly inspected and inode ctx updated accordingly) as part of + this transaction already. + */ + gf_boolean_t refreshed; + + /* @inode: + + the inode on which the read txn is performed on. ref'ed and copied + from either fd->inode or loc.inode + */ + + inode_t *inode; + + /* @parent[2]: + + parent inode[s] on which directory transactions are performed. + */ + + inode_t *parent; + inode_t *parent2; + + /* @readable: + + array of flags representing servers from which a read can be + performed. This is the output of afr_inode_refresh() + */ + unsigned char *readable; + + afr_inode_refresh_cbk_t refreshfn; + + /* @refreshinode: + + Inode currently getting refreshed. + */ + inode_t *refreshinode; + + /* + @pre_op_compat: + + compatibility mode of pre-op. send a separate pre-op and + op operations as part of transaction, rather than combining + */ + + gf_boolean_t pre_op_compat; + + dict_t *xattr_req; + + afr_internal_lock_t internal_lock; dict_t *dict; + int optimistic_change_log; gf_boolean_t delayed_post_op; - /* Is the current writev() going to perform a stable write? i.e, is fd->flags or @flags writev param have O_SYNC or O_DSYNC? */ - gf_boolean_t stable_write; - - /* This write appended to the file. Nnot necessarily O_APPEND, - just means the offset of write was at the end of file. - */ - gf_boolean_t append_write; - - int attempt_self_heal; - int foreground_self_heal; + gf_boolean_t stable_write; + /* This write appended to the file. Nnot necessarily O_APPEND, + just means the offset of write was at the end of file. + */ + gf_boolean_t append_write; - /* This struct contains the arguments for the "continuation" - (scheme-like) of fops + /* + This struct contains the arguments for the "continuation" + (scheme-like) of fops */ - int op; struct { struct { unsigned char buf_set; @@ -527,24 +422,6 @@ typedef struct _afr_local { } statfs; struct { - uint32_t parent_entrylk; - uuid_t gfid_req; - inode_t *inode; - struct iatt buf; - struct iatt postparent; - dict_t **xattrs; - dict_t *xattr; - struct iatt *postparents; - struct iatt *bufs; - int32_t read_child; - int32_t *sources; - int32_t *success_children; - int32_t **pending_matrix; - gf_boolean_t fresh_lookup; - gf_boolean_t possible_spb; - } lookup; - - struct { int32_t flags; } open; @@ -737,22 +614,67 @@ typedef struct _afr_local { afr_transaction_type type; - /* pre-compute the post piggyback status before - entering POST-OP phase - */ - int *postop_piggybacked; - /* stub to resume on destruction of the transaction frame */ call_stub_t *resume_stub; struct list_head eager_locked; - int32_t **txn_changelog;//changelog after pre+post ops unsigned char *pre_op; + /* @fop_subvols: subvolumes on which FOP will be attempted */ + unsigned char *fop_subvols; + + /* @failed_subvols: subvolumes on which FOP failed. Always + a subset of @fop_subvols */ + unsigned char *failed_subvols; + + /* @dirtied: flag which indicates whether we set dirty flag + in the OP. Typically true when we are performing operation + on more than one subvol and optimistic changelog is disabled + + A 'true' value set in @dirtied flag means an 'undirtying' + has to be done in POST-OP phase. + */ + gf_boolean_t dirtied; + + /* @inherited: flag which indicates that the dirty flags + of the previous transaction were inherited + */ + gf_boolean_t inherited; + + /* + @no_uninherit: flag which indicates that a pre_op_uninherit() + must _not_ be attempted (and returned as failure) always. This + flag is set when a hard pre-op is performed, but not accounted + for it in fd_ctx->on_disk[]. Such transactions are "isolated" + from the pre-op piggybacking entirely and therefore uninherit + must not be attempted. + */ + gf_boolean_t no_uninherit; + + /* @uninherit_done: + @uninherit_value: + + The above pair variables make pre_op_uninherit() idempotent. + Both are FALSE initially. The first call to pre_op_uninherit + sets @uninherit_done to TRUE and the return value to + @uninherit_value. Further calls will check for @uninherit_done + to be TRUE and if so will simply return @uninherit_value. + */ + gf_boolean_t uninherit_done; + gf_boolean_t uninherit_value; + + /* @changelog_resume: function to be called after changlogging + (either pre-op or post-op) is done + */ + + afr_changelog_resume_t changelog_resume; + call_frame_t *main_frame; + int (*wind) (call_frame_t *frame, xlator_t *this, int subvol); + int (*fop) (call_frame_t *frame, xlator_t *this); int (*done) (call_frame_t *frame, xlator_t *this); @@ -764,7 +686,7 @@ typedef struct _afr_local { /* post-op hook */ } transaction; - afr_self_heal_t self_heal; + syncbarrier_t barrier; struct marker_str marker; @@ -778,75 +700,58 @@ typedef struct _afr_local { struct afr_reply *replies; } afr_local_t; -typedef enum { - AFR_FD_NOT_OPENED, - AFR_FD_OPENED, - AFR_FD_OPENING -} afr_fd_open_status_t; - -typedef struct { - unsigned int *pre_op_done; - afr_fd_open_status_t *opened_on; /* which subvolumes the fd is open on */ - unsigned int *pre_op_piggyback; - - unsigned int *lock_piggyback; - unsigned int *lock_acquired; - - int flags; - uint64_t up_count; /* number of CHILD_UPs this fd has seen */ - uint64_t down_count; /* number of CHILD_DOWNs this fd has seen */ - - int32_t last_tried; - - int hit, miss; - gf_boolean_t failed_over; - struct list_head entries; /* needed for readdir failover */ - - unsigned char *locked_on; /* which subvolumes locks have been successful */ - - /* used for delayed-post-op optimization */ - pthread_mutex_t delay_lock; - gf_timer_t *delay_timer; - call_frame_t *delay_frame; - int call_child; - - /* set if any write on this fd was a non stable write - (i.e, without O_SYNC or O_DSYNC) - */ - gf_boolean_t witnessed_unstable_write; - - /* list of frames currently in progress */ - struct list_head eager_locked; -} afr_fd_ctx_t; - - -/* try alloc and if it fails, goto label */ -#define AFR_LOCAL_ALLOC_OR_GOTO(var, label) do { \ - var = mem_get0 (THIS->local_pool); \ - if (!var) { \ - gf_log (this->name, GF_LOG_ERROR, \ - "out of memory :("); \ - op_errno = ENOMEM; \ - goto label; \ - } \ - } while (0); - /* did a call fail due to a child failing? */ #define child_went_down(op_ret, op_errno) (((op_ret) < 0) && \ ((op_errno == ENOTCONN) || \ (op_errno == EBADFD))) -#define afr_fop_failed(op_ret, op_errno) ((op_ret) == -1) +int +afr_inode_read_subvol_get (inode_t *inode, xlator_t *this, + unsigned char *data_subvols, + unsigned char *metadata_subvols, + int *event_generation); +int +__afr_inode_read_subvol_get (inode_t *inode, xlator_t *this, + unsigned char *data_subvols, + unsigned char *metadata_subvols, + int *event_generation); -/* have we tried all children? */ -#define all_tried(i, count) ((i) == (count) - 1) +int +__afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, + unsigned char *data_subvols, + unsigned char *metadata_subvol, + int event_generation); +int +afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, + unsigned char *data_subvols, + unsigned char *metadata_subvols, + int event_generation); -int32_t -afr_set_dict_gfid (dict_t *dict, uuid_t gfid); +int +afr_inode_read_subvol_reset (inode_t *inode, xlator_t *this); int -pump_command_reply (call_frame_t *frame, xlator_t *this); +afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this, + unsigned char *readable); + +int +afr_inode_read_subvol_type_get (inode_t *inode, xlator_t *this, + unsigned char *readable, int *event_p, + int type); +int +afr_read_subvol_get (inode_t *inode, xlator_t *this, int *subvol_p, + int *event_p, afr_transaction_type type); + +#define afr_data_subvol_get(i, t, s, e) \ + afr_read_subvol_get(i, t, s, e, AFR_DATA_TRANSACTION) + +#define afr_metadata_subvol_get(i, t, s, e) \ + afr_read_subvol_get(i, t, s, e, AFR_METADATA_TRANSACTION) + +int +afr_inode_refresh (call_frame_t *frame, xlator_t *this, inode_t *inode, + afr_inode_refresh_cbk_t cbk); int32_t afr_notify (xlator_t *this, int32_t event, void *data, void *data2); @@ -862,9 +767,6 @@ int afr_attempt_lock_recovery (xlator_t *this, int32_t child_index); int -afr_save_locked_fd (xlator_t *this, fd_t *fd); - -int afr_mark_locked_nodes (xlator_t *this, fd_t *fd, unsigned char *locked_nodes); @@ -874,10 +776,6 @@ afr_set_lk_owner (call_frame_t *frame, xlator_t *this, void *lk_owner); int afr_set_lock_number (call_frame_t *frame, xlator_t *this); - -loc_t * -lower_path (loc_t *l1, const char *b1, loc_t *l2, const char *b2); - int32_t afr_unlock (call_frame_t *frame, xlator_t *this); @@ -897,42 +795,26 @@ int afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom, unsigned int child_count); -int pump_start (call_frame_t *frame, xlator_t *this); - int __afr_fd_ctx_set (xlator_t *this, fd_t *fd); int afr_fd_ctx_set (xlator_t *this, fd_t *fd); -int32_t -afr_inode_get_read_ctx (xlator_t *this, inode_t *inode, int32_t *fresh_children); - -void -afr_inode_set_read_ctx (xlator_t *this, inode_t *inode, int32_t read_child, - int32_t *fresh_children); +afr_fd_ctx_t * +afr_fd_ctx_get (fd_t *fd, xlator_t *this); int afr_build_parent_loc (loc_t *parent, loc_t *child, int32_t *op_errno); -unsigned int -afr_up_children_count (unsigned char *child_up, unsigned int child_count); - -unsigned int -afr_locked_children_count (unsigned char *children, unsigned int child_count); - -unsigned int -afr_pre_op_done_children_count (unsigned char *pre_op, - unsigned int child_count); +int +afr_locked_nodes_count (unsigned char *locked_nodes, int child_count); -gf_boolean_t -afr_is_fresh_lookup (loc_t *loc, xlator_t *this); +int +afr_replies_interpret (call_frame_t *frame, xlator_t *this, inode_t *inode); void -afr_update_loc_gfids (loc_t *loc, struct iatt *buf, struct iatt *postparent); - -int -afr_locked_nodes_count (unsigned char *locked_nodes, int child_count); +afr_replies_wipe (afr_local_t *local, afr_private_t *priv); void afr_local_cleanup (afr_local_t *local, xlator_t *this); @@ -940,32 +822,16 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this); int afr_frame_return (call_frame_t *frame); -gf_boolean_t -afr_is_split_brain (xlator_t *this, inode_t *inode); - -void -afr_set_split_brain (xlator_t *this, inode_t *inode, afr_spb_state_t mdata_spb, - afr_spb_state_t data_spb); - int afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, fd_t *fd, dict_t *xdata); void -afr_set_opendir_done (xlator_t *this, inode_t *inode); - -gf_boolean_t -afr_is_opendir_done (xlator_t *this, inode_t *inode); - -void afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this); int afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd); -int -afr_launch_openfd_self_heal (call_frame_t *frame, xlator_t *this, fd_t *fd); - #define AFR_STACK_UNWIND(fop, frame, params ...) \ do { \ afr_local_t *__local = NULL; \ @@ -996,7 +862,16 @@ afr_launch_openfd_self_heal (call_frame_t *frame, xlator_t *this, fd_t *fd); } \ } while (0); -#define AFR_NUM_CHANGE_LOGS 3 /*data + metadata + entry*/ +#define AFR_FRAME_INIT(frame, op_errno) \ + ({frame->local = mem_get0 (THIS->local_pool); \ + if (afr_local_init (frame->local, THIS->private, &op_errno)) { \ + afr_local_cleanup (frame->local, THIS); \ + mem_put (frame->local); \ + frame->local = NULL; }; \ + frame->local;}) + +#define AFR_STACK_RESET(frame) do { int opr; STACK_RESET (frame->root); AFR_FRAME_INIT(frame, opr);} while (0) + /* allocate and return a string that is the basename of argument */ static inline char * AFR_BASENAME (const char *str) @@ -1009,6 +884,9 @@ AFR_BASENAME (const char *str) return __basename_str; } +call_frame_t * +afr_copy_frame (call_frame_t *base); + int afr_transaction_local_init (afr_local_t *local, xlator_t *this); @@ -1016,9 +894,6 @@ int32_t afr_marker_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name,afr_local_t *local, afr_private_t *priv ); -int32_t * -afr_children_create (int32_t child_count); - int afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno); @@ -1027,101 +902,20 @@ afr_internal_lock_init (afr_internal_lock_t *lk, size_t child_count, transaction_lk_type_t lk_type); int -afr_first_up_child (unsigned char *child_up, size_t child_count); +afr_higher_errno (int32_t old_errno, int32_t new_errno); int -afr_select_read_child_from_policy (int32_t *fresh_children, int32_t child_count, - int32_t prev_read_child, - int32_t config_read_child, int32_t *sources, - unsigned int hmode, uuid_t gfid); +afr_final_errno (afr_local_t *local, afr_private_t *priv); -void -afr_set_read_ctx_from_policy (xlator_t *this, inode_t *inode, - int32_t *fresh_children, int32_t prev_read_child, - int32_t config_read_child, uuid_t gfid); - -int32_t -afr_get_call_child (xlator_t *this, unsigned char *child_up, int32_t read_child, - int32_t *fresh_children, - int32_t *call_child, int32_t *last_index); - -int32_t -afr_next_call_child (int32_t *fresh_children, unsigned char *child_up, - size_t child_count, int32_t *last_index, - int32_t read_child); -void -afr_get_fresh_children (int32_t *success_children, int32_t *sources, - int32_t *children, unsigned int child_count); -void -afr_children_add_child (int32_t *children, int32_t child, - int32_t child_count); -void -afr_children_rm_child (int32_t *children, int32_t child, - int32_t child_count); -void -afr_reset_children (int32_t *children, int32_t child_count); -int32_t -afr_most_important_error(int32_t old_errno, int32_t new_errno, - gf_boolean_t eio); int -afr_errno_count (int32_t *children, int *child_errno, - unsigned int child_count, int32_t op_errno); -int -afr_get_children_count (int32_t *children, unsigned int child_count); -gf_boolean_t -afr_is_child_present (int32_t *success_children, int32_t child_count, - int32_t child); -void -afr_update_gfid_from_iatts (uuid_t uuid, struct iatt *bufs, - int32_t *success_children, - unsigned int child_count); -void -afr_reset_xattr (dict_t **xattr, unsigned int child_count); -gf_boolean_t -afr_conflicting_iattrs (struct iatt *bufs, int32_t *success_children, - unsigned int child_count, const char *path, - const char *xlator_name); -unsigned int -afr_gfid_missing_count (const char *xlator_name, int32_t *children, - struct iatt *bufs, unsigned int child_count, - const char *path); -void -afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req, const char *path); -void -afr_children_copy (int32_t *dst, int32_t *src, unsigned int child_count); -afr_transaction_type -afr_transaction_type_get (ia_type_t ia_type); -int32_t -afr_resultant_errno_get (int32_t *children, - int *child_errno, unsigned int child_count); -void -afr_inode_rm_stale_children (xlator_t *this, inode_t *inode, - int32_t *stale_children); -void -afr_launch_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode, - gf_boolean_t background, ia_type_t ia_type, char *reason, - void (*gfid_sh_success_cbk) (call_frame_t *sh_frame, - xlator_t *this), - int (*unwind) (call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno, - int32_t sh_failed)); -void -afr_fix_open (xlator_t *this, fd_t *fd, size_t need_open_count, int *need_open); +afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req); void -afr_open_fd_fix (fd_t *fd, xlator_t *this); -int -afr_set_elem_count_get (unsigned char *elems, int child_count); +afr_fix_open (fd_t *fd, xlator_t *this); afr_fd_ctx_t * afr_fd_ctx_get (fd_t *fd, xlator_t *this); -gf_boolean_t -afr_open_only_data_self_heal (char *data_self_heal); - -gf_boolean_t -afr_data_self_heal_enabled (char *data_self_heal); - void afr_set_low_priority (call_frame_t *frame); int @@ -1137,22 +931,9 @@ afr_matrix_cleanup (int32_t **pending, unsigned int m); int32_t** afr_matrix_create (unsigned int m, unsigned int n); -gf_boolean_t -afr_is_errno_set (int *child_errno, int child); - -gf_boolean_t -afr_is_errno_unset (int *child_errno, int child); - -gf_boolean_t -afr_is_fd_fixable (fd_t *fd); - void -afr_prepare_new_entry_pending_matrix (int32_t **pending, - gf_boolean_t (*is_pending) (int *, int), - int *ctx, struct iatt *buf, - unsigned int child_count); -void -afr_xattr_array_destroy (dict_t **xattr, unsigned int child_count); +afr_filter_xattrs (dict_t *xattr); + /* * Special value indicating we should use the "auto" quorum method instead of * a fixed value (including zero to turn off quorum enforcement). @@ -1172,28 +953,6 @@ afr_xattr_array_destroy (dict_t **xattr, unsigned int child_count); } \ } while (0); - -#define AFR_SBRAIN_MSG "Failed on %s as split-brain is seen. Returning EIO." - -#define AFR_SBRAIN_CHECK_FD(fd, label) do { \ - if (fd->inode && afr_is_split_brain (this, fd->inode)) { \ - op_errno = EIO; \ - gf_log (this->name, GF_LOG_WARNING, \ - AFR_SBRAIN_MSG ,uuid_utoa (fd->inode->gfid)); \ - goto label; \ - } \ -} while (0) - -#define AFR_SBRAIN_CHECK_LOC(loc, label) do { \ - if (loc->inode && afr_is_split_brain (this, loc->inode)) { \ - op_errno = EIO; \ - loc_path (loc, NULL); \ - gf_log (this->name, GF_LOG_WARNING, \ - AFR_SBRAIN_MSG , loc->path); \ - goto label; \ - } \ -} while (0) - int afr_fd_report_unstable_write (xlator_t *this, fd_t *fd); @@ -1209,7 +968,7 @@ afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count); void afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this); -afr_inode_ctx_t* -afr_inode_ctx_get (inode_t *inode, xlator_t *this); +int +afr_local_pathinfo (char *pathinfo, gf_boolean_t *is_local); #endif /* __AFR_H__ */ diff --git a/xlators/cluster/afr/src/pump.c b/xlators/cluster/afr/src/pump.c index 987696e5583..eed5099563b 100644 --- a/xlators/cluster/afr/src/pump.c +++ b/xlators/cluster/afr/src/pump.c @@ -21,6 +21,120 @@ #include "afr-common.c" #include "defaults.c" #include "glusterfs.h" +#include "pump.h" + + +static int +afr_set_dict_gfid (dict_t *dict, uuid_t gfid) +{ + int ret = 0; + uuid_t *pgfid = NULL; + + GF_ASSERT (gfid); + + pgfid = GF_CALLOC (1, sizeof (uuid_t), gf_common_mt_char); + if (!pgfid) { + ret = -1; + goto out; + } + + uuid_copy (*pgfid, gfid); + + ret = dict_set_dynptr (dict, "gfid-req", pgfid, sizeof (uuid_t)); + if (ret) + gf_log (THIS->name, GF_LOG_ERROR, "gfid set failed"); + +out: + if (ret && pgfid) + GF_FREE (pgfid); + return ret; +} + +static int +afr_set_root_gfid (dict_t *dict) +{ + uuid_t gfid; + int ret = 0; + + memset (gfid, 0, 16); + gfid[15] = 1; + + ret = afr_set_dict_gfid (dict, gfid); + + return ret; +} + +static int +afr_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name) +{ + int ret = -1; + uuid_t pargfid = {0}; + + if (!child) + goto out; + + if (!uuid_is_null (parent->inode->gfid)) + uuid_copy (pargfid, parent->inode->gfid); + else if (!uuid_is_null (parent->gfid)) + uuid_copy (pargfid, parent->gfid); + + if (uuid_is_null (pargfid)) + goto out; + + if (strcmp (parent->path, "/") == 0) + ret = gf_asprintf ((char **)&child->path, "/%s", name); + else + ret = gf_asprintf ((char **)&child->path, "%s/%s", parent->path, + name); + + if (-1 == ret) { + gf_log (this->name, GF_LOG_ERROR, + "asprintf failed while setting child path"); + } + + child->name = strrchr (child->path, '/'); + if (child->name) + child->name++; + + child->parent = inode_ref (parent->inode); + child->inode = inode_new (parent->inode->table); + uuid_copy (child->pargfid, pargfid); + + if (!child->inode) { + ret = -1; + goto out; + } + + ret = 0; +out: + if ((ret == -1) && child) + loc_wipe (child); + + return ret; +} + +static void +afr_build_root_loc (xlator_t *this, loc_t *loc) +{ + afr_private_t *priv = NULL; + + priv = this->private; + loc->path = gf_strdup ("/"); + loc->name = ""; + loc->inode = inode_ref (priv->root_inode); + uuid_copy (loc->gfid, loc->inode->gfid); +} + +static void +afr_update_loc_gfids (loc_t *loc, struct iatt *buf, struct iatt *postparent) +{ + GF_ASSERT (loc); + GF_ASSERT (buf); + + uuid_copy (loc->gfid, buf->ia_gfid); + if (postparent) + uuid_copy (loc->pargfid, postparent->ia_gfid); +} static uint64_t pump_pid = 0; static inline void @@ -387,54 +501,68 @@ gf_pump_traverse_directory (loc_t *loc) if (ret) goto out; - if (!IS_ENTRY_CWD (entry->d_name) && - !IS_ENTRY_PARENT (entry->d_name)) { - - is_directory_empty = _gf_false; - gf_log (this->name, GF_LOG_DEBUG, - "lookup %s => %"PRId64, - entry_loc.path, - iatt.ia_ino); - - ret = syncop_lookup (this, &entry_loc, NULL, - &iatt, &xattr_rsp, &parent); - - if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "%s: lookup failed", - entry_loc.path); - continue; - } - pump_fill_loc_info (&entry_loc, &iatt, - &parent); - - pump_update_resume_state (this, entry_loc.path); - - pump_save_path (this, entry_loc.path); - pump_save_file_stats (this, entry_loc.path); - - ret = pump_check_and_update_status (this); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "Pump beginning to exit out"); - goto out; - } - - if (IA_ISDIR (iatt.ia_type)) { - if (is_pump_traversal_allowed (this, entry_loc.path)) { - gf_log (this->name, GF_LOG_TRACE, - "entering dir=%s", - entry->d_name); - gf_pump_traverse_directory (&entry_loc); - } - } + if ((strcmp (entry->d_name, ".") == 0) || + (strcmp (entry->d_name, "..") == 0)) + continue; + + is_directory_empty = _gf_false; + gf_log (this->name, GF_LOG_DEBUG, + "lookup %s => %"PRId64, + entry_loc.path, + iatt.ia_ino); + + ret = syncop_lookup (this, &entry_loc, NULL, &iatt, + &xattr_rsp, &parent); + + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "%s: lookup failed", entry_loc.path); + continue; + } + + ret = afr_selfheal_name (this, loc->gfid, entry->d_name); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "%s: name self-heal failed (%s/%s)", + entry_loc.path, uuid_utoa (loc->gfid), + entry->d_name); + continue; + } + + ret = afr_selfheal (this, iatt.ia_gfid); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "%s: self-heal failed (%s)", + entry_loc.path, uuid_utoa (iatt.ia_gfid)); + continue; + } + + pump_fill_loc_info (&entry_loc, &iatt, &parent); + + pump_update_resume_state (this, entry_loc.path); + + pump_save_path (this, entry_loc.path); + pump_save_file_stats (this, entry_loc.path); + + ret = pump_check_and_update_status (this); + if (ret < 0) { + gf_log (this->name, GF_LOG_DEBUG, + "Pump beginning to exit out"); + goto out; + } + + if (IA_ISDIR (iatt.ia_type)) { + if (is_pump_traversal_allowed (this, entry_loc.path)) { + gf_log (this->name, GF_LOG_TRACE, + "entering dir=%s", entry->d_name); + gf_pump_traverse_directory (&entry_loc); + } } } gf_dirent_free (&entries); free_entries = _gf_false; - gf_log (this->name, GF_LOG_TRACE, - "offset incremented to %d", + gf_log (this->name, GF_LOG_TRACE, "offset incremented to %d", (int32_t ) offset); } @@ -443,7 +571,7 @@ gf_pump_traverse_directory (loc_t *loc) if (ret < 0) gf_log (this->name, GF_LOG_DEBUG, "closing the fd failed"); - if (is_directory_empty && IS_ROOT_PATH (loc->path)) { + if (is_directory_empty && (strcmp (loc->path, "/") == 0)) { pump_change_state (this, PUMP_STATE_RUNNING); gf_log (this->name, GF_LOG_INFO, "Empty source brick. " "Nothing to be done."); @@ -1277,128 +1405,16 @@ out: } -struct _xattr_key { - char *key; - struct list_head list; -}; - -static int -__gather_xattr_keys (dict_t *dict, char *key, data_t *value, - void *data) -{ - struct list_head * list = data; - struct _xattr_key * xkey = NULL; - - if (!strncmp (key, AFR_XATTR_PREFIX, - strlen (AFR_XATTR_PREFIX))) { - - xkey = GF_CALLOC (1, sizeof (*xkey), gf_afr_mt_xattr_key); - if (!xkey) - return -1; - - xkey->key = key; - INIT_LIST_HEAD (&xkey->list); - - list_add_tail (&xkey->list, list); - } - return 0; -} - -static void -__filter_xattrs (dict_t *dict) -{ - struct list_head keys; - - struct _xattr_key *key; - struct _xattr_key *tmp; - - INIT_LIST_HEAD (&keys); - - dict_foreach (dict, __gather_xattr_keys, - (void *) &keys); - - list_for_each_entry_safe (key, tmp, &keys, list) { - dict_del (dict, key->key); - - list_del_init (&key->list); - - GF_FREE (key); - } -} - -int32_t -pump_getxattr_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *dict, dict_t *xdata) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - xlator_t **children = NULL; - int unwind = 1; - int32_t *last_index = NULL; - int32_t next_call_child = -1; - int32_t read_child = -1; - int32_t *fresh_children = NULL; - - - priv = this->private; - children = priv->children; - - local = frame->local; - - read_child = (long) cookie; - - if (op_ret == -1) { - last_index = &local->cont.getxattr.last_index; - fresh_children = local->fresh_children; - next_call_child = afr_next_call_child (fresh_children, - local->child_up, - priv->child_count, - last_index, read_child); - if (next_call_child < 0) - goto out; - - unwind = 0; - STACK_WIND_COOKIE (frame, pump_getxattr_cbk, - (void *) (long) read_child, - children[next_call_child], - children[next_call_child]->fops->getxattr, - &local->loc, - local->cont.getxattr.name, NULL); - } - -out: - if (unwind) { - if (op_ret >= 0 && dict) - __filter_xattrs (dict); - - AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, NULL); - } - - return 0; -} - -int32_t -pump_getxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name, dict_t *xdata) +int +pump_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) { - afr_private_t * priv = NULL; - xlator_t ** children = NULL; - int call_child = 0; - afr_local_t *local = NULL; - int32_t ret = -1; - int32_t op_errno = 0; - uint64_t read_child = 0; - - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + afr_private_t *priv = NULL; + int op_errno = 0; + int ret = 0; - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); + priv = this->private; - children = priv->children; if (!priv->use_afr_in_pump) { STACK_WIND (frame, default_getxattr_cbk, FIRST_CHILD (this), @@ -1407,14 +1423,6 @@ pump_getxattr (call_frame_t *frame, xlator_t *this, return 0; } - - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; - if (name) { if (!strncmp (name, AFR_XATTR_PREFIX, strlen (AFR_XATTR_PREFIX))) { @@ -1432,32 +1440,7 @@ pump_getxattr (call_frame_t *frame, xlator_t *this, } } - local->fresh_children = GF_CALLOC (priv->child_count, - sizeof (*local->fresh_children), - gf_afr_mt_int32_t); - if (!local->fresh_children) { - ret = -1; - op_errno = ENOMEM; - goto out; - } - - read_child = afr_inode_get_read_ctx (this, loc->inode, local->fresh_children); - ret = afr_get_call_child (this, local->child_up, read_child, - local->fresh_children, - &call_child, - &local->cont.getxattr.last_index); - if (ret < 0) { - op_errno = -ret; - goto out; - } - loc_copy (&local->loc, loc); - if (name) - local->cont.getxattr.name = gf_strdup (name); - - STACK_WIND_COOKIE (frame, pump_getxattr_cbk, - (void *) (long) call_child, - children[call_child], children[call_child]->fops->getxattr, - loc, name, xdata); + afr_getxattr (frame, this, loc, name, xdata); ret = 0; out: @@ -1466,134 +1449,6 @@ out: return 0; } -static int -afr_setxattr_unwind (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = NULL; - call_frame_t *main_frame = NULL; - - local = frame->local; - - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (setxattr, main_frame, - local->op_ret, local->op_errno, NULL); - } - return 0; -} - -static int -afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - int call_count = -1; - int need_unwind = 0; - - local = frame->local; - priv = this->private; - - LOCK (&frame->lock); - { - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - } - local->success_count++; - - if (local->success_count == priv->child_count) { - need_unwind = 1; - } - } - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - if (need_unwind) - local->transaction.unwind (frame, this); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.resume (frame, this); - } - - return 0; -} - -static int -afr_setxattr_wind (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - int call_count = -1; - int i = 0; - - local = frame->local; - priv = this->private; - - call_count = afr_up_children_count (local->child_up, priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_setxattr_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->setxattr, - &local->loc, - local->cont.setxattr.dict, - local->cont.setxattr.flags, NULL); - - if (!--call_count) - break; - } - } - - return 0; -} - - -static int -afr_setxattr_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - - return 0; -} - -int32_t -pump_setxattr_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, dict_t *xdata) -{ - AFR_STACK_UNWIND (setxattr, frame, op_ret, op_errno, xdata); - return 0; -} - int pump_command_reply (call_frame_t *frame, xlator_t *this) { @@ -1617,51 +1472,56 @@ pump_command_reply (call_frame_t *frame, xlator_t *this) } int -pump_parse_command (call_frame_t *frame, xlator_t *this, - afr_local_t *local, dict_t *dict) +pump_parse_command (call_frame_t *frame, xlator_t *this, dict_t *dict, + int *op_errno_p) { - + afr_local_t *local = NULL; int ret = -1; + int op_errno = 0; if (pump_command_start (this, dict)) { - frame->local = local; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; local->dict = dict_ref (dict); ret = pump_execute_start (frame, this); } else if (pump_command_pause (this, dict)) { - frame->local = local; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; local->dict = dict_ref (dict); ret = pump_execute_pause (frame, this); } else if (pump_command_abort (this, dict)) { - frame->local = local; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; local->dict = dict_ref (dict); ret = pump_execute_abort (frame, this); } else if (pump_command_commit (this, dict)) { - frame->local = local; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; local->dict = dict_ref (dict); ret = pump_execute_commit (frame, this); } +out: + if (op_errno_p) + *op_errno_p = op_errno; return ret; } int -pump_setxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *dict, int32_t flags, dict_t *xdata) +pump_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int32_t flags, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t *transaction_frame = NULL; + afr_private_t *priv = NULL; int ret = -1; int op_errno = 0; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.pump*", dict, - op_errno, out); + GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.pump*", dict, op_errno, out); priv = this->private; if (!priv->use_afr_in_pump) { @@ -1672,57 +1532,15 @@ pump_setxattr (call_frame_t *frame, xlator_t *this, return 0; } - - AFR_LOCAL_ALLOC_OR_GOTO (local, out); - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) { - afr_local_cleanup (local, this); - mem_put (local); - goto out; - } - - ret = pump_parse_command (frame, this, - local, dict); - if (ret >= 0) { - ret = 0; + ret = pump_parse_command (frame, this, dict, &op_errno); + if (ret >= 0) goto out; - } - - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - op_errno = ENOMEM; - ret = -1; - afr_local_cleanup (local, this); - goto out; - } - - transaction_frame->local = local; - - local->op_ret = -1; - - local->cont.setxattr.dict = dict_ref (dict); - local->cont.setxattr.flags = flags; - - local->transaction.fop = afr_setxattr_wind; - local->transaction.done = afr_setxattr_done; - local->transaction.unwind = afr_setxattr_unwind; - - loc_copy (&local->loc, loc); - - local->transaction.main_frame = frame; - local->transaction.start = LLONG_MAX - 1; - local->transaction.len = 0; - afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + afr_setxattr (frame, this, loc, dict, flags, xdata); ret = 0; out: if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); } @@ -2416,10 +2234,6 @@ init (xlator_t *this) goto out; LOCK_INIT (&priv->lock); - LOCK_INIT (&priv->read_child_lock); - //lock recovery is not done in afr - pthread_mutex_init (&priv->mutex, NULL); - INIT_LIST_HEAD (&priv->saved_fds); child_count = xlator_subvolume_count (this); if (child_count != 2) { @@ -2453,8 +2267,6 @@ init (xlator_t *this) and the sink. */ - priv->strict_readdir = _gf_false; - priv->wait_count = 1; priv->child_up = GF_CALLOC (sizeof (unsigned char), child_count, gf_afr_mt_char); if (!priv->child_up) { @@ -2508,7 +2320,6 @@ init (xlator_t *this) goto out; } - priv->first_lookup = 1; priv->root_inode = NULL; priv->last_event = GF_CALLOC (child_count, sizeof (*priv->last_event), @@ -2579,7 +2390,6 @@ out: GF_FREE (priv->pending_key); GF_FREE (priv->last_event); LOCK_DESTROY (&priv->lock); - LOCK_DESTROY (&priv->read_child_lock); GF_FREE (priv); } diff --git a/xlators/cluster/afr/src/pump.h b/xlators/cluster/afr/src/pump.h index bc4c31a78a5..9d0b6db6a5e 100644 --- a/xlators/cluster/afr/src/pump.h +++ b/xlators/cluster/afr/src/pump.h @@ -75,4 +75,7 @@ pump_command_status (xlator_t *this, dict_t *dict); int pump_execute_status (call_frame_t *frame, xlator_t *this); +int +pump_command_reply (call_frame_t *frame, xlator_t *this); + #endif /* __PUMP_H__ */ diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c index 3055f4615cf..3868fc38fd5 100644 --- a/xlators/cluster/dht/src/dht-common.c +++ b/xlators/cluster/dht/src/dht-common.c @@ -3120,7 +3120,7 @@ dht_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, /* making sure we set the inode ctx right with layout, currently possible only for non-directories, so for directories don't set entry inodes */ - if (!IA_ISDIR(entry->d_stat.ia_type)) { + if (!IA_ISDIR(entry->d_stat.ia_type) && orig_entry->inode) { ret = dht_layout_preset (this, prev->this, orig_entry->inode); if (ret) diff --git a/xlators/cluster/stripe/src/stripe.c b/xlators/cluster/stripe/src/stripe.c index 32d53e8e6e2..79e80b51381 100644 --- a/xlators/cluster/stripe/src/stripe.c +++ b/xlators/cluster/stripe/src/stripe.c @@ -4886,7 +4886,7 @@ unlock: if (!local_entry) break; - if (!IA_ISREG (local_entry->d_stat.ia_type)) { + if (!IA_ISREG (local_entry->d_stat.ia_type) || !local_entry->inode) { LOCK (&frame->lock); { local->wind_count--; diff --git a/xlators/features/index/src/index.c b/xlators/features/index/src/index.c index 5edfeda8f30..5c1c65fbd21 100644 --- a/xlators/features/index/src/index.c +++ b/xlators/features/index/src/index.c @@ -15,11 +15,9 @@ #include "index.h" #include "options.h" #include "glusterfs3-xdr.h" -#include "syncop.h" #include "syscall.h" #define XATTROP_SUBDIR "xattrop" -#define BASE_INDICES_HOLDER_SUBDIR "base_indices_holder" call_stub_t * __index_dequeue (struct list_head *callstubs) @@ -245,40 +243,20 @@ check_delete_stale_index_file (xlator_t *this, char *filename) { int ret = 0; struct stat st = {0}; - struct stat base_index_st = {0}; char filepath[PATH_MAX] = {0}; - char filepath_under_base_indices_holder[PATH_MAX] = {0}; index_priv_t *priv = NULL; priv = this->private; - if (priv->to_be_healed_states != synced_state) - return; - make_file_path (priv->index_basepath, XATTROP_SUBDIR, filename, filepath, sizeof (filepath)); - - make_file_path (priv->index_basepath, BASE_INDICES_HOLDER_SUBDIR, - filename, filepath_under_base_indices_holder, - sizeof (filepath_under_base_indices_holder)); - - - ret = stat (filepath_under_base_indices_holder, &base_index_st); - if (ret) { - gf_log (THIS->name, GF_LOG_ERROR, "Base index is not created" - " under index/base_indices_holder"); - return; - } - ret = stat (filepath, &st); - if (!ret && st.st_nlink == 2) { + if (!ret && st.st_nlink == 1) unlink (filepath); - unlink (filepath_under_base_indices_holder); - } } static int index_fill_readdir (fd_t *fd, DIR *dir, off_t off, - size_t size, gf_dirent_t *entries, readdir_directory type) + size_t size, gf_dirent_t *entries) { off_t in_case = -1; size_t filled = 0; @@ -321,8 +299,7 @@ index_fill_readdir (fd_t *fd, DIR *dir, off_t off, } if (!strncmp (entry->d_name, XATTROP_SUBDIR"-", - strlen (XATTROP_SUBDIR"-")) && - (type == INDEX_XATTROP)) { + strlen (XATTROP_SUBDIR"-"))) { check_delete_stale_index_file (this, entry->d_name); continue; } @@ -361,192 +338,16 @@ out: } int -sync_base_indices (void *index_priv) -{ - index_priv_t *priv = NULL; - DIR *dir_base_holder = NULL; - DIR *xattrop_dir = NULL; - struct dirent *entry = NULL; - char base_indices_holder[PATH_MAX] = {0}; - char xattrop_directory[PATH_MAX] = {0}; - char base_index_path[PATH_MAX] = {0}; - char xattrop_index_path[PATH_MAX] = {0}; - int32_t op_errno = 0; - int ret = 0; - - priv = index_priv; - - snprintf (base_indices_holder, PATH_MAX, "%s/%s", priv->index_basepath, - BASE_INDICES_HOLDER_SUBDIR); - snprintf (xattrop_directory, PATH_MAX, "%s/%s", priv->index_basepath, - XATTROP_SUBDIR); - - if ((dir_base_holder = opendir(base_indices_holder)) == NULL) { - op_errno = errno; - ret = -1; - goto out; - } - if ((xattrop_dir = opendir (xattrop_directory)) == NULL) { - op_errno = errno; - ret = -1; - (void) closedir (dir_base_holder); - goto out; - } - - priv->to_be_healed_states = sync_started; - while ((entry = readdir(xattrop_dir)) != NULL) { - if (!strcmp (entry->d_name, ".") || - !strcmp (entry->d_name, "..")) { - continue; - } - if (strncmp (entry->d_name, XATTROP_SUBDIR"-", - strlen (XATTROP_SUBDIR"-"))) { - continue; - } - if (!strncmp (entry->d_name, XATTROP_SUBDIR"-", - strlen (XATTROP_SUBDIR"-"))) { - - snprintf (xattrop_index_path, PATH_MAX, "%s/%s", - xattrop_directory, entry->d_name); - - snprintf (base_index_path, PATH_MAX, "%s/%s", - base_indices_holder, entry->d_name); - - ret = sys_link (xattrop_index_path, base_index_path); - - if (ret && errno != EEXIST) { - op_errno = errno; - (void) closedir (dir_base_holder); - (void) closedir (xattrop_dir); - goto out; - } - - } - } - ret = closedir (xattrop_dir); - if (ret) { - op_errno = errno; - (void) closedir (dir_base_holder); - goto out; - } - ret = closedir (dir_base_holder); - if (ret) { - op_errno = errno; - goto out; - } - - ret = 0; -out: - errno = op_errno; - return ret; - -} - -int -base_indices_syncing_done (int ret, call_frame_t *frame, void *data) -{ - index_priv_t *priv = NULL; - priv = data; - - if (!priv) - goto out; - - if (ret) { - priv->to_be_healed_states = sync_not_started; - } else { - priv->to_be_healed_states = synced_state; - } - - STACK_DESTROY (frame->root); - -out: - return 0; -} - -int -sync_base_indices_from_xattrop (xlator_t *this) -{ - - index_priv_t *priv = NULL; - char base_indices_holder[PATH_MAX] = {0}; - int ret = 0; - struct stat st = {0}; - DIR *dir = NULL; - struct dirent *entry = NULL; - call_frame_t *frame = NULL; - - priv = this->private; - - if (priv->to_be_healed_states != sync_not_started) { - ret = -1; - goto out; - } - - snprintf (base_indices_holder, PATH_MAX, "%s/%s", priv->index_basepath, - BASE_INDICES_HOLDER_SUBDIR); - - ret = stat (base_indices_holder, &st); - - if (ret && (errno != ENOENT)) { - goto out; - } else if (errno == ENOENT) { - ret = index_dir_create (this, BASE_INDICES_HOLDER_SUBDIR); - if (ret) - goto out; - } else { - if ((dir = opendir (base_indices_holder)) == NULL) { - ret = -1; - goto out; - } - while ((entry = readdir (dir)) != NULL) { - if (!strcmp (entry->d_name, ".") || - !strcmp (entry->d_name,"..")) { - continue; - } - ret = unlink (entry->d_name); - if (ret) { - closedir (dir); - goto out; - } - } - closedir (dir); - } - - /*At this point of time we have index/base_indicies_holder directory - *is with no entries*/ - - frame = create_frame (this, this->ctx->pool); - if (!frame) { - ret = -1; - goto out; - } - set_lk_owner_from_ptr (&frame->root->lk_owner, frame->root); - - frame->root->pid = LOW_PRIO_PROC_PID; - - ret = synctask_new (this->ctx->env, sync_base_indices, - base_indices_syncing_done,frame, priv); - - - -out: - return ret; - -} - -int index_add (xlator_t *this, uuid_t gfid, const char *subdir) { int32_t op_errno = 0; char gfid_path[PATH_MAX] = {0}; char index_path[PATH_MAX] = {0}; - char base_path[PATH_MAX] = {0}; int ret = 0; uuid_t index = {0}; index_priv_t *priv = NULL; struct stat st = {0}; int fd = 0; - int index_created = 0; priv = this->private; GF_ASSERT_AND_GOTO_WITH_ERROR (this->name, !uuid_is_null (gfid), @@ -561,15 +362,12 @@ index_add (xlator_t *this, uuid_t gfid, const char *subdir) index_get_index (priv, index); make_index_path (priv->index_basepath, subdir, index, index_path, sizeof (index_path)); - ret = sys_link (index_path, gfid_path); if (!ret || (errno == EEXIST)) { ret = 0; - index_created = 1; goto out; } - op_errno = errno; if (op_errno == ENOENT) { ret = index_dir_create (this, subdir); @@ -601,36 +399,10 @@ index_add (xlator_t *this, uuid_t gfid, const char *subdir) "add to index (%s)", uuid_utoa (gfid), strerror (errno)); goto out; - } else { - index_created = 1; - } - - if (priv->to_be_healed_states != sync_not_started) { - make_index_path (priv->index_basepath, - GF_BASE_INDICES_HOLDER_GFID, - index, base_path, sizeof (base_path)); - ret = sys_link (index_path, base_path); - if (ret) - goto out; } ret = 0; out: - /*If base_indices_holder is not created: create and sync - *If directory is present: delete contents and start syncing - *If syncing is in progress :No need to do any thing - *If syncing is done: No need to do anything*/ - if (!ret) { - switch (priv->to_be_healed_states) { - case sync_not_started: - ret = sync_base_indices_from_xattrop (this); - break; - case sync_started: - case synced_state: - /*No need to do anything*/ - break; - } - } return ret; } @@ -966,6 +738,41 @@ out: return 0; } +uint64_t +index_entry_count (xlator_t *this, char *subdir) +{ + index_priv_t *priv = NULL; + char index_dir[PATH_MAX]; + DIR *dirp = NULL; + uint64_t count = 0; + struct dirent buf; + struct dirent *entry = NULL; + + priv = this->private; + + make_index_dir_path (priv->index_basepath, subdir, + index_dir, sizeof (index_dir)); + + dirp = opendir (index_dir); + if (!dirp) + return 0; + + while (readdir_r (dirp, &buf, &entry) == 0) { + if (!entry) + break; + if (!strcmp (entry->d_name, ".") || + !strcmp (entry->d_name, "..")) + continue; + if (!strncmp (entry->d_name, subdir, strlen (subdir))) + continue; + count++; + } + closedir (dirp); + + return count; +} + + int32_t index_getxattr_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name, dict_t *xdata) @@ -973,6 +780,7 @@ index_getxattr_wrapper (call_frame_t *frame, xlator_t *this, index_priv_t *priv = NULL; dict_t *xattr = NULL; int ret = 0; + uint64_t count = 0; priv = this->private; @@ -982,24 +790,26 @@ index_getxattr_wrapper (call_frame_t *frame, xlator_t *this, goto done; } - if (!strcmp (name, GF_XATTROP_INDEX_GFID)) { - - ret = dict_set_static_bin (xattr, (char*)name, - priv->xattrop_vgfid, - sizeof (priv->xattrop_vgfid)); - - } else if (!strcmp (name, GF_BASE_INDICES_HOLDER_GFID)) { - - ret = dict_set_static_bin (xattr, (char*)name, - priv->base_indices_holder_vgfid, - sizeof (priv->base_indices_holder_vgfid)); - } - if (ret) { - ret = -ENOMEM; - gf_log (THIS->name, GF_LOG_ERROR, "xattrop index " - "gfid set failed"); - goto done; - } + if (strcmp (name, GF_XATTROP_INDEX_GFID) == 0) { + ret = dict_set_static_bin (xattr, (char*)name, priv->xattrop_vgfid, + sizeof (priv->xattrop_vgfid)); + if (ret) { + ret = -ENOMEM; + gf_log (this->name, GF_LOG_ERROR, "xattrop index " + "gfid set failed"); + goto done; + } + } else if (strcmp (name, GF_XATTROP_INDEX_COUNT) == 0) { + count = index_entry_count (this, XATTROP_SUBDIR); + + ret = dict_set_uint64 (xattr, (char *)name, count); + if (ret) { + ret = -ENOMEM; + gf_log (this->name, GF_LOG_ERROR, "xattrop index " + "count set failed"); + goto done; + } + } done: if (ret) STACK_UNWIND_STRICT (getxattr, frame, -1, -ret, xattr, xdata); @@ -1037,15 +847,6 @@ index_lookup_wrapper (call_frame_t *frame, xlator_t *this, } else if (!uuid_compare (loc->pargfid, priv->xattrop_vgfid)) { make_file_path (priv->index_basepath, XATTROP_SUBDIR, loc->name, path, sizeof (path)); - } else if (!uuid_compare (loc->gfid,priv->base_indices_holder_vgfid)){ - make_index_dir_path (priv->index_basepath, - BASE_INDICES_HOLDER_SUBDIR, path, - sizeof (path)); - is_dir = _gf_true; - } else if (!uuid_compare (loc->pargfid, priv->base_indices_holder_vgfid)) { - make_file_path (priv->index_basepath, - BASE_INDICES_HOLDER_SUBDIR,loc->name, path, - sizeof (path)); } ret = lstat (path, &lstatbuf); @@ -1067,14 +868,10 @@ index_lookup_wrapper (call_frame_t *frame, xlator_t *this, } iatt_from_stat (&stbuf, &lstatbuf); - if (is_dir && !uuid_compare (loc->gfid, priv->xattrop_vgfid)) { + if (is_dir) uuid_copy (stbuf.ia_gfid, priv->xattrop_vgfid); - } else if (is_dir && - !uuid_compare (loc->gfid, priv->base_indices_holder_vgfid)) { - uuid_copy (stbuf.ia_gfid, priv->base_indices_holder_vgfid); - } else { + else uuid_generate (stbuf.ia_gfid); - } stbuf.ia_ino = -1; op_ret = 0; done: @@ -1086,44 +883,6 @@ done: } int32_t -base_indices_readdir_wrapper (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t off, dict_t *xdata) -{ - index_priv_t *priv = NULL; - char base_indices_holder[PATH_MAX] = {0}; - DIR *dir = NULL; - int32_t op_ret = -1; - int32_t op_errno = 0; - int count = 0; - gf_dirent_t entries; - - priv = this->private; - - make_index_dir_path (priv->index_basepath, BASE_INDICES_HOLDER_SUBDIR, - base_indices_holder, sizeof (base_indices_holder)); - - dir = opendir (base_indices_holder); - if (!dir) { - op_errno = EINVAL; - goto done; - } - - - INIT_LIST_HEAD (&entries.list); - - count = index_fill_readdir (fd, dir, off, size, &entries, - BASE_INDICES_HOLDER); - /* pick ENOENT to indicate EOF */ - op_errno = errno; - op_ret = count; - closedir (dir); -done: - STACK_UNWIND_STRICT (readdir, frame, op_ret, op_errno, &entries, xdata); - gf_dirent_free (&entries); - return 0; -} - -int32_t index_readdir_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, off_t off, dict_t *xdata) { @@ -1154,8 +913,7 @@ index_readdir_wrapper (call_frame_t *frame, xlator_t *this, goto done; } - count = index_fill_readdir (fd, dir, off, size, &entries, - INDEX_XATTROP); + count = index_fill_readdir (fd, dir, off, size, &entries); /* pick ENOENT to indicate EOF */ op_errno = errno; @@ -1221,11 +979,12 @@ index_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name, dict_t *xdata) { call_stub_t *stub = NULL; + index_priv_t *priv = NULL; - if (!name) - goto out; - if (strcmp (GF_XATTROP_INDEX_GFID, name) && - strcmp (GF_BASE_INDICES_HOLDER_GFID, name)) + priv = this->private; + + if (!name || (strcmp (GF_XATTROP_INDEX_GFID, name) && + strcmp (GF_XATTROP_INDEX_COUNT, name))) goto out; stub = fop_getxattr_stub (frame, index_getxattr_wrapper, loc, name, @@ -1252,9 +1011,7 @@ index_lookup (call_frame_t *frame, xlator_t *this, priv = this->private; if (uuid_compare (loc->gfid, priv->xattrop_vgfid) && - uuid_compare (loc->pargfid, priv->xattrop_vgfid) && - uuid_compare (loc->gfid, priv->base_indices_holder_vgfid) && - uuid_compare (loc->pargfid, priv->base_indices_holder_vgfid)) + uuid_compare (loc->pargfid, priv->xattrop_vgfid)) goto normal; stub = fop_lookup_stub (frame, index_lookup_wrapper, loc, xattr_req); @@ -1280,19 +1037,10 @@ index_readdir (call_frame_t *frame, xlator_t *this, index_priv_t *priv = NULL; priv = this->private; - if (uuid_compare (fd->inode->gfid, priv->xattrop_vgfid) && - uuid_compare (fd->inode->gfid, priv->base_indices_holder_vgfid)) + if (uuid_compare (fd->inode->gfid, priv->xattrop_vgfid)) goto out; - - if (!uuid_compare (fd->inode->gfid, priv->xattrop_vgfid)) { - stub = fop_readdir_stub (frame, index_readdir_wrapper, fd, size, - off, xdata); - } else if (!uuid_compare (fd->inode->gfid, - priv->base_indices_holder_vgfid)) { - stub = fop_readdir_stub (frame, base_indices_readdir_wrapper, - fd, size, off, xdata); - } - + stub = fop_readdir_stub (frame, index_readdir_wrapper, fd, size, off, + xdata); if (!stub) { STACK_UNWIND_STRICT (readdir, frame, -1, ENOMEM, NULL, NULL); return 0; @@ -1396,9 +1144,6 @@ init (xlator_t *this) GF_OPTION_INIT ("index-base", priv->index_basepath, path, out); uuid_generate (priv->index); uuid_generate (priv->xattrop_vgfid); - /*base_indices_holder is a directory which contains hard links to - * all base indices inside indices/xattrop directory*/ - uuid_generate (priv->base_indices_holder_vgfid); INIT_LIST_HEAD (&priv->callstubs); this->private = priv; @@ -1415,7 +1160,6 @@ init (xlator_t *this) } ret = 0; - out: if (ret) { if (cond_inited) diff --git a/xlators/features/index/src/index.h b/xlators/features/index/src/index.h index d6dcb1c23b4..661dcdbc417 100644 --- a/xlators/features/index/src/index.h +++ b/xlators/features/index/src/index.h @@ -36,28 +36,14 @@ typedef struct index_fd_ctx { DIR *dir; } index_fd_ctx_t; -typedef enum { - sync_not_started, - sync_started, - synced_state, -} to_be_healed_states_t; - -typedef enum { - INDEX_XATTROP, - BASE_INDICES_HOLDER, -} readdir_directory; - typedef struct index_priv { char *index_basepath; uuid_t index; gf_lock_t lock; uuid_t xattrop_vgfid;//virtual gfid of the xattrop index dir - uuid_t base_indices_holder_vgfid; //virtual gfid of the - //to_be_healed_xattrop directory struct list_head callstubs; pthread_mutex_t mutex; pthread_cond_t cond; - to_be_healed_states_t to_be_healed_states; } index_priv_t; #define INDEX_STACK_UNWIND(fop, frame, params ...) \ |