diff options
Diffstat (limited to 'xlators/cluster')
68 files changed, 22946 insertions, 16743 deletions
diff --git a/xlators/cluster/afr/src/Makefile.am b/xlators/cluster/afr/src/Makefile.am index 16ed25af1..35d18a6c0 100644 --- a/xlators/cluster/afr/src/Makefile.am +++ b/xlators/cluster/afr/src/Makefile.am @@ -1,21 +1,31 @@ xlator_LTLIBRARIES = afr.la pump.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster -afr_common_source = afr-dir-read.c afr-dir-write.c afr-inode-read.c afr-inode-write.c afr-open.c afr-transaction.c afr-self-heal-data.c afr-self-heal-common.c afr-self-heal-metadata.c afr-self-heal-entry.c afr-self-heal-algorithm.c afr-lk-common.c afr-self-heald.c $(top_builddir)/xlators/lib/src/libxlator.c +afr_common_source = afr-dir-read.c afr-dir-write.c afr-inode-read.c \ + afr-inode-write.c afr-open.c afr-transaction.c afr-self-heal-data.c \ + afr-self-heal-common.c afr-self-heal-metadata.c afr-self-heal-entry.c \ + afr-self-heal-algorithm.c afr-lk-common.c afr-self-heald.c \ + $(top_builddir)/xlators/lib/src/libxlator.c -afr_la_LDFLAGS = -module -avoidversion +afr_la_LDFLAGS = -module -avoid-version afr_la_SOURCES = $(afr_common_source) afr.c afr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -pump_la_LDFLAGS = -module -avoidversion +pump_la_LDFLAGS = -module -avoid-version pump_la_SOURCES = $(afr_common_source) pump.c pump_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -noinst_HEADERS = afr.h afr-transaction.h afr-inode-write.h afr-inode-read.h afr-dir-read.h afr-dir-write.h afr-self-heal.h afr-self-heal-common.h afr-self-heal-algorithm.h pump.h afr-mem-types.h afr-common.c afr-self-heald.h $(top_builddir)/xlators/lib/src/libxlator.h $(top_builddir)/glusterfsd/src/glusterfsd.h +noinst_HEADERS = afr.h afr-transaction.h afr-inode-write.h afr-inode-read.h \ + afr-dir-read.h afr-dir-write.h afr-self-heal.h afr-self-heal-common.h \ + afr-self-heal-algorithm.h pump.h afr-mem-types.h afr-common.c \ + afr-self-heald.h $(top_builddir)/xlators/lib/src/libxlator.h \ + $(top_builddir)/glusterfsd/src/glusterfsd.h 
-AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ - -I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/contrib/md5 -shared -nostartfiles $(GF_CFLAGS) \ - -I$(top_srcdir)/xlators/lib/src +AM_CPPFLAGS = $(GF_CPPFLAGS) \ + -I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/xlators/lib/src \ + -I$(top_srcdir)/rpc/rpc-lib/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) CLEANFILES = diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 0e4e97355..af01f2ef2 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ #include <libgen.h> @@ -44,6 +35,7 @@ #include "compat.h" #include "byte-order.h" #include "statedump.h" +#include "inode.h" #include "fd.h" @@ -57,10 +49,9 @@ #include "afr-self-heald.h" #include "pump.h" -#define AFR_ICTX_OPENDIR_DONE_MASK 0x0000000200000000ULL -#define AFR_ICTX_SPLIT_BRAIN_MASK 0x0000000100000000ULL +#define AFR_ICTX_OPENDIR_DONE_MASK 0x0000000100000000ULL #define AFR_ICTX_READ_CHILD_MASK 0x00000000FFFFFFFFULL - +#define AFR_STATISTICS_HISTORY_SIZE 50 int afr_lookup_done_success_action (call_frame_t *frame, xlator_t *this, gf_boolean_t fail_conflict); @@ -91,6 +82,75 @@ afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req, const char *path) path, priv->pending_key[i]); /* 3 = data+metadata+entry */ } + ret = dict_set_int32 (xattr_req, GF_GFIDLESS_LOOKUP, 1); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, "%s: failed to set gfidless " + "lookup", path); + } +} + +int +afr_lookup_xattr_req_prepare (afr_local_t *local, xlator_t *this, + dict_t *xattr_req, loc_t *loc, void **gfid_req) +{ + int ret = -ENOMEM; + + GF_ASSERT (gfid_req); + + *gfid_req = NULL; + local->xattr_req = dict_new (); + if (!local->xattr_req) + goto out; + if (xattr_req) + dict_copy (xattr_req, local->xattr_req); + + afr_xattr_req_prepare (this, local->xattr_req, loc->path); + ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_INODELK_COUNT, 0); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "%s: Unable to set dict value for %s", + loc->path, GLUSTERFS_INODELK_COUNT); + } + ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_ENTRYLK_COUNT, 0); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "%s: Unable to set dict value for %s", + loc->path, GLUSTERFS_ENTRYLK_COUNT); + } + + ret = dict_set_uint32 (local->xattr_req, GLUSTERFS_PARENT_ENTRYLK, 0); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "%s: Unable to set dict value for %s", + loc->path, GLUSTERFS_PARENT_ENTRYLK); + } + + ret = dict_get_ptr (local->xattr_req, "gfid-req", gfid_req); + if 
(ret) { + gf_log (this->name, GF_LOG_DEBUG, + "%s: failed to get the gfid from dict", loc->path); + *gfid_req = NULL; + } else { + if (loc->parent != NULL) + dict_del (local->xattr_req, "gfid-req"); + } + ret = 0; +out: + return ret; +} + +void +afr_lookup_save_gfid (uuid_t dst, void* new, const loc_t *loc) +{ + inode_t *inode = NULL; + + inode = loc->inode; + if (inode && !uuid_is_null (inode->gfid)) + uuid_copy (dst, inode->gfid); + else if (!uuid_is_null (loc->gfid)) + uuid_copy (dst, loc->gfid); + else if (new && !uuid_is_null (new)) + uuid_copy (dst, new); } int @@ -142,60 +202,86 @@ out: return ret; } -afr_inode_ctx_t* -afr_inode_ctx_get_from_addr (uint64_t addr, int32_t child_count) +void +afr_inode_ctx_destroy (afr_inode_ctx_t *ctx) { - int ret = -1; - afr_inode_ctx_t *ctx = NULL; - size_t size = 0; + if (!ctx) + return; + GF_FREE (ctx->fresh_children); + GF_FREE (ctx); +} - GF_ASSERT (child_count > 0); +afr_inode_ctx_t* +__afr_inode_ctx_get (inode_t *inode, xlator_t *this) +{ + int ret = 0; + uint64_t ctx_addr = 0; + afr_inode_ctx_t *ctx = NULL; + afr_private_t *priv = NULL; - if (!addr) { - ctx = GF_CALLOC (1, sizeof (*ctx), - gf_afr_mt_inode_ctx_t); - if (!ctx) - goto out; - size = sizeof (*ctx->fresh_children); - ctx->fresh_children = GF_CALLOC (child_count, size, - gf_afr_mt_int32_t); - if (!ctx->fresh_children) - goto out; - } else { - ctx = (afr_inode_ctx_t*) (long) addr; + priv = this->private; + ret = __inode_ctx_get (inode, this, &ctx_addr); + if (ret < 0) + ctx_addr = 0; + if (ctx_addr != 0) { + ctx = (afr_inode_ctx_t*) (long) ctx_addr; + goto out; } - ret = 0; + ctx = GF_CALLOC (1, sizeof (*ctx), + gf_afr_mt_inode_ctx_t); + if (!ctx) + goto fail; + ctx->fresh_children = GF_CALLOC (priv->child_count, + sizeof (*ctx->fresh_children), + gf_afr_mt_int32_t); + if (!ctx->fresh_children) + goto fail; + ret = __inode_ctx_put (inode, this, (uint64_t)ctx); + if (ret) { + gf_log_callingfn (this->name, GF_LOG_ERROR, "failed to " + "set the inode ctx (%s)", 
+ uuid_utoa (inode->gfid)); + goto fail; + } + out: - if (ret && ctx) { - if (ctx->fresh_children) - GF_FREE (ctx->fresh_children); - GF_FREE (ctx); - ctx = NULL; + return ctx; + +fail: + afr_inode_ctx_destroy (ctx); + return NULL; +} + +afr_inode_ctx_t* +afr_inode_ctx_get (inode_t *inode, xlator_t *this) +{ + afr_inode_ctx_t *ctx = NULL; + + LOCK (&inode->lock); + { + ctx = __afr_inode_ctx_get (inode, this); } + UNLOCK (&inode->lock); return ctx; } void -afr_inode_get_ctx (xlator_t *this, inode_t *inode, afr_inode_params_t *params) +afr_inode_get_ctx_params (xlator_t *this, inode_t *inode, + afr_inode_params_t *params) { GF_ASSERT (inode); GF_ASSERT (params); - int ret = 0; afr_inode_ctx_t *ctx = NULL; afr_private_t *priv = NULL; int i = 0; - uint64_t ctx_addr = 0; int32_t read_child = -1; int32_t *fresh_children = NULL; priv = this->private; LOCK (&inode->lock); { - ret = __inode_ctx_get (inode, this, &ctx_addr); - if (ret < 0) - goto unlock; - ctx = afr_inode_ctx_get_from_addr (ctx_addr, priv->child_count); + ctx = __afr_inode_ctx_get (inode, this); if (!ctx) goto unlock; switch (params->op) { @@ -210,11 +296,9 @@ afr_inode_get_ctx (xlator_t *this, inode_t *inode, afr_inode_params_t *params) fresh_children[i] = ctx->fresh_children[i]; break; case AFR_INODE_GET_OPENDIR_DONE: - params->u.value = ctx->masks & - AFR_ICTX_OPENDIR_DONE_MASK; - break; - case AFR_INODE_GET_SPLIT_BRAIN: - params->u.value = ctx->masks & AFR_ICTX_SPLIT_BRAIN_MASK; + params->u.value = _gf_false; + if (ctx->masks & AFR_ICTX_OPENDIR_DONE_MASK) + params->u.value = _gf_true; break; default: GF_ASSERT (0); @@ -225,14 +309,19 @@ unlock: UNLOCK (&inode->lock); } -uint64_t +gf_boolean_t afr_is_split_brain (xlator_t *this, inode_t *inode) { - afr_inode_params_t params = {0}; + afr_inode_ctx_t *ctx = NULL; + gf_boolean_t spb = _gf_false; - params.op = AFR_INODE_GET_SPLIT_BRAIN; - afr_inode_get_ctx (this, inode, &params); - return params.u.value; + ctx = afr_inode_ctx_get (inode, this); + if (!ctx) + goto 
out; + if ((ctx->mdata_spb == SPB) || (ctx->data_spb == SPB)) + spb = _gf_true; +out: + return spb; } gf_boolean_t @@ -241,11 +330,10 @@ afr_is_opendir_done (xlator_t *this, inode_t *inode) afr_inode_params_t params = {0}; params.op = AFR_INODE_GET_OPENDIR_DONE; - afr_inode_get_ctx (this, inode, &params); + afr_inode_get_ctx_params (this, inode, &params); return params.u.value; } - int32_t afr_inode_get_read_ctx (xlator_t *this, inode_t *inode, int32_t *fresh_children) { @@ -253,7 +341,7 @@ afr_inode_get_read_ctx (xlator_t *this, inode_t *inode, int32_t *fresh_children) params.op = AFR_INODE_GET_READ_CTX; params.u.read_ctx.children = fresh_children; - afr_inode_get_ctx (this, inode, &params); + afr_inode_get_ctx_params (this, inode, &params); return params.u.read_ctx.read_child; } @@ -263,7 +351,6 @@ afr_inode_ctx_set_read_child (afr_inode_ctx_t *ctx, int32_t read_child) uint64_t remaining_mask = 0; uint64_t mask = 0; - GF_ASSERT (read_child >= 0); remaining_mask = (~AFR_ICTX_READ_CHILD_MASK & ctx->masks); mask = (AFR_ICTX_READ_CHILD_MASK & read_child); ctx->masks = remaining_mask | mask; @@ -285,19 +372,23 @@ afr_inode_ctx_set_read_ctx (afr_inode_ctx_t *ctx, int32_t read_child, } void -afr_inode_ctx_rm_stale_children (afr_inode_ctx_t *ctx, int32_t read_child, - int32_t *stale_children, int32_t child_count) +afr_inode_ctx_rm_stale_children (afr_inode_ctx_t *ctx, int32_t *stale_children, + int32_t child_count) { int i = 0; + int32_t read_child = -1; GF_ASSERT (stale_children); - afr_inode_ctx_set_read_child (ctx, read_child); for (i = 0; i < child_count; i++) { - if ((ctx->fresh_children[i] == -1) || (stale_children[i] == -1)) + if (stale_children[i] == -1) break; afr_children_rm_child (ctx->fresh_children, stale_children[i], child_count); } + read_child = (int32_t)(ctx->masks & AFR_ICTX_READ_CHILD_MASK); + if (!afr_is_child_present (ctx->fresh_children, child_count, + read_child)) + afr_inode_ctx_set_read_child (ctx, ctx->fresh_children[0]); } void @@ -312,31 +403,14 @@ 
afr_inode_ctx_set_opendir_done (afr_inode_ctx_t *ctx) } void -afr_inode_ctx_set_splitbrain (afr_inode_ctx_t *ctx, gf_boolean_t set) -{ - uint64_t remaining_mask = 0; - uint64_t mask = 0; - - if (set) { - remaining_mask = (~AFR_ICTX_SPLIT_BRAIN_MASK & ctx->masks); - mask = (0xFFFFFFFFFFFFFFFFULL & AFR_ICTX_SPLIT_BRAIN_MASK); - ctx->masks = remaining_mask | mask; - } else { - ctx->masks = (~AFR_ICTX_SPLIT_BRAIN_MASK & ctx->masks); - } -} - -void -afr_inode_set_ctx (xlator_t *this, inode_t *inode, afr_inode_params_t *params) +afr_inode_set_ctx_params (xlator_t *this, inode_t *inode, + afr_inode_params_t *params) { GF_ASSERT (inode); GF_ASSERT (params); - int ret = 0; afr_inode_ctx_t *ctx = NULL; afr_private_t *priv = NULL; - uint64_t ctx_addr = 0; - gf_boolean_t set = _gf_false; int32_t read_child = -1; int32_t *fresh_children = NULL; int32_t *stale_children = NULL; @@ -344,10 +418,7 @@ afr_inode_set_ctx (xlator_t *this, inode_t *inode, afr_inode_params_t *params) priv = this->private; LOCK (&inode->lock); { - ret = __inode_ctx_get (inode, this, &ctx_addr); - if (ret < 0) - ctx_addr = 0; - ctx = afr_inode_ctx_get_from_addr (ctx_addr, priv->child_count); + ctx = __afr_inode_ctx_get (inode, this); if (!ctx) goto unlock; switch (params->op) { @@ -359,42 +430,34 @@ afr_inode_set_ctx (xlator_t *this, inode_t *inode, afr_inode_params_t *params) priv->child_count); break; case AFR_INODE_RM_STALE_CHILDREN: - read_child = params->u.read_ctx.read_child; stale_children = params->u.read_ctx.children; - afr_inode_ctx_rm_stale_children (ctx, read_child, + afr_inode_ctx_rm_stale_children (ctx, stale_children, priv->child_count); break; case AFR_INODE_SET_OPENDIR_DONE: afr_inode_ctx_set_opendir_done (ctx); break; - case AFR_INODE_SET_SPLIT_BRAIN: - set = params->u.value; - afr_inode_ctx_set_splitbrain (ctx, set); - break; default: GF_ASSERT (0); break; } - ret = __inode_ctx_put (inode, this, (uint64_t)ctx); - if (ret) { - gf_log_callingfn (this->name, GF_LOG_ERROR, "failed to " - 
"set the inode ctx (%s)", - uuid_utoa (inode->gfid)); - } } unlock: UNLOCK (&inode->lock); } void -afr_set_split_brain (xlator_t *this, inode_t *inode, gf_boolean_t set) +afr_set_split_brain (xlator_t *this, inode_t *inode, afr_spb_state_t mdata_spb, + afr_spb_state_t data_spb) { - afr_inode_params_t params = {0}; + afr_inode_ctx_t *ctx = NULL; - params.op = AFR_INODE_SET_SPLIT_BRAIN; - params.u.value = set; - afr_inode_set_ctx (this, inode, &params); + ctx = afr_inode_ctx_get (inode, this); + if (mdata_spb != DONT_KNOW) + ctx->mdata_spb = mdata_spb; + if (data_spb != DONT_KNOW) + ctx->data_spb = data_spb; } void @@ -403,7 +466,7 @@ afr_set_opendir_done (xlator_t *this, inode_t *inode) afr_inode_params_t params = {0}; params.op = AFR_INODE_SET_OPENDIR_DONE; - afr_inode_set_ctx (this, inode, &params); + afr_inode_set_ctx_params (this, inode, &params); } void @@ -422,22 +485,20 @@ afr_inode_set_read_ctx (xlator_t *this, inode_t *inode, int32_t read_child, params.op = AFR_INODE_SET_READ_CTX; params.u.read_ctx.read_child = read_child; params.u.read_ctx.children = fresh_children; - afr_inode_set_ctx (this, inode, &params); + afr_inode_set_ctx_params (this, inode, &params); } void -afr_inode_rm_stale_children (xlator_t *this, inode_t *inode, int32_t read_child, +afr_inode_rm_stale_children (xlator_t *this, inode_t *inode, int32_t *stale_children) { afr_inode_params_t params = {0}; - GF_ASSERT (read_child >= 0); GF_ASSERT (stale_children); params.op = AFR_INODE_RM_STALE_CHILDREN; - params.u.read_ctx.read_child = read_child; params.u.read_ctx.children = stale_children; - afr_inode_set_ctx (this, inode, &params); + afr_inode_set_ctx_params (this, inode, &params); } gf_boolean_t @@ -481,6 +542,10 @@ afr_is_read_child (int32_t *success_children, int32_t *sources, gf_boolean_t success_child = _gf_false; gf_boolean_t source = _gf_false; + if (child < 0) { + return _gf_false; + } + GF_ASSERT (success_children); GF_ASSERT (child_count > 0); @@ -497,29 +562,69 @@ out: return (success_child && source); } 
+int32_t +afr_hash_child (int32_t *success_children, int32_t child_count, + unsigned int hmode, uuid_t gfid) +{ + uuid_t gfid_copy = {0,}; + pid_t pid; + + if (!hmode) { + return -1; + } + + if (gfid) { + uuid_copy(gfid_copy,gfid); + } + if (hmode > 1) { + /* + * Why getpid? Because it's one of the cheapest calls + * available - faster than gethostname etc. - and returns a + * constant-length value that's sure to be shorter than a UUID. + * It's still very unlikely to be the same across clients, so + * it still provides good mixing. We're not trying for + * perfection here. All we need is a low probability that + * multiple clients will converge on the same subvolume. + */ + pid = getpid(); + memcpy (gfid_copy, &pid, sizeof(pid)); + } + + return SuperFastHash((char *)gfid_copy, + sizeof(gfid_copy)) % child_count; +} + /* If sources is NULL the xattrs are assumed to be of source for all * success_children. */ int -afr_select_read_child_from_policy (int32_t *success_children, int32_t child_count, - int32_t prev_read_child, - int32_t config_read_child, int32_t *sources) +afr_select_read_child_from_policy (int32_t *success_children, + int32_t child_count, int32_t prev_read_child, + int32_t config_read_child, int32_t *sources, + unsigned int hmode, uuid_t gfid) { int32_t read_child = -1; int i = 0; GF_ASSERT (success_children); - read_child = prev_read_child; + read_child = config_read_child; if (afr_is_read_child (success_children, sources, child_count, read_child)) goto out; - read_child = config_read_child; + read_child = prev_read_child; if (afr_is_read_child (success_children, sources, child_count, read_child)) goto out; + read_child = afr_hash_child (success_children, child_count, + hmode, gfid); + if (afr_is_read_child (success_children, sources, child_count, + read_child)) { + goto out; + } + for (i = 0; i < child_count; i++) { read_child = success_children[i]; if (read_child < 0) @@ -539,7 +644,7 @@ out: void afr_set_read_ctx_from_policy (xlator_t *this, 
inode_t *inode, int32_t *fresh_children, int32_t prev_read_child, - int32_t config_read_child) + int32_t config_read_child, uuid_t gfid) { int read_child = -1; afr_private_t *priv = NULL; @@ -549,7 +654,8 @@ afr_set_read_ctx_from_policy (xlator_t *this, inode_t *inode, priv->child_count, prev_read_child, config_read_child, - NULL); + NULL, + priv->hash_mode, gfid); if (read_child >= 0) afr_inode_set_read_ctx (this, inode, read_child, fresh_children); @@ -605,8 +711,11 @@ afr_get_call_child (xlator_t *this, unsigned char *child_up, int32_t read_child, GF_ASSERT (call_child); GF_ASSERT (last_index); GF_ASSERT (fresh_children); - GF_ASSERT (read_child >= 0); + if (read_child < 0) { + ret = -EIO; + goto out; + } priv = this->private; *call_child = -1; *last_index = -1; @@ -655,80 +764,66 @@ out: } void +afr_xattr_array_destroy (dict_t **xattr, unsigned int child_count) +{ + afr_reset_xattr (xattr, child_count); + GF_FREE (xattr); +} + +void afr_local_sh_cleanup (afr_local_t *local, xlator_t *this) { afr_self_heal_t *sh = NULL; afr_private_t *priv = NULL; - int i = 0; - sh = &local->self_heal; priv = this->private; - if (sh->buf) - GF_FREE (sh->buf); + if (sh->data_sh_info && strcmp (sh->data_sh_info, "")) + GF_FREE (sh->data_sh_info); + + if (sh->metadata_sh_info && strcmp (sh->metadata_sh_info, "")) + GF_FREE (sh->metadata_sh_info); - if (sh->parentbufs) - GF_FREE (sh->parentbufs); + GF_FREE (sh->buf); + + GF_FREE (sh->parentbufs); if (sh->inode) inode_unref (sh->inode); - if (sh->xattr) { - afr_reset_xattr (sh->xattr, priv->child_count); - GF_FREE (sh->xattr); - } + afr_xattr_array_destroy (sh->xattr, priv->child_count); - if (sh->child_errno) - GF_FREE (sh->child_errno); + GF_FREE (sh->child_errno); - if (sh->pending_matrix) { - for (i = 0; i < priv->child_count; i++) { - GF_FREE (sh->pending_matrix[i]); - } - GF_FREE (sh->pending_matrix); - } + afr_matrix_cleanup (sh->pending_matrix, priv->child_count); + afr_matrix_cleanup (sh->delta_matrix, priv->child_count); - 
if (sh->delta_matrix) { - for (i = 0; i < priv->child_count; i++) { - GF_FREE (sh->delta_matrix[i]); - } - GF_FREE (sh->delta_matrix); - } - - if (sh->sources) - GF_FREE (sh->sources); + GF_FREE (sh->sources); - if (sh->success) - GF_FREE (sh->success); + GF_FREE (sh->success); - if (sh->locked_nodes) - GF_FREE (sh->locked_nodes); + GF_FREE (sh->locked_nodes); if (sh->healing_fd) { fd_unref (sh->healing_fd); sh->healing_fd = NULL; } - if (sh->linkname) - GF_FREE ((char *)sh->linkname); + GF_FREE ((char *)sh->linkname); - if (sh->success_children) - GF_FREE (sh->success_children); + GF_FREE (sh->success_children); - if (sh->fresh_children) - GF_FREE (sh->fresh_children); + GF_FREE (sh->fresh_children); - if (sh->fresh_parent_dirs) - GF_FREE (sh->fresh_parent_dirs); + GF_FREE (sh->fresh_parent_dirs); loc_wipe (&sh->parent_loc); + loc_wipe (&sh->lookup_loc); - if (sh->checksum) - GF_FREE (sh->checksum); + GF_FREE (sh->checksum); - if (sh->write_needed) - GF_FREE (sh->write_needed); + GF_FREE (sh->write_needed); if (sh->healing_fd) fd_unref (sh->healing_fd); } @@ -737,33 +832,26 @@ afr_local_sh_cleanup (afr_local_t *local, xlator_t *this) void afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this) { - int i = 0; - afr_private_t * priv = NULL; + afr_private_t *priv = NULL; + int i = 0; priv = this->private; - for (i = 0; i < priv->child_count; i++) { - if (local->pending && local->pending[i]) - GF_FREE (local->pending[i]); - } - - GF_FREE (local->pending); - - if (local->internal_lock.locked_nodes) - GF_FREE (local->internal_lock.locked_nodes); + afr_matrix_cleanup (local->pending, priv->child_count); + afr_matrix_cleanup (local->transaction.txn_changelog, + priv->child_count); - if (local->internal_lock.inode_locked_nodes) - GF_FREE (local->internal_lock.inode_locked_nodes); + GF_FREE (local->internal_lock.locked_nodes); - if (local->internal_lock.entry_locked_nodes) - GF_FREE (local->internal_lock.entry_locked_nodes); + for (i = 0; 
local->internal_lock.inodelk[i].domain; i++) { + GF_FREE (local->internal_lock.inodelk[i].locked_nodes); + } - if (local->internal_lock.lower_locked_nodes) - GF_FREE (local->internal_lock.lower_locked_nodes); + GF_FREE (local->internal_lock.lower_locked_nodes); + afr_entry_lockee_cleanup (&local->internal_lock); - GF_FREE (local->transaction.child_errno); - GF_FREE (local->child_errno); + GF_FREE (local->transaction.pre_op); GF_FREE (local->transaction.eager_lock); GF_FREE (local->transaction.basename); @@ -771,6 +859,8 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this) loc_wipe (&local->transaction.parent_loc); loc_wipe (&local->transaction.new_parent_loc); + + GF_FREE (local->transaction.postop_piggybacked); } @@ -797,14 +887,16 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this) if (local->xattr_req) dict_unref (local->xattr_req); - if (local->child_up) - GF_FREE (local->child_up); + if (local->dict) + dict_unref (local->dict); - if (local->fresh_children) - GF_FREE (local->fresh_children); + GF_FREE(local->replies); - if (local->fd_open_on) - GF_FREE (local->fd_open_on); + GF_FREE (local->child_up); + + GF_FREE (local->child_errno); + + GF_FREE (local->fresh_children); { /* lookup */ if (local->cont.lookup.xattrs) { @@ -822,27 +914,23 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this) inode_unref (local->cont.lookup.inode); } - if (local->cont.lookup.postparents) - GF_FREE (local->cont.lookup.postparents); + GF_FREE (local->cont.lookup.postparents); - if (local->cont.lookup.bufs) - GF_FREE (local->cont.lookup.bufs); + GF_FREE (local->cont.lookup.bufs); - if (local->cont.lookup.success_children) - GF_FREE (local->cont.lookup.success_children); + GF_FREE (local->cont.lookup.success_children); - if (local->cont.lookup.sources) - GF_FREE (local->cont.lookup.sources); + GF_FREE (local->cont.lookup.sources); + afr_matrix_cleanup (local->cont.lookup.pending_matrix, + priv->child_count); } { /* getxattr */ - if 
(local->cont.getxattr.name) - GF_FREE (local->cont.getxattr.name); + GF_FREE (local->cont.getxattr.name); } { /* lk */ - if (local->cont.lk.locked_nodes) - GF_FREE (local->cont.lk.locked_nodes); + GF_FREE (local->cont.lk.locked_nodes); } { /* create */ @@ -876,18 +964,40 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this) dict_unref (local->cont.setxattr.dict); } + { /* fsetxattr */ + if (local->cont.fsetxattr.dict) + dict_unref (local->cont.fsetxattr.dict); + } + { /* removexattr */ GF_FREE (local->cont.removexattr.name); } - + { /* xattrop */ + if (local->cont.xattrop.xattr) + dict_unref (local->cont.xattrop.xattr); + } + { /* fxattrop */ + if (local->cont.fxattrop.xattr) + dict_unref (local->cont.fxattrop.xattr); + } { /* symlink */ GF_FREE (local->cont.symlink.linkpath); } { /* opendir */ - if (local->cont.opendir.checksum) - GF_FREE (local->cont.opendir.checksum); + GF_FREE (local->cont.opendir.checksum); } + + { /* readdirp */ + if (local->cont.readdir.dict) + dict_unref (local->cont.readdir.dict); + } + + if (local->xdata_req) + dict_unref (local->xdata_req); + + if (local->xdata_rsp) + dict_unref (local->xdata_rsp); } @@ -936,6 +1046,13 @@ afr_locked_children_count (unsigned char *children, unsigned int child_count) return afr_set_elem_count_get (children, child_count); } +unsigned int +afr_pre_op_done_children_count (unsigned char *pre_op, + unsigned int child_count) +{ + return afr_set_elem_count_get (pre_op, child_count); +} + gf_boolean_t afr_is_fresh_lookup (loc_t *loc, xlator_t *this) { @@ -963,33 +1080,144 @@ afr_update_loc_gfids (loc_t *loc, struct iatt *buf, struct iatt *postparent) uuid_copy (loc->pargfid, postparent->ia_gfid); } +/* + * Quota size xattrs are not maintained by afr. There is a + * possibility that they differ even when both the directory changelog xattrs + * suggest everything is fine. So if there is at least one 'source' check among + * the sources which has the maximum quota size. 
Otherwise check among all the + * available ones for maximum quota size. This way if there is a source and + * stale copies it always votes for the 'source'. + * */ + +static void +afr_handle_quota_size (afr_local_t *local, xlator_t *this, + dict_t *rsp_dict) +{ + int32_t *sources = NULL; + dict_t *xattr = NULL; + data_t *max_data = NULL; + int64_t max_quota_size = -1; + data_t *data = NULL; + int64_t *size = NULL; + int64_t quota_size = -1; + afr_private_t *priv = NULL; + int i = 0; + int ret = -1; + gf_boolean_t source_present = _gf_false; + + priv = this->private; + sources = local->cont.lookup.sources; + + if (rsp_dict == NULL) { + gf_log_callingfn (this->name, GF_LOG_ERROR, "%s: Invalid " + "response dictionary", local->loc.path); + return; + } + + for (i = 0; i < priv->child_count; i++) { + if (sources[i]) { + source_present = _gf_true; + break; + } + } + + for (i = 0; i < priv->child_count; i++) { + /* + * If there is at least one source lets check + * for maximum quota sizes among sources, otherwise take the + * maximum of the ones present to be on the safer side. 
+ */ + if (source_present && !sources[i]) + continue; + + xattr = local->cont.lookup.xattrs[i]; + if (!xattr) + continue; + + data = dict_get (xattr, QUOTA_SIZE_KEY); + if (!data) + continue; + + size = (int64_t*)data->data; + quota_size = ntoh64(*size); + gf_log (this->name, GF_LOG_DEBUG, "%s: %d, size: %"PRId64, + local->loc.path, i, quota_size); + if (quota_size > max_quota_size) { + if (max_data) + data_unref (max_data); + + max_quota_size = quota_size; + max_data = data_ref (data); + } + } + + if (max_data) { + ret = dict_set (rsp_dict, QUOTA_SIZE_KEY, max_data); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "%s: Failed to set " + "quota size", local->loc.path); + } + + data_unref (max_data); + } +} + int afr_lookup_build_response_params (afr_local_t *local, xlator_t *this) { - int32_t read_child = -1; struct iatt *buf = NULL; struct iatt *postparent = NULL; dict_t **xattr = NULL; + int32_t *success_children = NULL; + int32_t *sources = NULL; + afr_private_t *priv = NULL; + int32_t read_child = -1; int ret = 0; + int i = 0; GF_ASSERT (local); buf = &local->cont.lookup.buf; postparent = &local->cont.lookup.postparent; xattr = &local->cont.lookup.xattr; + priv = this->private; read_child = afr_inode_get_read_ctx (this, local->cont.lookup.inode, - NULL); + local->fresh_children); + if (read_child < 0) { + ret = -1; + goto out; + } + success_children = local->cont.lookup.success_children; + sources = local->cont.lookup.sources; + memset (sources, 0, sizeof (*sources) * priv->child_count); + afr_children_intersection_get (local->fresh_children, success_children, + sources, priv->child_count); + if (!sources[read_child]) { + read_child = -1; + for (i = 0; i < priv->child_count; i++) { + if (sources[i]) { + read_child = i; + break; + } + } + } if (read_child < 0) { ret = -1; goto out; } + gf_log (this->name, GF_LOG_DEBUG, "Building lookup response from %d", read_child); - *xattr = dict_ref (local->cont.lookup.xattrs[read_child]); + if (!*xattr) + *xattr = dict_ref 
(local->cont.lookup.xattrs[read_child]); + *buf = local->cont.lookup.bufs[read_child]; *postparent = local->cont.lookup.postparents[read_child]; + if (dict_get (local->xattr_req, QUOTA_SIZE_KEY)) + afr_handle_quota_size (local, this, *xattr); + if (IA_INVAL == local->cont.lookup.inode->ia_type) { /* fix for RT #602 */ local->cont.lookup.inode->ia_type = buf->ia_type; @@ -1005,6 +1233,7 @@ afr_lookup_update_lk_counts (afr_local_t *local, xlator_t *this, uint32_t inodelk_count = 0; uint32_t entrylk_count = 0; int ret = -1; + uint32_t parent_entrylk = 0; GF_ASSERT (local); GF_ASSERT (this); @@ -1020,43 +1249,103 @@ afr_lookup_update_lk_counts (afr_local_t *local, xlator_t *this, &entrylk_count); if (ret == 0) local->entrylk_count += entrylk_count; + ret = dict_get_uint32 (xattr, GLUSTERFS_PARENT_ENTRYLK, + &parent_entrylk); + if (!ret) + local->cont.lookup.parent_entrylk += parent_entrylk; } +/* + * It's important to maintain a commutative property on do_*_self_heal and + * found*; once set, they must not be cleared by a subsequent iteration or + * call, so that they represent a logical OR of all iterations and calls + * regardless of child/key order. That allows the caller to call us multiple + * times without having to use a separate variable as a "reduce" accumulator. 
+ */ static void afr_lookup_set_self_heal_params_by_xattr (afr_local_t *local, xlator_t *this, dict_t *xattr) { + afr_private_t *priv = NULL; + int i = 0; + int ret = -1; + void *pending_raw = NULL; + int32_t *pending = NULL; + GF_ASSERT (local); GF_ASSERT (this); GF_ASSERT (xattr); - if (afr_sh_has_metadata_pending (xattr, this)) { - local->self_heal.do_metadata_self_heal = _gf_true; - gf_log(this->name, GF_LOG_DEBUG, - "metadata self-heal is pending for %s.", - local->loc.path); - } + priv = this->private; - if (afr_sh_has_entry_pending (xattr, this)) { - local->self_heal.do_entry_self_heal = _gf_true; - gf_log(this->name, GF_LOG_DEBUG, - "entry self-heal is pending for %s.", local->loc.path); - } + for (i = 0; i < priv->child_count; i++) { + ret = dict_get_ptr (xattr, priv->pending_key[i], + &pending_raw); + if (ret != 0) { + continue; + } + pending = pending_raw; - if (afr_sh_has_data_pending (xattr, this)) { - local->self_heal.do_data_self_heal = _gf_true; - gf_log(this->name, GF_LOG_DEBUG, - "data self-heal is pending for %s.", local->loc.path); + if (pending[AFR_METADATA_TRANSACTION]) { + gf_log(this->name, GF_LOG_DEBUG, + "metadata self-heal is pending for %s.", + local->loc.path); + local->self_heal.do_metadata_self_heal = _gf_true; + } + + if (pending[AFR_ENTRY_TRANSACTION]) { + gf_log(this->name, GF_LOG_DEBUG, + "entry self-heal is pending for %s.", + local->loc.path); + local->self_heal.do_entry_self_heal = _gf_true; + } + + if (pending[AFR_DATA_TRANSACTION]) { + gf_log(this->name, GF_LOG_DEBUG, + "data self-heal is pending for %s.", + local->loc.path); + local->self_heal.do_data_self_heal = _gf_true; + } } } +void +afr_lookup_check_set_metadata_split_brain (afr_local_t *local, xlator_t *this) +{ + int32_t *sources = NULL; + afr_private_t *priv = NULL; + int32_t subvol_status = 0; + int32_t *success_children = NULL; + dict_t **xattrs = NULL; + struct iatt *bufs = NULL; + int32_t **pending_matrix = NULL; + + priv = this->private; + + sources = GF_CALLOC 
(priv->child_count, sizeof (*sources), + gf_afr_mt_int32_t); + if (NULL == sources) + goto out; + success_children = local->cont.lookup.success_children; + xattrs = local->cont.lookup.xattrs; + bufs = local->cont.lookup.bufs; + pending_matrix = local->cont.lookup.pending_matrix; + afr_build_sources (this, xattrs, bufs, pending_matrix, + sources, success_children, AFR_METADATA_TRANSACTION, + &subvol_status, _gf_false); + if (subvol_status & SPLIT_BRAIN) + local->cont.lookup.possible_spb = _gf_true; +out: + GF_FREE (sources); +} + static void afr_detect_self_heal_by_iatt (afr_local_t *local, xlator_t *this, struct iatt *buf, struct iatt *lookup_buf) { if (PERMISSION_DIFFERS (buf, lookup_buf)) { /* mismatching permissions */ - gf_log (this->name, GF_LOG_INFO, + gf_log (this->name, GF_LOG_DEBUG, "permissions differ for %s ", local->loc.path); local->self_heal.do_metadata_self_heal = _gf_true; } @@ -1064,27 +1353,45 @@ afr_detect_self_heal_by_iatt (afr_local_t *local, xlator_t *this, if (OWNERSHIP_DIFFERS (buf, lookup_buf)) { /* mismatching permissions */ local->self_heal.do_metadata_self_heal = _gf_true; - gf_log (this->name, GF_LOG_INFO, + gf_log (this->name, GF_LOG_DEBUG, "ownership differs for %s ", local->loc.path); } if (SIZE_DIFFERS (buf, lookup_buf) && IA_ISREG (buf->ia_type)) { - gf_log (this->name, GF_LOG_INFO, + gf_log (this->name, GF_LOG_DEBUG, "size differs for %s ", local->loc.path); local->self_heal.do_data_self_heal = _gf_true; } if (uuid_compare (buf->ia_gfid, lookup_buf->ia_gfid)) { /* mismatching gfid */ - gf_log (this->name, GF_LOG_WARNING, + gf_log (this->name, GF_LOG_DEBUG, "%s: gfid different on subvolume", local->loc.path); } } static void -afr_detect_self_heal_by_lookup_status (afr_local_t *local, xlator_t *this, - gf_boolean_t split_brain) +afr_detect_self_heal_by_split_brain_status (afr_local_t *local, xlator_t *this) +{ + gf_boolean_t split_brain = _gf_false; + afr_self_heal_t *sh = NULL; + + sh = &local->self_heal; + + split_brain = 
afr_is_split_brain (this, local->cont.lookup.inode); + split_brain = split_brain || local->cont.lookup.possible_spb; + if ((local->success_count > 0) && split_brain && + IA_ISREG (local->cont.lookup.inode->ia_type)) { + sh->force_confirm_spb = _gf_true; + gf_log (this->name, GF_LOG_DEBUG, + "split brain detected during lookup of %s.", + local->loc.path); + } +} + +static void +afr_detect_self_heal_by_lookup_status (afr_local_t *local, xlator_t *this) { GF_ASSERT (local); GF_ASSERT (this); @@ -1095,24 +1402,11 @@ afr_detect_self_heal_by_lookup_status (afr_local_t *local, xlator_t *this, local->self_heal.do_entry_self_heal = _gf_true; local->self_heal.do_gfid_self_heal = _gf_true; local->self_heal.do_missing_entry_self_heal = _gf_true; - gf_log(this->name, GF_LOG_INFO, + gf_log(this->name, GF_LOG_DEBUG, "entries are missing in lookup of %s.", local->loc.path); - //If all self-heals are needed no need to check for other rules - goto out; - } - - if ((local->success_count > 0) && split_brain && - IA_ISREG (local->cont.lookup.inode->ia_type)) { - local->self_heal.do_data_self_heal = _gf_true; - local->self_heal.do_gfid_self_heal = _gf_true; - local->self_heal.do_missing_entry_self_heal = _gf_true; - gf_log (this->name, GF_LOG_WARNING, - "split brain detected during lookup of %s.", - local->loc.path); } -out: return; } @@ -1122,6 +1416,8 @@ afr_can_self_heal_proceed (afr_self_heal_t *sh, afr_private_t *priv) GF_ASSERT (sh); GF_ASSERT (priv); + if (sh->force_confirm_spb) + return _gf_true; return (sh->do_gfid_self_heal || sh->do_missing_entry_self_heal || (afr_data_self_heal_enabled (priv->data_self_heal) && @@ -1155,6 +1451,7 @@ afr_lookup_select_read_child (afr_local_t *local, xlator_t *this, dict_t **xattrs = NULL; int32_t *success_children = NULL; afr_transaction_type type = AFR_METADATA_TRANSACTION; + uuid_t *gfid = NULL; GF_ASSERT (local); GF_ASSERT (this); @@ -1168,8 +1465,9 @@ afr_lookup_select_read_child (afr_local_t *local, xlator_t *this, ia_type = 
local->cont.lookup.bufs[success_children[0]].ia_type; type = afr_transaction_type_get (ia_type); xattrs = local->cont.lookup.xattrs; + gfid = &local->cont.lookup.buf.ia_gfid; source = afr_lookup_select_read_child_by_txn_type (this, local, xattrs, - type); + type, *gfid); if (source < 0) { gf_log (this->name, GF_LOG_DEBUG, "failed to select source " "for %s", local->loc.path); @@ -1197,7 +1495,8 @@ afr_launch_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode, void (*gfid_sh_success_cbk) (call_frame_t *sh_frame, xlator_t *this), int (*unwind) (call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno)) + int32_t op_ret, int32_t op_errno, + int32_t sh_failed)) { afr_local_t *local = NULL; char sh_type_str[256] = {0,}; @@ -1220,19 +1519,19 @@ afr_launch_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode, if (background) bg = "background"; - gf_log (this->name, GF_LOG_INFO, + gf_log (this->name, GF_LOG_DEBUG, "%s %s self-heal triggered. path: %s, reason: %s", bg, sh_type_str, local->loc.path, reason); afr_self_heal (frame, this, inode); } -int +unsigned int afr_gfid_missing_count (const char *xlator_name, int32_t *success_children, struct iatt *bufs, unsigned int child_count, const char *path) { - int gfid_miss_count = 0; + unsigned int gfid_miss_count = 0; int i = 0; struct iatt *child1 = NULL; @@ -1291,7 +1590,7 @@ afr_conflicting_iattrs (struct iatt *bufs, int32_t *success_children, child2 = &bufs[success_children[i-1]]; if (FILETYPE_DIFFERS (child1, child2)) { - gf_log (xlator_name, GF_LOG_WARNING, "%s: filetype " + gf_log (xlator_name, GF_LOG_DEBUG, "%s: filetype " "differs on subvolumes (%d, %d)", path, success_children[i-1], success_children[i]); conflicting = _gf_true; @@ -1300,7 +1599,7 @@ afr_conflicting_iattrs (struct iatt *bufs, int32_t *success_children, if (!gfid || uuid_is_null (child1->ia_gfid)) continue; if (uuid_compare (*gfid, child1->ia_gfid)) { - gf_log (xlator_name, GF_LOG_WARNING, "%s: gfid differs" + gf_log 
(xlator_name, GF_LOG_DEBUG, "%s: gfid differs" " on subvolume %d", path, success_children[i]); conflicting = _gf_true; goto out; @@ -1383,13 +1682,11 @@ afr_lookup_set_self_heal_params (afr_local_t *local, xlator_t *this) int32_t child1 = -1; int32_t child2 = -1; afr_self_heal_t *sh = NULL; - gf_boolean_t split_brain = _gf_false; priv = this->private; sh = &local->self_heal; - split_brain = afr_is_split_brain (this, local->cont.lookup.inode); - afr_detect_self_heal_by_lookup_status (local, this, split_brain); + afr_detect_self_heal_by_lookup_status (local, this); if (afr_lookup_gfid_missing_count (local, this)) local->self_heal.do_gfid_self_heal = _gf_true; @@ -1416,23 +1713,28 @@ afr_lookup_set_self_heal_params (afr_local_t *local, xlator_t *this) afr_lookup_set_self_heal_params_by_xattr (local, this, xattr[child1]); } - if (afr_open_only_data_self_heal (priv->data_self_heal) - && !split_brain) + if (afr_open_only_data_self_heal (priv->data_self_heal)) sh->do_data_self_heal = _gf_false; + if (sh->do_metadata_self_heal) + afr_lookup_check_set_metadata_split_brain (local, this); + afr_detect_self_heal_by_split_brain_status (local, this); } int afr_self_heal_lookup_unwind (call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, + int32_t sh_failed) { afr_local_t *local = NULL; + int ret = -1; + dict_t *xattr = NULL; local = frame->local; if (op_ret == -1) { local->op_ret = -1; - if (afr_error_more_important (local->op_errno, op_errno)) - local->op_errno = op_errno; + local->op_errno = afr_most_important_error(local->op_errno, + op_errno, _gf_true); goto out; } else { @@ -1440,6 +1742,23 @@ afr_self_heal_lookup_unwind (call_frame_t *frame, xlator_t *this, } afr_lookup_done_success_action (frame, this, _gf_true); + xattr = local->cont.lookup.xattr; + if (xattr) { + ret = dict_set_int32 (xattr, "sh-failed", sh_failed); + if (ret) + gf_log (this->name, GF_LOG_ERROR, "%s: Failed to set " + "sh-failed to %d", 
local->loc.path, sh_failed); + + if (local->self_heal.actual_sh_started == _gf_true && + sh_failed == 0) { + ret = dict_set_int32 (xattr, "actual-sh-done", 1); + if (ret) + gf_log(this->name, GF_LOG_ERROR, "%s: Failed to" + " set actual-sh-done to %d", + local->loc.path, + local->self_heal.actual_sh_started); + } + } out: AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, local->cont.lookup.inode, &local->cont.lookup.buf, @@ -1513,7 +1832,8 @@ afr_lookup_perform_self_heal (call_frame_t *frame, xlator_t *this, afr_lookup_set_self_heal_params (local, this); if (afr_can_self_heal_proceed (&local->self_heal, priv)) { - if (afr_is_transaction_running (local)) + if (afr_is_transaction_running (local) && + (!local->allow_sh_for_running_transaction)) goto out; reason = "lookup detected pending operations"; @@ -1574,26 +1894,23 @@ afr_lookup_done_success_action (call_frame_t *frame, xlator_t *this, int32_t read_child = -1; int32_t ret = -1; afr_local_t *local = NULL; - afr_private_t *priv = NULL; + gf_boolean_t fresh_lookup = _gf_false; local = frame->local; - priv = this->private; + fresh_lookup = local->cont.lookup.fresh_lookup; if (local->loc.parent == NULL) fail_conflict = _gf_true; - if (afr_conflicting_iattrs (local->cont.lookup.bufs, - local->cont.lookup.success_children, - priv->child_count, local->loc.path, - this->name)) { + if (afr_lookup_conflicting_entries (local, this)) { if (fail_conflict == _gf_false) ret = 0; goto out; } - if (!afr_is_transaction_running (local)) { - ret = afr_lookup_select_read_child (local, this, &read_child); - if (ret) + ret = afr_lookup_select_read_child (local, this, &read_child); + if (!afr_is_transaction_running (local) || fresh_lookup) { + if (read_child < 0) goto out; ret = afr_lookup_set_read_ctx (local, this, read_child); @@ -1604,11 +1921,9 @@ afr_lookup_done_success_action (call_frame_t *frame, xlator_t *this, ret = afr_lookup_build_response_params (local, this); if (ret) goto out; - if (afr_is_fresh_lookup 
(&local->loc, this)) { - afr_update_loc_gfids (&local->loc, - &local->cont.lookup.buf, - &local->cont.lookup.postparent); - } + afr_update_loc_gfids (&local->loc, + &local->cont.lookup.buf, + &local->cont.lookup.postparent); ret = 0; out: @@ -1619,6 +1934,135 @@ out: return ret; } +int +afr_lookup_get_latest_subvol (afr_local_t *local, xlator_t *this) +{ + afr_private_t *priv = NULL; + int32_t *success_children = NULL; + struct iatt *bufs = NULL; + int i = 0; + int child = 0; + int lsubvol = -1; + + priv = this->private; + success_children = local->cont.lookup.success_children; + bufs = local->cont.lookup.bufs; + for (i = 0; i < priv->child_count; i++) { + child = success_children[i]; + if (child == -1) + break; + if (uuid_is_null (bufs[child].ia_gfid)) + continue; + if (lsubvol < 0) { + lsubvol = child; + } else if (bufs[lsubvol].ia_ctime < bufs[child].ia_ctime) { + lsubvol = child; + } else if ((bufs[lsubvol].ia_ctime == bufs[child].ia_ctime) && + (bufs[lsubvol].ia_ctime_nsec < bufs[child].ia_ctime_nsec)) { + lsubvol = child; + } + } + return lsubvol; +} + +void +afr_lookup_mark_other_entries_stale (afr_local_t *local, xlator_t *this, + int subvol) +{ + afr_private_t *priv = NULL; + int32_t *success_children = NULL; + struct iatt *bufs = NULL; + int i = 0; + int child = 0; + + priv = this->private; + success_children = local->cont.lookup.success_children; + bufs = local->cont.lookup.bufs; + memcpy (local->fresh_children, success_children, + sizeof (*success_children) * priv->child_count); + for (i = 0; i < priv->child_count; i++) { + child = local->fresh_children[i]; + if (child == -1) + break; + if (child == subvol) + continue; + if (uuid_is_null (bufs[child].ia_gfid) && + (bufs[child].ia_type == bufs[subvol].ia_type)) + continue; + afr_children_rm_child (success_children, child, + priv->child_count); + local->success_count--; + } + afr_reset_children (local->fresh_children, priv->child_count); +} + +void +afr_succeed_lookup_on_latest_iatt (afr_local_t *local, 
xlator_t *this) +{ + int lsubvol = 0; + + if (!afr_lookup_conflicting_entries (local, this)) + goto out; + + lsubvol = afr_lookup_get_latest_subvol (local, this); + if (lsubvol < 0) + goto out; + afr_lookup_mark_other_entries_stale (local, this, lsubvol); +out: + return; +} + +gf_boolean_t +afr_is_entry_possibly_under_creation (afr_local_t *local, xlator_t *this) +{ + /* + * We need to perform this test in lookup done and treat on going + * create/DELETE as ENOENT. + * Reason: + Multiple clients A, B and C are attempting 'mkdir -p /mnt/a/b/c' + + 1 Client A is in the middle of mkdir(/a). It has acquired lock. + It has performed mkdir(/a) on one subvol, and second one is still + in progress + 2 Client B performs a lookup, sees directory /a on one, + ENOENT on the other, succeeds lookup. + 3 Client B performs lookup on /a/b on both subvols, both return ENOENT + (one subvol because /a/b does not exist, another because /a + itself does not exist) + 4 Client B proceeds to mkdir /a/b. It obtains entrylk on inode=/a with + basename=b on one subvol, but fails on other subvol as /a is yet to + be created by Client A. + 5 Client A finishes mkdir of /a on other subvol + 6 Client C also attempts to create /a/b, lookup returns ENOENT on + both subvols. + 7 Client C tries to obtain entrylk on on inode=/a with basename=b, + obtains on one subvol (where B had failed), and waits for B to unlock + on other subvol. + 8 Client B finishes mkdir() on one subvol with GFID-1 and completes + transaction and unlocks + 9 Client C gets the lock on the second subvol, At this stage second + subvol already has /a/b created from Client B, but Client C does not + check that in the middle of mkdir transaction + 10 Client C attempts mkdir /a/b on both subvols. It succeeds on + ONLY ONE (where Client B could not get lock because of + missing parent /a dir) with GFID-2, and gets EEXIST from ONE subvol. + This way we have /a/b in GFID mismatch. 
One subvol got GFID-1 because + Client B performed transaction on only one subvol (because entrylk() + could not be obtained on second subvol because of missing parent dir -- + caused by premature/speculative succeeding of lookup() on /a when locks + are detected). Other subvol gets GFID-2 from Client C because while + it was waiting for entrylk() on both subvols, Client B was in the + middle of creating mkdir() on only one subvol, and Client C does not + "expect" this when it is between lock() and pre-op()/op() phase of the + transaction. + */ + if (local->cont.lookup.parent_entrylk && local->enoent_count) + return _gf_true; + + return _gf_false; +} + + static void afr_lookup_done (call_frame_t *frame, xlator_t *this) { @@ -1627,6 +2071,7 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this) afr_local_t *local = NULL; int ret = -1; gf_boolean_t sh_launched = _gf_false; + gf_boolean_t fail_conflict = _gf_false; int gfid_miss_count = 0; int enotconn_count = 0; int up_children_count = 0; @@ -1634,8 +2079,18 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this) priv = this->private; local = frame->local; + if (afr_is_entry_possibly_under_creation (local, this)) { + local->op_ret = -1; + local->op_errno = ENOENT; + goto unwind; + } + if (local->op_ret < 0) goto unwind; + + if (local->cont.lookup.parent_entrylk && local->success_count > 1) + afr_succeed_lookup_on_latest_iatt (local, this); + gfid_miss_count = afr_lookup_gfid_missing_count (local, this); up_children_count = afr_up_children_count (local->child_up, priv->child_count); @@ -1650,7 +2105,18 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this) goto unwind; } - ret = afr_lookup_done_success_action (frame, this, _gf_false); + if ((gfid_miss_count == local->success_count) && + uuid_is_null (local->cont.lookup.gfid_req)) { + local->op_ret = -1; + local->op_errno = ENODATA; + gf_log (this->name, GF_LOG_ERROR, "%s: No gfid present", + local->loc.path); + goto unwind; + } + + if (gfid_miss_count && 
uuid_is_null (local->cont.lookup.gfid_req)) + fail_conflict = _gf_true; + ret = afr_lookup_done_success_action (frame, this, fail_conflict); if (ret) goto unwind; uuid_copy (local->self_heal.sh_gfid_req, local->cont.lookup.gfid_req); @@ -1676,24 +2142,20 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this) * others in that they must be given higher priority while * returning to the user. * - * The hierarchy is ESTALE > ENOENT > others - * + * The hierarchy is ESTALE > EIO > ENOENT > others */ - -gf_boolean_t -afr_error_more_important (int32_t old_errno, int32_t new_errno) +int32_t +afr_most_important_error(int32_t old_errno, int32_t new_errno, + gf_boolean_t eio) { - gf_boolean_t ret = _gf_true; - - /* Nothing should ever overwrite ESTALE */ - if (old_errno == ESTALE) - ret = _gf_false; - - /* Nothing should overwrite ENOENT, except ESTALE */ - else if ((old_errno == ENOENT) && (new_errno != ESTALE)) - ret = _gf_false; - - return ret; + if (old_errno == ESTALE || new_errno == ESTALE) + return ESTALE; + if (eio && (old_errno == EIO || new_errno == EIO)) + return EIO; + if (old_errno == ENOENT || new_errno == ENOENT) + return ENOENT; + + return new_errno; } int32_t @@ -1712,8 +2174,9 @@ afr_resultant_errno_get (int32_t *children, } else { child = i; } - if (afr_error_more_important (op_errno, child_errno[child])) - op_errno = child_errno[child]; + op_errno = afr_most_important_error(op_errno, + child_errno[child], + _gf_false); } return op_errno; } @@ -1725,8 +2188,8 @@ afr_lookup_handle_error (afr_local_t *local, int32_t op_ret, int32_t op_errno) if (op_errno == ENOENT) local->enoent_count++; - if (afr_error_more_important (local->op_errno, op_errno)) - local->op_errno = op_errno; + local->op_errno = afr_most_important_error(local->op_errno, op_errno, + _gf_false); if (local->op_errno == ESTALE) { local->op_ret = -1; @@ -1740,7 +2203,7 @@ afr_set_root_inode_on_first_lookup (afr_local_t *local, xlator_t *this, afr_private_t *priv = NULL; GF_ASSERT (inode); - if 
(inode->ino != 1) + if (!__is_root_gfid (inode->gfid)) goto out; if (!afr_is_fresh_lookup (&local->loc, this)) goto out; @@ -1773,12 +2236,79 @@ afr_lookup_handle_first_success (afr_local_t *local, xlator_t *this, afr_set_root_inode_on_first_lookup (local, this, inode); } +static int32_t +afr_discovery_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) +{ + int ret = 0; + char *pathinfo = NULL; + gf_boolean_t is_local = _gf_false; + afr_private_t *priv = NULL; + int32_t child_index = -1; + + if (op_ret != 0) { + goto out; + } + + ret = dict_get_str (dict, GF_XATTR_PATHINFO_KEY, &pathinfo); + if (ret != 0) { + goto out; + } + + ret = afr_local_pathinfo (pathinfo, &is_local); + if (ret) { + goto out; + } + + priv = this->private; + /* + * Note that one local subvolume will override another here. The only + * way to avoid that would be to retain extra information about whether + * the previous read_child is local, and it's just not worth it. Even + * the slowest local subvolume is far preferable to a remote one. 
+ */ + if (is_local) { + child_index = (int32_t)(long)cookie; + gf_log (this->name, GF_LOG_INFO, + "selecting local read_child %s", + priv->children[child_index]->name); + priv->read_child = child_index; + } + +out: + STACK_DESTROY(frame->root); + return 0; +} + +static void +afr_attempt_local_discovery (xlator_t *this, int32_t child_index) +{ + call_frame_t *newframe = NULL; + loc_t tmploc = {0,}; + afr_private_t *priv = this->private; + + newframe = create_frame(this,this->ctx->pool); + if (!newframe) { + return; + } + + tmploc.gfid[sizeof(tmploc.gfid)-1] = 1; + STACK_WIND_COOKIE (newframe, afr_discovery_cbk, + (void *)(long)child_index, + priv->children[child_index], + priv->children[child_index]->fops->getxattr, + &tmploc, GF_XATTR_PATHINFO_KEY, NULL); +} + static void afr_lookup_handle_success (afr_local_t *local, xlator_t *this, int32_t child_index, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *buf, dict_t *xattr, struct iatt *postparent) { + afr_private_t *priv = this->private; + if (local->success_count == 0) { if (local->op_errno != ESTALE) { local->op_ret = op_ret; @@ -1791,6 +2321,11 @@ afr_lookup_handle_success (afr_local_t *local, xlator_t *this, int32_t child_ind afr_lookup_cache_args (local, child_index, xattr, buf, postparent); + + if (local->do_discovery && (priv->read_child == (-1))) { + afr_attempt_local_discovery(this,child_index); + } + local->cont.lookup.success_children[local->success_count] = child_index; local->success_count++; } @@ -1837,6 +2372,8 @@ afr_lookup_cont_init (afr_local_t *local, unsigned int child_count) int ret = -ENOMEM; struct iatt *iatts = NULL; int32_t *success_children = NULL; + int32_t *sources = NULL; + int32_t **pending_matrix = NULL; GF_ASSERT (local); local->cont.lookup.xattrs = GF_CALLOC (child_count, @@ -1864,6 +2401,16 @@ afr_lookup_cont_init (afr_local_t *local, unsigned int child_count) if (NULL == local->fresh_children) goto out; + sources = GF_CALLOC (sizeof (*sources), child_count, 
gf_afr_mt_int32_t); + if (NULL == sources) + goto out; + local->cont.lookup.sources = sources; + + pending_matrix = afr_matrix_create (child_count, child_count); + if (NULL == pending_matrix) + goto out; + local->cont.lookup.pending_matrix = pending_matrix; + ret = 0; out: return ret; @@ -1881,42 +2428,53 @@ afr_lookup (call_frame_t *frame, xlator_t *this, int call_count = 0; uint64_t ctx = 0; int32_t op_errno = 0; - + int allow_sh = 0; priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (local, out); local->op_ret = -1; frame->local = local; local->fop = GF_FOP_LOOKUP; - if (!strcmp (loc->path, "/" GF_REPLICATE_TRASH_DIR)) { - op_errno = ENOENT; + loc_copy (&local->loc, loc); + ret = loc_path (&local->loc, NULL); + if (ret < 0) { + op_errno = EINVAL; goto out; } - loc_copy (&local->loc, loc); + if (local->loc.path && + (strcmp (local->loc.path, "/" GF_REPLICATE_TRASH_DIR) == 0)) { + op_errno = EPERM; + ret = -1; + goto out; + } - ret = inode_ctx_get (loc->inode, this, &ctx); + ret = inode_ctx_get (local->loc.inode, this, &ctx); if (ret == 0) { /* lookup is a revalidate */ local->read_child_index = afr_inode_get_read_ctx (this, - loc->inode, - NULL); + local->loc.inode, + NULL); } else { LOCK (&priv->read_child_lock); { - local->read_child_index = (++priv->read_child_rr) - % (priv->child_count); + if (priv->hash_mode) { + local->read_child_index = -1; + } + else { + local->read_child_index = + (++priv->read_child_rr) % + (priv->child_count); + } } UNLOCK (&priv->read_child_lock); + local->cont.lookup.fresh_lookup = _gf_true; } - if (loc->parent) - local->cont.lookup.parent_ino = loc->parent->ino; - local->child_up = memdup (priv->child_up, sizeof (*local->child_up) * priv->child_count); if (NULL == local->child_up) { @@ -1933,7 +2491,6 @@ afr_lookup (call_frame_t *frame, xlator_t *this, local->call_count = afr_up_children_count (local->child_up, priv->child_count); call_count = local->call_count; - if (local->call_count == 0) 
{ ret = -1; op_errno = ENOTCONN; @@ -1943,42 +2500,33 @@ afr_lookup (call_frame_t *frame, xlator_t *this, /* By default assume ENOTCONN. On success it will be set to 0. */ local->op_errno = ENOTCONN; - if (xattr_req == NULL) - local->xattr_req = dict_new (); - else - local->xattr_req = dict_ref (xattr_req); + ret = dict_get_int32 (xattr_req, "allow-sh-for-running-transaction", + &allow_sh); + dict_del (xattr_req, "allow-sh-for-running-transaction"); + local->allow_sh_for_running_transaction = allow_sh; - afr_xattr_req_prepare (this, local->xattr_req, loc->path); - ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_INODELK_COUNT, 0); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "%s: Unable to set dict value for %s", - loc->path, GLUSTERFS_INODELK_COUNT); - } - ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_ENTRYLK_COUNT, 0); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "%s: Unable to set dict value for %s", - loc->path, GLUSTERFS_ENTRYLK_COUNT); - } - - ret = dict_get_ptr (local->xattr_req, "gfid-req", &gfid_req); + ret = afr_lookup_xattr_req_prepare (local, this, xattr_req, &local->loc, + &gfid_req); if (ret) { - gf_log (this->name, GF_LOG_DEBUG, - "failed to get the gfid from dict"); - } else { - uuid_copy (local->cont.lookup.gfid_req, gfid_req); - if (local->loc.parent) - dict_del (local->xattr_req, "gfid-req"); + local->op_errno = -ret; + goto out; + } + afr_lookup_save_gfid (local->cont.lookup.gfid_req, gfid_req, + &local->loc); + local->fop = GF_FOP_LOOKUP; + if (priv->choose_local && !priv->did_discovery) { + if (gfid_req && __is_root_gfid(gfid_req)) { + local->do_discovery = _gf_true; + priv->did_discovery = _gf_true; + } } - for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND_COOKIE (frame, afr_lookup_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->lookup, - loc, local->xattr_req); + &local->loc, local->xattr_req); if (!--call_count) break; } @@ -1986,7 +2534,7 @@ afr_lookup 
(call_frame_t *frame, xlator_t *this, ret = 0; out: - if (ret == -1) + if (ret) AFR_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); @@ -1997,7 +2545,7 @@ out: /* {{{ open */ int -afr_fd_ctx_set (xlator_t *this, fd_t *fd) +__afr_fd_ctx_set (xlator_t *this, fd_t *fd) { afr_private_t * priv = NULL; int ret = -1; @@ -2009,220 +2557,167 @@ afr_fd_ctx_set (xlator_t *this, fd_t *fd) priv = this->private; - LOCK (&fd->lock); - { - ret = __fd_ctx_get (fd, this, &ctx); + ret = __fd_ctx_get (fd, this, &ctx); - if (ret == 0) - goto unlock; + if (ret == 0) + goto out; - fd_ctx = GF_CALLOC (1, sizeof (afr_fd_ctx_t), - gf_afr_mt_afr_fd_ctx_t); - if (!fd_ctx) { - ret = -ENOMEM; - goto unlock; - } + fd_ctx = GF_CALLOC (1, sizeof (afr_fd_ctx_t), + gf_afr_mt_afr_fd_ctx_t); + if (!fd_ctx) { + ret = -ENOMEM; + goto out; + } - fd_ctx->pre_op_done = GF_CALLOC (sizeof (*fd_ctx->pre_op_done), - priv->child_count, - gf_afr_mt_char); - if (!fd_ctx->pre_op_done) { - ret = -ENOMEM; - goto unlock; - } + fd_ctx->pre_op_done = GF_CALLOC (sizeof (*fd_ctx->pre_op_done), + priv->child_count, + gf_afr_mt_char); + if (!fd_ctx->pre_op_done) { + ret = -ENOMEM; + goto out; + } - fd_ctx->pre_op_piggyback = GF_CALLOC (sizeof (*fd_ctx->pre_op_piggyback), - priv->child_count, - gf_afr_mt_char); - if (!fd_ctx->pre_op_piggyback) { - ret = -ENOMEM; - goto unlock; - } + fd_ctx->pre_op_piggyback = GF_CALLOC (sizeof (*fd_ctx->pre_op_piggyback), + priv->child_count, + gf_afr_mt_char); + if (!fd_ctx->pre_op_piggyback) { + ret = -ENOMEM; + goto out; + } - fd_ctx->opened_on = GF_CALLOC (sizeof (*fd_ctx->opened_on), - priv->child_count, - gf_afr_mt_int32_t); - if (!fd_ctx->opened_on) { - ret = -ENOMEM; - goto unlock; - } + fd_ctx->opened_on = GF_CALLOC (sizeof (*fd_ctx->opened_on), + priv->child_count, + gf_afr_mt_int32_t); + if (!fd_ctx->opened_on) { + ret = -ENOMEM; + goto out; + } - fd_ctx->lock_piggyback = GF_CALLOC (sizeof (*fd_ctx->lock_piggyback), - priv->child_count, - gf_afr_mt_char); - if 
(!fd_ctx->lock_piggyback) { - ret = -ENOMEM; - goto unlock; - } + fd_ctx->lock_piggyback = GF_CALLOC (sizeof (*fd_ctx->lock_piggyback), + priv->child_count, + gf_afr_mt_char); + if (!fd_ctx->lock_piggyback) { + ret = -ENOMEM; + goto out; + } - fd_ctx->lock_acquired = GF_CALLOC (sizeof (*fd_ctx->lock_acquired), - priv->child_count, - gf_afr_mt_char); - if (!fd_ctx->lock_acquired) { - ret = -ENOMEM; - goto unlock; - } + fd_ctx->lock_acquired = GF_CALLOC (sizeof (*fd_ctx->lock_acquired), + priv->child_count, + gf_afr_mt_char); + if (!fd_ctx->lock_acquired) { + ret = -ENOMEM; + goto out; + } - fd_ctx->up_count = priv->up_count; - fd_ctx->down_count = priv->down_count; + fd_ctx->up_count = priv->up_count; + fd_ctx->down_count = priv->down_count; - fd_ctx->locked_on = GF_CALLOC (sizeof (*fd_ctx->locked_on), - priv->child_count, - gf_afr_mt_char); - if (!fd_ctx->locked_on) { - ret = -ENOMEM; - goto unlock; - } + fd_ctx->locked_on = GF_CALLOC (sizeof (*fd_ctx->locked_on), + priv->child_count, + gf_afr_mt_char); + if (!fd_ctx->locked_on) { + ret = -ENOMEM; + goto out; + } - INIT_LIST_HEAD (&fd_ctx->paused_calls); - INIT_LIST_HEAD (&fd_ctx->entries); + pthread_mutex_init (&fd_ctx->delay_lock, NULL); + INIT_LIST_HEAD (&fd_ctx->entries); + fd_ctx->call_child = -1; - ret = __fd_ctx_set (fd, this, (uint64_t)(long) fd_ctx); - if (ret) - gf_log (this->name, GF_LOG_DEBUG, - "failed to set fd ctx (%p)", fd); - } -unlock: - UNLOCK (&fd->lock); + INIT_LIST_HEAD (&fd_ctx->eager_locked); + + ret = __fd_ctx_set (fd, this, (uint64_t)(long) fd_ctx); + if (ret) + gf_log (this->name, GF_LOG_DEBUG, + "failed to set fd ctx (%p)", fd); out: return ret; } -/* {{{ flush */ int -afr_flush_unwind (call_frame_t *frame, xlator_t *this) +afr_fd_ctx_set (xlator_t *this, fd_t *fd) { - afr_local_t * local = NULL; - call_frame_t *main_frame = NULL; - - local = frame->local; + int ret = -1; - LOCK (&frame->lock); + LOCK (&fd->lock); { - if (local->transaction.main_frame) - main_frame = 
local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (flush, main_frame, - local->op_ret, local->op_errno); + ret = __afr_fd_ctx_set (this, fd); } + UNLOCK (&fd->lock); - return 0; + return ret; } +/* {{{ flush */ int -afr_flush_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) +afr_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t * local = NULL; - afr_private_t * priv = NULL; int call_count = -1; - int child_index = (long) cookie; - int need_unwind = 0; local = frame->local; - priv = this->private; LOCK (&frame->lock); { - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - if (op_ret != -1) { if (local->success_count == 0) { local->op_ret = op_ret; } local->success_count++; - - if (local->success_count == priv->wait_count) { - need_unwind = 1; - } } local->op_errno = op_errno; } UNLOCK (&frame->lock); - if (need_unwind) - afr_flush_unwind (frame, this); + call_count = afr_frame_return (frame); - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.resume (frame, this); - } + if (call_count == 0) + AFR_STACK_UNWIND(flush, frame, local->op_ret, + local->op_errno, NULL); return 0; } - -int -afr_flush_wind (call_frame_t *frame, xlator_t *this) +static int +afr_flush_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int i = 0; - int call_count = -1; + int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; - local = frame->local; priv = this->private; - - call_count = afr_up_children_count (local->child_up, priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; + local = 
frame->local; + call_count = local->call_count; for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_flush_wind_cbk, + STACK_WIND_COOKIE (frame, afr_flush_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->flush, - local->fd); - + local->fd, NULL); if (!--call_count) break; + } } return 0; } - int -afr_flush_done (call_frame_t *frame, xlator_t *this) +afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - afr_local_t *local = NULL; - - local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - - return 0; -} - - -int -afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - int ret = -1; - int op_ret = -1; - int op_errno = 0; - int call_count = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + call_stub_t *stub = NULL; + int ret = -1; + int op_errno = 0; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -2230,53 +2725,27 @@ afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; - call_count = afr_up_children_count (local->child_up, priv->child_count); + ret = afr_local_init(local, priv, &op_errno); + if (ret < 0) + goto out; - transaction_frame = copy_frame (frame); - if (!transaction_frame) { + local->fd = fd_ref(fd); + stub = fop_flush_stub (frame, afr_flush_wrapper, fd, xdata); + if (!stub) { + ret = -1; op_errno = ENOMEM; goto out; } - transaction_frame->local = local; - - local->op = GF_FOP_FLUSH; - - local->transaction.fop = afr_flush_wind; - local->transaction.done = afr_flush_done; - local->transaction.unwind = afr_flush_unwind; - - local->fd = fd_ref (fd); - - 
local->transaction.main_frame = frame; - local->transaction.start = 0; - local->transaction.len = 0; - - ret = afr_open_fd_fix (transaction_frame, this, _gf_false); - if (ret) { - op_ret = -1; - op_errno = -ret; - goto out; - } - afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); - + afr_delayed_changelog_wake_resume (this, fd, stub); + ret = 0; - op_ret = 0; out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - - AFR_STACK_UNWIND (flush, frame, op_ret, op_errno); - } + if (ret < 0) + AFR_STACK_UNWIND(flush, frame, -1, op_errno, NULL); return 0; } @@ -2290,8 +2759,6 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd) uint64_t ctx = 0; afr_fd_ctx_t *fd_ctx = NULL; int ret = 0; - afr_fd_paused_call_t *paused_call = NULL; - afr_fd_paused_call_t *tmp = NULL; ret = fd_ctx_get (fd, this, &ctx); if (ret < 0) @@ -2300,28 +2767,18 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd) fd_ctx = (afr_fd_ctx_t *)(long) ctx; if (fd_ctx) { - if (fd_ctx->pre_op_done) - GF_FREE (fd_ctx->pre_op_done); + GF_FREE (fd_ctx->pre_op_done); - if (fd_ctx->opened_on) - GF_FREE (fd_ctx->opened_on); + GF_FREE (fd_ctx->opened_on); - if (fd_ctx->locked_on) - GF_FREE (fd_ctx->locked_on); + GF_FREE (fd_ctx->locked_on); - if (fd_ctx->pre_op_piggyback) - GF_FREE (fd_ctx->pre_op_piggyback); - list_for_each_entry_safe (paused_call, tmp, &fd_ctx->paused_calls, - call_list) { - list_del_init (&paused_call->call_list); - GF_FREE (paused_call); - } + GF_FREE (fd_ctx->pre_op_piggyback); + GF_FREE (fd_ctx->lock_piggyback); - if (fd_ctx->lock_piggyback) - GF_FREE (fd_ctx->lock_piggyback); + GF_FREE (fd_ctx->lock_acquired); - if (fd_ctx->lock_acquired) - GF_FREE (fd_ctx->lock_acquired); + pthread_mutex_destroy (&fd_ctx->delay_lock); GF_FREE (fd_ctx); } @@ -2359,14 +2816,25 @@ afr_release (xlator_t *this, fd_t *fd) /* {{{ fsync */ int +afr_fsync_unwind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, 
+ struct iatt *postbuf, dict_t *xdata) +{ + AFR_STACK_UNWIND (fsync, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; +} + +int afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) + struct iatt *postbuf, dict_t *xdata) { afr_local_t *local = NULL; int call_count = -1; int child_index = (long) cookie; int read_child = 0; + call_stub_t *stub = NULL; local = frame->local; @@ -2382,13 +2850,13 @@ afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->op_ret = 0; if (local->success_count == 0) { - local->cont.fsync.prebuf = *prebuf; - local->cont.fsync.postbuf = *postbuf; + local->cont.inode_wfop.prebuf = *prebuf; + local->cont.inode_wfop.postbuf = *postbuf; } if (child_index == read_child) { - local->cont.fsync.prebuf = *prebuf; - local->cont.fsync.postbuf = *postbuf; + local->cont.inode_wfop.prebuf = *prebuf; + local->cont.inode_wfop.postbuf = *postbuf; } local->success_count++; @@ -2401,9 +2869,32 @@ afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, call_count = afr_frame_return (frame); if (call_count == 0) { - AFR_STACK_UNWIND (fsync, frame, local->op_ret, local->op_errno, - &local->cont.fsync.prebuf, - &local->cont.fsync.postbuf); + /* Make a stub out of the frame, and register it + with the waking up post-op. When the call-stub resumes, + we are guaranteed that there was no post-op pending + (i.e changelogs were unset in the server). This is an + essential "guarantee", that fsync() returns only after + completely finishing EVERYTHING, including the delayed + post-op. This guarantee is expected by FUSE graph switching + for example. 
+ */ + stub = fop_fsync_cbk_stub (frame, afr_fsync_unwind_cbk, + local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, + xdata); + if (!stub) { + AFR_STACK_UNWIND (fsync, frame, -1, ENOMEM, 0, 0, 0); + return 0; + } + + /* If no new unstable writes happened between the + time we cleared the unstable write witness flag in afr_fsync + and now, calling afr_delayed_changelog_wake_up() should + wake up and skip over the fsync phase and go straight to + afr_changelog_post_op_now() + */ + afr_delayed_changelog_wake_resume (this, local->fd, stub); } return 0; @@ -2412,14 +2903,13 @@ afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t datasync) + int32_t datasync, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_ret = -1; int32_t op_errno = 0; VALIDATE_OR_GOTO (frame, out); @@ -2428,19 +2918,20 @@ afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } call_count = local->call_count; - frame->local = local; local->fd = fd_ref (fd); - local->cont.fsync.ino = fd->inode->ino; + + if (afr_fd_has_witnessed_unstable_write (this, fd)) { + /* don't care. 
we only wanted to CLEAR the bit */ + } for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { @@ -2448,17 +2939,16 @@ afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, (void *) (long) i, priv->children[i], priv->children[i]->fops->fsync, - fd, datasync); + fd, datasync, xdata); if (!--call_count) break; } } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (fsync, frame, op_ret, op_errno, NULL, NULL); - } + if (ret < 0) + AFR_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL, NULL); return 0; } @@ -2468,7 +2958,8 @@ out: int32_t afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *xdata) { afr_local_t *local = NULL; int call_count = -1; @@ -2488,7 +2979,7 @@ afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, if (call_count == 0) AFR_STACK_UNWIND (fsyncdir, frame, local->op_ret, - local->op_errno); + local->op_errno, xdata); return 0; } @@ -2496,14 +2987,13 @@ afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, int32_t afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t datasync) + int32_t datasync, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_ret = -1; int32_t op_errno = 0; VALIDATE_OR_GOTO (frame, out); @@ -2512,33 +3002,30 @@ afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } call_count = local->call_count; - frame->local = local; for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND (frame, afr_fsyncdir_cbk, priv->children[i], priv->children[i]->fops->fsyncdir, - fd, datasync); + 
fd, datasync, xdata); if (!--call_count) break; } } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (fsyncdir, frame, op_ret, op_errno); - } + if (ret < 0) + AFR_STACK_UNWIND (fsyncdir, frame, -1, op_errno, NULL); return 0; } @@ -2549,7 +3036,7 @@ out: int32_t afr_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *xattr) + dict_t *xattr, dict_t *xdata) { afr_local_t *local = NULL; int call_count = -1; @@ -2558,8 +3045,11 @@ afr_xattrop_cbk (call_frame_t *frame, void *cookie, LOCK (&frame->lock); { - if (op_ret == 0) + if (op_ret == 0) { + if (!local->cont.xattrop.xattr) + local->cont.xattrop.xattr = dict_ref (xattr); local->op_ret = 0; + } local->op_errno = op_errno; } @@ -2569,7 +3059,7 @@ afr_xattrop_cbk (call_frame_t *frame, void *cookie, if (call_count == 0) AFR_STACK_UNWIND (xattrop, frame, local->op_ret, local->op_errno, - xattr); + local->cont.xattrop.xattr, xdata); return 0; } @@ -2577,14 +3067,13 @@ afr_xattrop_cbk (call_frame_t *frame, void *cookie, int32_t afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, - gf_xattrop_flags_t optype, dict_t *xattr) + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_ret = -1; int32_t op_errno = 0; VALIDATE_OR_GOTO (frame, out); @@ -2593,33 +3082,30 @@ afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } call_count = local->call_count; - frame->local = local; for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND (frame, afr_xattrop_cbk, priv->children[i], priv->children[i]->fops->xattrop, - 
loc, optype, xattr); + loc, optype, xattr, xdata); if (!--call_count) break; } } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (xattrop, frame, op_ret, op_errno, NULL); - } + if (ret < 0) + AFR_STACK_UNWIND (xattrop, frame, -1, op_errno, NULL, NULL); return 0; } @@ -2630,7 +3116,7 @@ out: int32_t afr_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *xattr) + dict_t *xattr, dict_t *xdata) { afr_local_t *local = NULL; @@ -2640,8 +3126,12 @@ afr_fxattrop_cbk (call_frame_t *frame, void *cookie, LOCK (&frame->lock); { - if (op_ret == 0) + if (op_ret == 0) { + if (!local->cont.fxattrop.xattr) + local->cont.fxattrop.xattr = dict_ref (xattr); + local->op_ret = 0; + } local->op_errno = op_errno; } @@ -2651,7 +3141,7 @@ afr_fxattrop_cbk (call_frame_t *frame, void *cookie, if (call_count == 0) AFR_STACK_UNWIND (fxattrop, frame, local->op_ret, local->op_errno, - xattr); + local->cont.fxattrop.xattr, xdata); return 0; } @@ -2659,14 +3149,13 @@ afr_fxattrop_cbk (call_frame_t *frame, void *cookie, int32_t afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, - gf_xattrop_flags_t optype, dict_t *xattr) + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_ret = -1; int32_t op_errno = 0; VALIDATE_OR_GOTO (frame, out); @@ -2675,33 +3164,30 @@ afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } call_count = local->call_count; - frame->local = local; for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND (frame, afr_fxattrop_cbk, priv->children[i], 
priv->children[i]->fops->fxattrop, - fd, optype, xattr); + fd, optype, xattr, xdata); if (!--call_count) break; } } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (fxattrop, frame, op_ret, op_errno, NULL); - } + if (ret < 0) + AFR_STACK_UNWIND (fxattrop, frame, -1, op_errno, NULL, NULL); return 0; } @@ -2710,7 +3196,7 @@ out: int32_t afr_inodelk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) + xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t *local = NULL; @@ -2731,7 +3217,7 @@ afr_inodelk_cbk (call_frame_t *frame, void *cookie, if (call_count == 0) AFR_STACK_UNWIND (inodelk, frame, local->op_ret, - local->op_errno); + local->op_errno, xdata); return 0; } @@ -2739,14 +3225,14 @@ afr_inodelk_cbk (call_frame_t *frame, void *cookie, int32_t afr_inodelk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, int32_t cmd, struct gf_flock *flock) + const char *volume, loc_t *loc, int32_t cmd, + struct gf_flock *flock, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_ret = -1; int32_t op_errno = 0; VALIDATE_OR_GOTO (frame, out); @@ -2755,41 +3241,39 @@ afr_inodelk (call_frame_t *frame, xlator_t *this, priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } call_count = local->call_count; - frame->local = local; for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND (frame, afr_inodelk_cbk, priv->children[i], priv->children[i]->fops->inodelk, - volume, loc, cmd, flock); + volume, loc, cmd, flock, xdata); if (!--call_count) break; } } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (inodelk, frame, 
op_ret, op_errno); - } + if (ret < 0) + AFR_STACK_UNWIND (inodelk, frame, -1, op_errno, NULL); return 0; } int32_t afr_finodelk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *xdata) { afr_local_t *local = NULL; @@ -2810,7 +3294,7 @@ afr_finodelk_cbk (call_frame_t *frame, void *cookie, if (call_count == 0) AFR_STACK_UNWIND (finodelk, frame, local->op_ret, - local->op_errno); + local->op_errno, xdata); return 0; } @@ -2818,14 +3302,14 @@ afr_finodelk_cbk (call_frame_t *frame, void *cookie, int32_t afr_finodelk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *flock) + const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *flock, + dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_ret = -1; int32_t op_errno = 0; VALIDATE_OR_GOTO (frame, out); @@ -2834,42 +3318,38 @@ afr_finodelk (call_frame_t *frame, xlator_t *this, priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } call_count = local->call_count; - frame->local = local; for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND (frame, afr_finodelk_cbk, priv->children[i], priv->children[i]->fops->finodelk, - volume, fd, cmd, flock); + volume, fd, cmd, flock, xdata); if (!--call_count) break; } } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (finodelk, frame, op_ret, op_errno); - } + if (ret < 0) + AFR_STACK_UNWIND (finodelk, frame, -1, op_errno, NULL); return 0; } int32_t -afr_entrylk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) - +afr_entrylk_cbk 
(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t *local = NULL; int call_count = -1; @@ -2889,7 +3369,7 @@ afr_entrylk_cbk (call_frame_t *frame, void *cookie, if (call_count == 0) AFR_STACK_UNWIND (entrylk, frame, local->op_ret, - local->op_errno); + local->op_errno, xdata); return 0; } @@ -2898,14 +3378,14 @@ afr_entrylk_cbk (call_frame_t *frame, void *cookie, int32_t afr_entrylk (call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, - const char *basename, entrylk_cmd cmd, entrylk_type type) + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_ret = -1; int32_t op_errno = 0; VALIDATE_OR_GOTO (frame, out); @@ -2914,34 +3394,31 @@ afr_entrylk (call_frame_t *frame, xlator_t *this, priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } call_count = local->call_count; - frame->local = local; for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND (frame, afr_entrylk_cbk, priv->children[i], priv->children[i]->fops->entrylk, - volume, loc, basename, cmd, type); + volume, loc, basename, cmd, type, xdata); if (!--call_count) break; } } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (entrylk, frame, op_ret, op_errno); - } + if (ret < 0) + AFR_STACK_UNWIND (entrylk, frame, -1, op_errno, NULL); return 0; } @@ -2949,7 +3426,7 @@ out: int32_t afr_fentrylk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) + xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t *local = NULL; @@ -2970,7 +3447,7 @@ afr_fentrylk_cbk 
(call_frame_t *frame, void *cookie, if (call_count == 0) AFR_STACK_UNWIND (fentrylk, frame, local->op_ret, - local->op_errno); + local->op_errno, xdata); return 0; } @@ -2979,14 +3456,14 @@ afr_fentrylk_cbk (call_frame_t *frame, void *cookie, int32_t afr_fentrylk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, - const char *basename, entrylk_cmd cmd, entrylk_type type) + const char *basename, entrylk_cmd cmd, + entrylk_type type, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_ret = -1; int32_t op_errno = 0; VALIDATE_OR_GOTO (frame, out); @@ -2995,41 +3472,38 @@ afr_fentrylk (call_frame_t *frame, xlator_t *this, priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } call_count = local->call_count; - frame->local = local; for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND (frame, afr_fentrylk_cbk, priv->children[i], priv->children[i]->fops->fentrylk, - volume, fd, basename, cmd, type); + volume, fd, basename, cmd, type, xdata); if (!--call_count) break; } } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (fentrylk, frame, op_ret, op_errno); - } + if (ret < 0) + AFR_STACK_UNWIND (fentrylk, frame, -1, op_errno, NULL); return 0; } int32_t afr_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - struct statvfs *statvfs) + struct statvfs *statvfs, dict_t *xdata) { afr_local_t *local = NULL; int call_count = 0; @@ -3060,7 +3534,7 @@ afr_statfs_cbk (call_frame_t *frame, void *cookie, if (call_count == 0) AFR_STACK_UNWIND (statfs, frame, local->op_ret, local->op_errno, - &local->cont.statfs.buf); + &local->cont.statfs.buf, xdata); return 0; } @@ 
-3068,7 +3542,7 @@ afr_statfs_cbk (call_frame_t *frame, void *cookie, int32_t afr_statfs (call_frame_t *frame, xlator_t *this, - loc_t *loc) + loc_t *loc, dict_t *xdata) { afr_private_t * priv = NULL; int child_count = 0; @@ -3076,7 +3550,6 @@ afr_statfs (call_frame_t *frame, xlator_t *this, int i = 0; int ret = -1; int call_count = 0; - int32_t op_ret = -1; int32_t op_errno = 0; VALIDATE_OR_GOTO (this, out); @@ -3086,15 +3559,13 @@ afr_statfs (call_frame_t *frame, xlator_t *this, priv = this->private; child_count = priv->child_count; - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } - frame->local = local; call_count = local->call_count; for (i = 0; i < child_count; i++) { @@ -3102,24 +3573,24 @@ afr_statfs (call_frame_t *frame, xlator_t *this, STACK_WIND (frame, afr_statfs_cbk, priv->children[i], priv->children[i]->fops->statfs, - loc); + loc, xdata); if (!--call_count) break; } } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (statfs, frame, op_ret, op_errno, NULL); - } + if (ret < 0) + AFR_STACK_UNWIND (statfs, frame, -1, op_errno, NULL, NULL); return 0; } int32_t afr_lk_unlock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct gf_flock *lock) + int32_t op_ret, int32_t op_errno, struct gf_flock *lock, + dict_t *xdata) { afr_local_t * local = NULL; int call_count = -1; @@ -3129,7 +3600,7 @@ afr_lk_unlock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (call_count == 0) AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno, - lock); + lock, xdata); return 0; } @@ -3151,7 +3622,7 @@ afr_lk_unlock (call_frame_t *frame, xlator_t *this) if (call_count == 0) { AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno, - &local->cont.lk.ret_flock); + 
&local->cont.lk.ret_flock, NULL); return 0; } @@ -3165,7 +3636,7 @@ afr_lk_unlock (call_frame_t *frame, xlator_t *this) priv->children[i], priv->children[i]->fops->lk, local->fd, F_SETLK, - &local->cont.lk.user_flock); + &local->cont.lk.user_flock, NULL); if (!--call_count) break; @@ -3178,7 +3649,7 @@ afr_lk_unlock (call_frame_t *frame, xlator_t *this) int32_t afr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct gf_flock *lock) + int32_t op_ret, int32_t op_errno, struct gf_flock *lock, dict_t *xdata) { afr_local_t *local = NULL; afr_private_t *priv = NULL; @@ -3213,12 +3684,12 @@ afr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, priv->children[child_index], priv->children[child_index]->fops->lk, local->fd, local->cont.lk.cmd, - &local->cont.lk.user_flock); + &local->cont.lk.user_flock, xdata); } else if (local->op_ret == -1) { /* all nodes have gone down */ AFR_STACK_UNWIND (lk, frame, -1, ENOTCONN, - &local->cont.lk.ret_flock); + &local->cont.lk.ret_flock, NULL); } else { /* locking has succeeded on all nodes that are up */ @@ -3236,7 +3707,7 @@ afr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, */ AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno, - &local->cont.lk.ret_flock); + &local->cont.lk.ret_flock, NULL); } return 0; @@ -3245,13 +3716,13 @@ afr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int afr_lk (call_frame_t *frame, xlator_t *this, - fd_t *fd, int32_t cmd, struct gf_flock *flock) + fd_t *fd, int32_t cmd, struct gf_flock *flock, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; int i = 0; - int32_t op_ret = -1; int32_t op_errno = 0; + int ret = -1; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -3259,10 +3730,12 @@ afr_lk (call_frame_t *frame, xlator_t *this, priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); - AFR_LOCAL_INIT (local, priv); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = 
frame->local; - frame->local = local; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) + goto out; local->cont.lk.locked_nodes = GF_CALLOC (priv->child_count, sizeof (*local->cont.lk.locked_nodes), @@ -3281,13 +3754,12 @@ afr_lk (call_frame_t *frame, xlator_t *this, STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) 0, priv->children[i], priv->children[i]->fops->lk, - fd, cmd, flock); + fd, cmd, flock, xdata); - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (lk, frame, op_ret, op_errno, NULL); - } + if (ret < 0) + AFR_STACK_UNWIND (lk, frame, -1, op_errno, NULL, NULL); return 0; } @@ -3303,8 +3775,7 @@ afr_forget (xlator_t *this, inode_t *inode) goto out; ctx = (afr_inode_ctx_t *)(long)ctx_addr; - if (ctx->fresh_children) - GF_FREE (ctx->fresh_children); + GF_FREE (ctx->fresh_children); GF_FREE (ctx); out: return 0; @@ -3325,41 +3796,23 @@ afr_priv_dump (xlator_t *this) GF_ASSERT (priv); snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, this->name); gf_proc_dump_add_section(key_prefix); - gf_proc_dump_build_key(key, key_prefix, "child_count"); - gf_proc_dump_write(key, "%u", priv->child_count); - gf_proc_dump_build_key(key, key_prefix, "read_child_rr"); - gf_proc_dump_write(key, "%u", priv->read_child_rr); + gf_proc_dump_write("child_count", "%u", priv->child_count); + gf_proc_dump_write("read_child_rr", "%u", priv->read_child_rr); for (i = 0; i < priv->child_count; i++) { - gf_proc_dump_build_key(key, key_prefix, "child_up[%d]", i); + sprintf (key, "child_up[%d]", i); gf_proc_dump_write(key, "%d", priv->child_up[i]); - gf_proc_dump_build_key(key, key_prefix, - "pending_key[%d]", i); + sprintf (key, "pending_key[%d]", i); gf_proc_dump_write(key, "%s", priv->pending_key[i]); } - gf_proc_dump_build_key(key, key_prefix, "data_self_heal"); - gf_proc_dump_write(key, "%s", priv->data_self_heal); - gf_proc_dump_build_key(key, key_prefix, "metadata_self_heal"); - gf_proc_dump_write(key, "%d", priv->metadata_self_heal); 
- gf_proc_dump_build_key(key, key_prefix, "entry_self_heal"); - gf_proc_dump_write(key, "%d", priv->entry_self_heal); - gf_proc_dump_build_key(key, key_prefix, "data_change_log"); - gf_proc_dump_write(key, "%d", priv->data_change_log); - gf_proc_dump_build_key(key, key_prefix, "metadata_change_log"); - gf_proc_dump_write(key, "%d", priv->metadata_change_log); - gf_proc_dump_build_key(key, key_prefix, "entry_change_log"); - gf_proc_dump_write(key, "%d", priv->entry_change_log); - gf_proc_dump_build_key(key, key_prefix, "read_child"); - gf_proc_dump_write(key, "%d", priv->read_child); - gf_proc_dump_build_key(key, key_prefix, "favorite_child"); - gf_proc_dump_write(key, "%d", priv->favorite_child); - gf_proc_dump_build_key(key, key_prefix, "data_lock_server_count"); - gf_proc_dump_write(key, "%u", priv->data_lock_server_count); - gf_proc_dump_build_key(key, key_prefix, "metadata_lock_server_count"); - gf_proc_dump_write(key, "%u", priv->metadata_lock_server_count); - gf_proc_dump_build_key(key, key_prefix, "entry_lock_server_count"); - gf_proc_dump_write(key, "%u", priv->entry_lock_server_count); - gf_proc_dump_build_key(key, key_prefix, "wait_count"); - gf_proc_dump_write(key, "%u", priv->wait_count); + gf_proc_dump_write("data_self_heal", "%s", priv->data_self_heal); + gf_proc_dump_write("metadata_self_heal", "%d", priv->metadata_self_heal); + gf_proc_dump_write("entry_self_heal", "%d", priv->entry_self_heal); + gf_proc_dump_write("data_change_log", "%d", priv->data_change_log); + gf_proc_dump_write("metadata_change_log", "%d", priv->metadata_change_log); + gf_proc_dump_write("entry-change_log", "%d", priv->entry_change_log); + gf_proc_dump_write("read_child", "%d", priv->read_child); + gf_proc_dump_write("favorite_child", "%d", priv->favorite_child); + gf_proc_dump_write("wait_count", "%u", priv->wait_count); return 0; } @@ -3389,7 +3842,7 @@ find_child_index (xlator_t *this, xlator_t *child) int32_t afr_notify (xlator_t *this, int32_t event, - void *data, ...) 
+ void *data, void *data2) { afr_private_t *priv = NULL; int i = -1; @@ -3402,12 +3855,22 @@ afr_notify (xlator_t *this, int32_t event, int ret = -1; int call_psh = 0; int up_child = AFR_ALL_CHILDREN; + dict_t *input = NULL; + dict_t *output = NULL; priv = this->private; if (!priv) return 0; + /* + * We need to reset this in case children come up in "staggered" + * fashion, so that we discover a late-arriving local subvolume. Note + * that we could end up issuing N lookups to the first subvolume, and + * O(N^2) overall, but N is small for AFR so it shouldn't be an issue. + */ + priv->did_discovery = _gf_false; + had_heard_from_all = 1; for (i = 0; i < priv->child_count; i++) { if (!priv->last_event[i]) { @@ -3418,7 +3881,7 @@ afr_notify (xlator_t *this, int32_t event, /* parent xlators dont need to know about every child_up, child_down * because of afr ha. If all subvolumes go down, child_down has * to be triggered. In that state when 1 subvolume comes up child_up - * needs to be triggered. dht optimises revalidate lookup by sending + * needs to be triggered. dht optimizes revalidate lookup by sending * it only to one of its subvolumes. When child up/down happens * for afr's subvolumes dht should be notified by child_modified. The * subsequent revalidate lookup happens on all the dht's subvolumes @@ -3435,9 +3898,19 @@ afr_notify (xlator_t *this, int32_t event, case GF_EVENT_CHILD_UP: LOCK (&priv->lock); { + /* + * This only really counts if the child was never up + * (value = -1) or had been down (value = 0). See + * comment at GF_EVENT_CHILD_DOWN for a more detailed + * explanation. 
+ */ + if (priv->child_up[idx] != 1) { + priv->up_count++; + } priv->child_up[idx] = 1; - priv->up_count++; + call_psh = 1; + up_child = idx; for (i = 0; i < priv->child_count; i++) if (priv->child_up[i] == 1) up_children++; @@ -3447,12 +3920,6 @@ afr_notify (xlator_t *this, int32_t event, "going online.", ((xlator_t *)data)->name); } else { event = GF_EVENT_CHILD_MODIFIED; - gf_log (this->name, GF_LOG_INFO, "subvol %d came up, " - "start crawl", idx); - if (had_heard_from_all) { - call_psh = 1; - up_child = idx; - } } priv->last_event[idx] = event; @@ -3464,8 +3931,22 @@ afr_notify (xlator_t *this, int32_t event, case GF_EVENT_CHILD_DOWN: LOCK (&priv->lock); { + /* + * If a brick is down when we start, we'll get a + * CHILD_DOWN to indicate its initial state. There + * was never a CHILD_UP in this case, so if we + * increment "down_count" the difference between than + * and "up_count" will no longer be the number of + * children that are currently up. This has serious + * implications e.g. for quorum enforcement, so we + * don't increment these values unless the event + * represents an actual state transition between "up" + * (value = 1) and anything else. 
+ */ + if (priv->child_up[idx] == 1) { + priv->down_count++; + } priv->child_up[idx] = 0; - priv->down_count++; for (i = 0; i < priv->child_count; i++) if (priv->child_up[i] == 0) @@ -3490,7 +3971,16 @@ afr_notify (xlator_t *this, int32_t event, priv->last_event[idx] = event; } UNLOCK (&priv->lock); + break; + + case GF_EVENT_TRANSLATOR_OP: + input = data; + output = data2; + ret = afr_xl_op (this, input, output); + goto out; + break; + default: propagate = 1; break; @@ -3533,18 +4023,13 @@ afr_notify (xlator_t *this, int32_t event, } } UNLOCK (&priv->lock); - if (up_children > 1) { - gf_log (this->name, GF_LOG_INFO, "All subvolumes came " - "up, start crawl"); - call_psh = 1; - } } ret = 0; if (propagate) ret = default_notify (this, event, data); - if (call_psh) - afr_proactive_self_heal (this, up_child); + if (call_psh && priv->shd.iamshd) + afr_proactive_self_heal ((void*) (long) up_child); out: return ret; @@ -3569,29 +4054,56 @@ afr_first_up_child (unsigned char *child_up, size_t child_count) } int -AFR_LOCAL_INIT (afr_local_t *local, afr_private_t *priv) +afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno) { + int ret = -1; + local->op_ret = -1; local->op_errno = EUCLEAN; - local->call_count = afr_up_children_count (priv->child_up, + + local->child_up = GF_CALLOC (priv->child_count, + sizeof (*local->child_up), + gf_afr_mt_char); + if (!local->child_up) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } + + memcpy (local->child_up, priv->child_up, + sizeof (*local->child_up) * priv->child_count); + local->call_count = afr_up_children_count (local->child_up, priv->child_count); if (local->call_count == 0) { gf_log (THIS->name, GF_LOG_INFO, "no subvolumes up"); - return -ENOTCONN; + if (op_errno) + *op_errno = ENOTCONN; + goto out; } + local->child_errno = GF_CALLOC (priv->child_count, + sizeof (*local->child_errno), + gf_afr_mt_int32_t); + if (!local->child_errno) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } - 
local->child_up = GF_CALLOC (sizeof (*local->child_up), - priv->child_count, - gf_afr_mt_char); - if (!local->child_up) { - return -ENOMEM; + local->transaction.postop_piggybacked = GF_CALLOC (priv->child_count, + sizeof (int), + gf_afr_mt_int32_t); + if (!local->transaction.postop_piggybacked) { + if (op_errno) + *op_errno = ENOMEM; + goto out; } - memcpy (local->child_up, priv->child_up, - sizeof (*local->child_up) * priv->child_count); + local->append_write = _gf_false; - return 0; + ret = 0; +out: + return ret; } int @@ -3600,16 +4112,6 @@ afr_internal_lock_init (afr_internal_lock_t *lk, size_t child_count, { int ret = -ENOMEM; - lk->inode_locked_nodes = GF_CALLOC (sizeof (*lk->inode_locked_nodes), - child_count, gf_afr_mt_char); - if (NULL == lk->inode_locked_nodes) - goto out; - - lk->entry_locked_nodes = GF_CALLOC (sizeof (*lk->entry_locked_nodes), - child_count, gf_afr_mt_char); - if (NULL == lk->entry_locked_nodes) - goto out; - lk->locked_nodes = GF_CALLOC (sizeof (*lk->locked_nodes), child_count, gf_afr_mt_char); if (NULL == lk->locked_nodes) @@ -3629,10 +4131,62 @@ out: return ret; } +void +afr_matrix_cleanup (int32_t **matrix, unsigned int m) +{ + int i = 0; + + if (!matrix) + goto out; + for (i = 0; i < m; i++) { + GF_FREE (matrix[i]); + } + + GF_FREE (matrix); +out: + return; +} + +int32_t** +afr_matrix_create (unsigned int m, unsigned int n) +{ + int32_t **matrix = NULL; + int i = 0; + + matrix = GF_CALLOC (sizeof (*matrix), m, gf_afr_mt_int32_t); + if (!matrix) + goto out; + + for (i = 0; i < m; i++) { + matrix[i] = GF_CALLOC (sizeof (*matrix[i]), n, + gf_afr_mt_int32_t); + if (!matrix[i]) + goto out; + } + return matrix; +out: + afr_matrix_cleanup (matrix, m); + return NULL; +} + +int +afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count) +{ + int ret = -ENOMEM; + + lk->domain = dom; + lk->locked_nodes = GF_CALLOC (sizeof (*lk->locked_nodes), + child_count, gf_afr_mt_char); + if (NULL == lk->locked_nodes) + goto out; + ret = 0; +out: 
+ return ret; +} + int afr_transaction_local_init (afr_local_t *local, xlator_t *this) { - int i = 0; int child_up_count = 0; int ret = -ENOMEM; afr_private_t *priv = NULL; @@ -3643,6 +4197,14 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this) if (ret < 0) goto out; + if ((local->transaction.type == AFR_DATA_TRANSACTION) || + (local->transaction.type == AFR_METADATA_TRANSACTION)) { + ret = afr_inodelk_init (&local->internal_lock.inodelk[0], + this->name, priv->child_count); + if (ret < 0) + goto out; + } + ret = -ENOMEM; child_up_count = afr_up_children_count (local->child_up, priv->child_count); @@ -3652,12 +4214,6 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this) local->first_up_child = afr_first_up_child (local->child_up, priv->child_count); - local->child_errno = GF_CALLOC (sizeof (*local->child_errno), - priv->child_count, - gf_afr_mt_int32_t); - if (!local->child_errno) - goto out; - local->transaction.eager_lock = GF_CALLOC (sizeof (*local->transaction.eager_lock), priv->child_count, @@ -3666,38 +4222,27 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this) if (!local->transaction.eager_lock) goto out; - local->pending = GF_CALLOC (sizeof (*local->pending), - priv->child_count, - gf_afr_mt_int32_t); - - if (!local->pending) - goto out; - local->fresh_children = afr_children_create (priv->child_count); if (!local->fresh_children) goto out; - if (local->fd) { - local->fd_open_on = GF_CALLOC (sizeof (*local->fd_open_on), + local->transaction.pre_op = GF_CALLOC (sizeof (*local->transaction.pre_op), priv->child_count, - gf_afr_mt_int32_t); - if (!local->fd_open_on) - goto out; - } + gf_afr_mt_char); + if (!local->transaction.pre_op) + goto out; - for (i = 0; i < priv->child_count; i++) { - local->pending[i] = GF_CALLOC (sizeof (*local->pending[i]), - 3, /* data + metadata + entry */ - gf_afr_mt_int32_t); - if (!local->pending[i]) - goto out; - } + local->pending = afr_matrix_create (priv->child_count, + 
AFR_NUM_CHANGE_LOGS); + if (!local->pending) + goto out; - local->transaction.child_errno = - GF_CALLOC (sizeof (*local->transaction.child_errno), - priv->child_count, - gf_afr_mt_int32_t); - local->transaction.erase_pending = 1; + local->transaction.txn_changelog = afr_matrix_create (priv->child_count, + AFR_NUM_CHANGE_LOGS); + if (!local->transaction.txn_changelog) + goto out; + + INIT_LIST_HEAD (&local->transaction.eager_locked); ret = 0; out: @@ -3790,3 +4335,257 @@ afr_set_low_priority (call_frame_t *frame) { frame->root->pid = LOW_PRIO_PROC_PID; } + +int +afr_child_fd_ctx_set (xlator_t *this, fd_t *fd, int32_t child, + int flags) +{ + int ret = 0; + uint64_t ctx = 0; + afr_fd_ctx_t *fd_ctx = NULL; + + GF_ASSERT (fd && fd->inode); + ret = afr_fd_ctx_set (this, fd); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "could not set fd ctx for fd=%p", fd); + goto out; + } + + ret = fd_ctx_get (fd, this, &ctx); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "could not get fd ctx for fd=%p", fd); + goto out; + } + + fd_ctx = (afr_fd_ctx_t *)(long) ctx; + fd_ctx->opened_on[child] = AFR_FD_OPENED; + if (!IA_ISDIR (fd->inode->ia_type)) { + fd_ctx->flags = flags; + } + ret = 0; +out: + return ret; +} + +gf_boolean_t +afr_have_quorum (char *logname, afr_private_t *priv) +{ + unsigned int quorum = 0; + + GF_VALIDATE_OR_GOTO(logname,priv,out); + + quorum = priv->quorum_count; + if (quorum != AFR_QUORUM_AUTO) { + return (priv->up_count >= (priv->down_count + quorum)); + } + + quorum = priv->child_count / 2 + 1; + if (priv->up_count >= (priv->down_count + quorum)) { + return _gf_true; + } + + /* + * Special case for even numbers of nodes: if we have exactly half + * and that includes the first ("senior-most") node, then that counts + * as quorum even if it wouldn't otherwise. This supports e.g. N=2 + * while preserving the critical property that there can only be one + * such group. 
+ */ + if ((priv->child_count % 2) == 0) { + quorum = priv->child_count / 2; + if (priv->up_count >= (priv->down_count + quorum)) { + if (priv->child_up[0]) { + return _gf_true; + } + } + } + +out: + return _gf_false; +} + +void +afr_priv_destroy (afr_private_t *priv) +{ + int i = 0; + + if (!priv) + goto out; + inode_unref (priv->root_inode); + GF_FREE (priv->shd.pos); + GF_FREE (priv->shd.pending); + GF_FREE (priv->shd.inprogress); +// for (i = 0; i < priv->child_count; i++) +// if (priv->shd.timer && priv->shd.timer[i]) +// gf_timer_call_cancel (this->ctx, priv->shd.timer[i]); + GF_FREE (priv->shd.timer); + + if (priv->shd.healed) + eh_destroy (priv->shd.healed); + + if (priv->shd.heal_failed) + eh_destroy (priv->shd.heal_failed); + + if (priv->shd.split_brain) + eh_destroy (priv->shd.split_brain); + + for (i = 0; i < priv->child_count; i++) + { + if (priv->shd.statistics[i]) + eh_destroy (priv->shd.statistics[i]); + } + + GF_FREE (priv->shd.statistics); + + GF_FREE (priv->shd.crawl_events); + + GF_FREE (priv->last_event); + if (priv->pending_key) { + for (i = 0; i < priv->child_count; i++) + GF_FREE (priv->pending_key[i]); + } + GF_FREE (priv->pending_key); + GF_FREE (priv->children); + GF_FREE (priv->child_up); + LOCK_DESTROY (&priv->lock); + LOCK_DESTROY (&priv->read_child_lock); + pthread_mutex_destroy (&priv->mutex); + GF_FREE (priv); +out: + return; +} + +int +xlator_subvolume_count (xlator_t *this) +{ + int i = 0; + xlator_list_t *list = NULL; + + for (list = this->children; list; list = list->next) + i++; + return i; +} + +inline gf_boolean_t +afr_is_errno_set (int *child_errno, int child) +{ + return child_errno[child]; +} + +inline gf_boolean_t +afr_is_errno_unset (int *child_errno, int child) +{ + return !afr_is_errno_set (child_errno, child); +} + +void +afr_prepare_new_entry_pending_matrix (int32_t **pending, + gf_boolean_t (*is_pending) (int *, int), + int *ctx, struct iatt *buf, + unsigned int child_count) +{ + int midx = 0; + int idx = 0; + int i 
= 0; + + midx = afr_index_for_transaction_type (AFR_METADATA_TRANSACTION); + if (IA_ISDIR (buf->ia_type)) + idx = afr_index_for_transaction_type (AFR_ENTRY_TRANSACTION); + else if (IA_ISREG (buf->ia_type)) + idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION); + else + idx = -1; + for (i = 0; i < child_count; i++) { + if (is_pending (ctx, i)) { + pending[i][midx] = hton32 (1); + if (idx == -1) + continue; + pending[i][idx] = hton32 (1); + } + } +} + +gf_boolean_t +afr_is_fd_fixable (fd_t *fd) +{ + if (!fd || !fd->inode) + return _gf_false; + else if (fd_is_anonymous (fd)) + return _gf_false; + else if (uuid_is_null (fd->inode->gfid)) + return _gf_false; + + return _gf_true; +} + +void +afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + inode_t *inode = NULL; + afr_inode_ctx_t *ctx = NULL; + + local = frame->local; + + if (local->fd) + inode = local->fd->inode; + else + inode = local->loc.inode; + + if (!inode) + return; + + LOCK (&inode->lock); + { + ctx = __afr_inode_ctx_get (inode, this); + ctx->open_fd_count = local->open_fd_count; + } + UNLOCK (&inode->lock); +} + +int +afr_initialise_statistics (xlator_t *this) +{ + afr_private_t *priv = NULL; + int ret = -1; + int i = 0; + int child_count = 0; + eh_t *stats_per_brick = NULL; + shd_crawl_event_t ***shd_crawl_events = NULL; + priv = this->private; + + priv->shd.statistics = GF_CALLOC (sizeof(eh_t *), priv->child_count, + gf_common_mt_eh_t); + if (!priv->shd.statistics) { + ret = -1; + goto out; + } + child_count = priv->child_count; + for (i=0; i < child_count ; i++) { + stats_per_brick = eh_new (AFR_STATISTICS_HISTORY_SIZE, + _gf_false, + _destroy_crawl_event_data); + if (!stats_per_brick) { + ret = -1; + goto out; + } + priv->shd.statistics[i] = stats_per_brick; + + } + + shd_crawl_events = (shd_crawl_event_t***)(&priv->shd.crawl_events); + *shd_crawl_events = GF_CALLOC (sizeof(shd_crawl_event_t*), + priv->child_count, + gf_afr_mt_shd_crawl_event_t); + + 
if (!priv->shd.crawl_events) { + ret = -1; + goto out; + } + ret = 0; +out: + return ret; + +} diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c index 57f0a03fa..689dd84e6 100644 --- a/xlators/cluster/afr/src/afr-dir-read.c +++ b/xlators/cluster/afr/src/afr-dir-read.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ @@ -51,7 +42,7 @@ int afr_examine_dir_sh_unwind (call_frame_t *frame, xlator_t *this, int32_t op_ret, - int32_t op_errno) + int32_t op_errno, int32_t sh_failed) { afr_local_t *local = NULL; @@ -60,7 +51,7 @@ afr_examine_dir_sh_unwind (call_frame_t *frame, xlator_t *this, int32_t op_ret, afr_set_opendir_done (this, local->fd->inode); AFR_STACK_UNWIND (opendir, frame, local->op_ret, - local->op_errno, local->fd); + local->op_errno, local->fd, NULL); return 0; } @@ -99,7 +90,7 @@ __checksums_differ (uint32_t *checksum, int child_count, int32_t afr_examine_dir_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - gf_dirent_t *entries) + gf_dirent_t *entries, dict_t *xdata) { afr_private_t * priv = NULL; afr_local_t * local = NULL; @@ -137,7 +128,7 @@ afr_examine_dir_readdir_cbk (call_frame_t *frame, void *cookie, } list_for_each_entry_safe (entry, tmp, &entries->list, list) { - entry_cksum = gf_rsync_weak_checksum (entry->d_name, + entry_cksum = gf_rsync_weak_checksum ((unsigned char *)entry->d_name, strlen (entry->d_name)); local->cont.opendir.checksum[child_index] ^= entry_cksum; } @@ -152,7 +143,7 @@ afr_examine_dir_readdir_cbk (call_frame_t *frame, void *cookie, (void *) (long) child_index, priv->children[child_index], priv->children[child_index]->fops->readdir, - local->fd, 131072, last_offset); + local->fd, 131072, last_offset, NULL); return 0; @@ -175,7 +166,7 @@ out: afr_set_opendir_done (this, inode); AFR_STACK_UNWIND (opendir, frame, local->op_ret, - local->op_errno, local->fd); + local->op_errno, local->fd, NULL); } } @@ -208,7 +199,7 @@ afr_examine_dir (call_frame_t *frame, xlator_t *this) (void *) (long) i, priv->children[i], priv->children[i]->fops->readdir, - local->fd, 131072, 0); + local->fd, 131072, 0, NULL); if (!--call_count) break; @@ -222,27 +213,37 @@ afr_examine_dir (call_frame_t *frame, xlator_t *this) int32_t afr_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, 
int32_t op_errno, - fd_t *fd) + fd_t *fd, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; int32_t up_children_count = 0; int ret = -1; int call_count = -1; + int32_t child_index = 0; priv = this->private; local = frame->local; + child_index = (long) cookie; up_children_count = afr_up_children_count (local->child_up, priv->child_count); LOCK (&frame->lock); { - if (op_ret >= 0) + if (op_ret >= 0) { local->op_ret = op_ret; + ret = afr_child_fd_ctx_set (this, fd, child_index, 0); + if (ret) { + local->op_ret = -1; + local->op_errno = -ret; + goto unlock; + } + } local->op_errno = op_errno; } +unlock: UNLOCK (&frame->lock); call_count = afr_frame_return (frame); @@ -251,17 +252,8 @@ afr_opendir_cbk (call_frame_t *frame, void *cookie, if (local->op_ret != 0) goto out; - ret = afr_fd_ctx_set (this, local->fd); - if (ret) { - local->op_ret = -1; - local->op_errno = -1; - gf_log (this->name, GF_LOG_ERROR, - "failed to set fd ctx for fd %p", - local->fd); - goto out; - } if (!afr_is_opendir_done (this, local->fd->inode) && - up_children_count > 1) { + up_children_count > 1 && priv->entry_self_heal) { /* * This is the first opendir on this inode. We need @@ -270,7 +262,7 @@ afr_opendir_cbk (call_frame_t *frame, void *cookie, * to regular entry self-heal because the readdir * call is sent only to the first subvolume, and * thus files that exist only there will never be healed - * otherwise (assuming changelog shows no anamolies). + * otherwise (assuming changelog shows no anomalies). 
*/ gf_log (this->name, GF_LOG_TRACE, @@ -289,7 +281,7 @@ afr_opendir_cbk (call_frame_t *frame, void *cookie, out: AFR_STACK_UNWIND (opendir, frame, local->op_ret, - local->op_errno, local->fd); + local->op_errno, local->fd, NULL); return 0; } @@ -305,7 +297,6 @@ afr_opendir (call_frame_t *frame, xlator_t *this, int i = 0; int ret = -1; int call_count = -1; - int32_t op_ret = -1; int32_t op_errno = 0; VALIDATE_OR_GOTO (frame, out); @@ -316,37 +307,36 @@ afr_opendir (call_frame_t *frame, xlator_t *this, child_count = priv->child_count; - ALLOC_OR_GOTO (local, afr_local_t, out); - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; + + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } loc_copy (&local->loc, loc); - frame->local = local; local->fd = fd_ref (fd); call_count = local->call_count; for (i = 0; i < child_count; i++) { if (local->child_up[i]) { - STACK_WIND (frame, afr_opendir_cbk, - priv->children[i], - priv->children[i]->fops->opendir, - loc, fd); + STACK_WIND_COOKIE (frame, afr_opendir_cbk, + (void*) (long) i, + priv->children[i], + priv->children[i]->fops->opendir, + loc, fd, NULL); if (!--call_count) break; } } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (opendir, frame, op_ret, op_errno, fd); - } + if (ret < 0) + AFR_STACK_UNWIND (opendir, frame, -1, op_errno, fd, NULL); return 0; } @@ -368,85 +358,6 @@ struct entry_name { struct list_head list; }; - -static gf_boolean_t -remembered_name (const char *name, struct list_head *entries) -{ - struct entry_name *e = NULL; - gf_boolean_t ret = _gf_false; - - list_for_each_entry (e, entries, list) { - if (!strcmp (name, e->name)) { - ret = _gf_true; - goto out; - } - } - -out: - return ret; -} - - -static void -afr_remember_entries (gf_dirent_t *entries, fd_t *fd) -{ - struct entry_name *n = NULL; - gf_dirent_t *entry = NULL; - int ret = 0; - uint64_t ctx = 0; - 
afr_fd_ctx_t *fd_ctx = NULL; - - ret = fd_ctx_get (fd, THIS, &ctx); - if (ret < 0) { - gf_log (THIS->name, GF_LOG_INFO, - "could not get fd ctx for fd=%p", fd); - return; - } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - list_for_each_entry (entry, &entries->list, list) { - n = GF_CALLOC (1, sizeof (*n), gf_afr_mt_entry_name); - n->name = gf_strdup (entry->d_name); - INIT_LIST_HEAD (&n->list); - - list_add (&n->list, &fd_ctx->entries); - } -} - - -static off_t -afr_filter_entries (gf_dirent_t *entries, fd_t *fd) -{ - gf_dirent_t *entry = NULL; - gf_dirent_t *tmp = NULL; - int ret = 0; - uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; - off_t offset = 0; - - ret = fd_ctx_get (fd, THIS, &ctx); - if (ret < 0) { - gf_log (THIS->name, GF_LOG_INFO, - "could not get fd ctx for fd=%p", fd); - return -1; - } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - list_for_each_entry_safe (entry, tmp, &entries->list, list) { - offset = entry->d_off; - - if (remembered_name (entry->d_name, &fd_ctx->entries)) { - list_del (&entry->list); - GF_FREE (entry); - } - } - - return offset; -} - - static void afr_forget_entries (fd_t *fd) { @@ -472,174 +383,70 @@ afr_forget_entries (fd_t *fd) } } - -int32_t -afr_readdir_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - gf_dirent_t *entries) +static void +afr_readdir_filter_trash_dir (gf_dirent_t *entries, fd_t *fd) { - afr_local_t * local = NULL; gf_dirent_t * entry = NULL; gf_dirent_t * tmp = NULL; - local = frame->local; - - if (op_ret == -1) - goto out; - list_for_each_entry_safe (entry, tmp, &entries->list, list) { - if ((local->fd->inode == local->fd->inode->table->root) - && !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) { + if (__is_root_gfid (fd->inode->gfid) && + !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) { list_del_init (&entry->list); GF_FREE (entry); } } - -out: - AFR_STACK_UNWIND (readdir, frame, op_ret, op_errno, entries); - - return 0; } - int32_t -afr_readdirp_cbk (call_frame_t 
*frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, gf_dirent_t *entries) +afr_readdir_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + gf_dirent_t *entries, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; - int32_t next_call_child = -1; - int ret = 0; - gf_dirent_t * entry = NULL; - gf_dirent_t * tmp = NULL; - int32_t *last_index = NULL; - int32_t read_child = -1; - int32_t *fresh_children = NULL; - uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; - off_t offset = 0; - int32_t call_child = -1; + afr_local_t *local = NULL; - priv = this->private; - children = priv->children; + if (op_ret == -1) + goto out; local = frame->local; + afr_readdir_filter_trash_dir (entries, local->fd); - read_child = (long) cookie; - last_index = &local->cont.readdir.last_index; - fresh_children = local->fresh_children; - - /* the value of the last_index changes if afr_next_call_child is - * called. So to find the call_child of this callback use last_index - * before the next_call_child call. 
- */ - if (*last_index == -1) - call_child = read_child; - else - call_child = fresh_children[*last_index]; - - if (priv->strict_readdir) { - ret = fd_ctx_get (local->fd, this, &ctx); - if (ret < 0) { - gf_log (this->name, GF_LOG_INFO, - "could not get fd ctx for fd=%p", local->fd); - op_ret = -1; - op_errno = -ret; - goto out; - } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - if (op_ret == -1) { - next_call_child = afr_next_call_child (fresh_children, - local->child_up, - priv->child_count, - last_index, - read_child); - if (next_call_child < 0) - goto out; - gf_log (this->name, GF_LOG_TRACE, - "starting readdir afresh on child %d, offset %"PRId64, - next_call_child, (uint64_t) 0); - - fd_ctx->failed_over = _gf_true; - - STACK_WIND_COOKIE (frame, afr_readdirp_cbk, - (void *) (long) read_child, - children[next_call_child], - children[next_call_child]->fops->readdirp, - local->fd, - local->cont.readdir.size, 0); - return 0; - } - } - - if (op_ret != -1) { - list_for_each_entry_safe (entry, tmp, &entries->list, list) { - if ((local->fd->inode == local->fd->inode->table->root) - && !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) { - list_del_init (&entry->list); - GF_FREE (entry); - } - } - } - - if (priv->strict_readdir) { - if (fd_ctx->failed_over) { - if (list_empty (&entries->list)) { - gf_log (this->name, GF_LOG_DEBUG, - "no entries found"); - goto out; - } - - offset = afr_filter_entries (entries, local->fd); +out: + AFR_STACK_UNWIND (readdir, frame, op_ret, op_errno, entries, NULL); + return 0; +} - afr_remember_entries (entries, local->fd); - if (list_empty (&entries->list)) { - /* All the entries we got were duplicate. We - shouldn't send an empty list now, because - that'll make the application stop reading. 
So - try to get more entries */ +int32_t +afr_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, + dict_t *xdata) +{ + afr_local_t *local = NULL; - gf_log (this->name, GF_LOG_TRACE, - "trying to fetch non-duplicate entries " - "from offset %"PRId64", child %s", - offset, children[call_child]->name); + if (op_ret == -1) + goto out; - STACK_WIND_COOKIE (frame, afr_readdirp_cbk, - (void *) (long) read_child, - children[call_child], - children[call_child]->fops->readdirp, - local->fd, local->cont.readdir.size, offset); - return 0; - } - } else { - afr_remember_entries (entries, local->fd); - } - } + local = frame->local; + afr_readdir_filter_trash_dir (entries, local->fd); out: - AFR_STACK_UNWIND (readdirp, frame, op_ret, op_errno, entries); - + AFR_STACK_UNWIND (readdirp, frame, op_ret, op_errno, entries, NULL); return 0; } int32_t afr_do_readdir (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset, int whichop) + fd_t *fd, size_t size, off_t offset, int whichop, dict_t *dict) { - afr_private_t * priv = NULL; - xlator_t ** children = NULL; - int call_child = 0; - afr_local_t *local = NULL; - uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; - int ret = -1; - int32_t op_ret = -1; - int32_t op_errno = 0; - uint64_t read_child = 0; + afr_private_t *priv = NULL; + xlator_t **children = NULL; + int call_child = 0; + afr_local_t *local = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + int ret = -1; + int32_t op_errno = 0; + uint64_t read_child = 0; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -648,14 +455,12 @@ afr_do_readdir (call_frame_t *frame, xlator_t *this, priv = this->private; children = priv->children; - ALLOC_OR_GOTO (local, afr_local_t, out); - frame->local = local; + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if 
(ret < 0) goto out; - } local->fresh_children = afr_children_create (priv->child_count); if (!local->fresh_children) { @@ -665,79 +470,67 @@ afr_do_readdir (call_frame_t *frame, xlator_t *this, read_child = afr_inode_get_read_ctx (this, fd->inode, local->fresh_children); - op_ret = afr_get_call_child (this, local->child_up, read_child, - local->fresh_children, - &call_child, - &local->cont.readdir.last_index); - if (op_ret < 0) { - op_errno = -op_ret; - op_ret = -1; + ret = afr_get_call_child (this, local->child_up, read_child, + local->fresh_children, + &call_child, + &local->cont.readdir.last_index); + if (ret < 0) { + op_errno = -ret; goto out; } - local->fd = fd_ref (fd); - local->cont.readdir.size = size; - - if (priv->strict_readdir) { - ret = fd_ctx_get (fd, this, &ctx); - if (ret < 0) { - gf_log (this->name, GF_LOG_INFO, - "could not get fd ctx for fd=%p", fd); - op_errno = -ret; - goto out; - } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - if (fd_ctx->last_tried != call_child) { - gf_log (this->name, GF_LOG_TRACE, - "first up child has changed from %d to %d, " - "restarting readdir from offset 0", - fd_ctx->last_tried, call_child); - - fd_ctx->failed_over = _gf_true; - offset = 0; - } + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) { + op_errno = EBADF; + goto out; + } - fd_ctx->last_tried = call_child; + if ((offset == 0) || (fd_ctx->call_child == -1)) { + fd_ctx->call_child = call_child; + } else if ((priv->readdir_failover == _gf_false) && + (call_child != fd_ctx->call_child)) { + op_errno = EBADF; + goto out; } + local->fd = fd_ref (fd); + local->cont.readdir.size = size; + local->cont.readdir.dict = (dict)? 
dict_ref (dict) : NULL; + if (whichop == GF_FOP_READDIR) STACK_WIND_COOKIE (frame, afr_readdir_cbk, (void *) (long) call_child, children[call_child], children[call_child]->fops->readdir, fd, - size, offset); + size, offset, dict); else STACK_WIND_COOKIE (frame, afr_readdirp_cbk, (void *) (long) call_child, children[call_child], children[call_child]->fops->readdirp, fd, - size, offset); + size, offset, dict); - op_ret = 0; + return 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (readdir, frame, op_ret, op_errno, NULL); - } + AFR_STACK_UNWIND (readdir, frame, -1, op_errno, NULL, NULL); return 0; } int32_t afr_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset) + off_t offset, dict_t *xdata) { - afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIR); + afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIR, xdata); return 0; } int32_t afr_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset) + off_t offset, dict_t *dict) { - afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIRP); + afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIRP, dict); return 0; } diff --git a/xlators/cluster/afr/src/afr-dir-read.h b/xlators/cluster/afr/src/afr-dir-read.h index 6a6bc6354..09456d159 100644 --- a/xlators/cluster/afr/src/afr-dir-read.h +++ b/xlators/cluster/afr/src/afr-dir-read.h @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. 
- - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef __DIR_READ_H__ @@ -23,23 +14,23 @@ int32_t afr_opendir (call_frame_t *frame, xlator_t *this, - loc_t *loc, fd_t *fd); + loc_t *loc, fd_t *fd, dict_t *xdata); int32_t afr_releasedir (xlator_t *this, fd_t *fd); int32_t afr_readdir (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset); + fd_t *fd, size_t size, off_t offset, dict_t *xdata); int32_t afr_readdirp (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset); + fd_t *fd, size_t size, off_t offset, dict_t *dict); int32_t afr_checksum (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t flags); + loc_t *loc, int32_t flags, dict_t *xdata); #endif /* __DIR_READ_H__ */ diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c index 9a17c2030..1943b719b 100644 --- a/xlators/cluster/afr/src/afr-dir-write.c +++ b/xlators/cluster/afr/src/afr-dir-write.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. 
- - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ @@ -47,33 +38,222 @@ #include "afr.h" #include "afr-transaction.h" +int +afr_build_parent_loc (loc_t *parent, loc_t *child, int32_t *op_errno) +{ + int ret = -1; + char *child_path = NULL; + + if (!child->parent) { + if (op_errno) + *op_errno = EINVAL; + goto out; + } + + child_path = gf_strdup (child->path); + if (!child_path) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } + parent->path = gf_strdup( dirname (child_path) ); + if (!parent->path) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } + parent->inode = inode_ref (child->parent); + uuid_copy (parent->gfid, child->pargfid); + + ret = 0; +out: + GF_FREE(child_path); + + return ret; +} void -afr_build_parent_loc (loc_t *parent, loc_t *child) +__dir_entry_fop_common_cbk (call_frame_t *frame, int child_index, + xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, struct iatt *prenewparent, + struct iatt *postnewparent) { - char *tmp = NULL; + afr_local_t *local = NULL; - if (!child->parent) { - //this should never be called with root as the child - GF_ASSERT (0); - loc_copy (parent, child); - return; + local = frame->local; + + if (afr_fop_failed (op_ret, op_errno)) + afr_transaction_fop_failed (frame, this, child_index); + + if (op_ret > -1) { + local->op_ret = op_ret; + + if 
((local->success_count == 0) || + (child_index == local->read_child_index)) { + local->cont.dir_fop.preparent = *preparent; + local->cont.dir_fop.postparent = *postparent; + if (buf) + local->cont.dir_fop.buf = *buf; + if (prenewparent) + local->cont.dir_fop.prenewparent = *prenewparent; + if (postnewparent) + local->cont.dir_fop.postnewparent = *postnewparent; + } + + local->cont.dir_fop.inode = inode; + + local->fresh_children[local->success_count] = child_index; + local->success_count++; + local->child_errno[child_index] = 0; + } else { + local->child_errno[child_index] = op_errno; } - tmp = gf_strdup (child->path); - parent->path = gf_strdup (dirname (tmp)); - GF_FREE (tmp); + local->op_errno = op_errno; +} - parent->name = strrchr (parent->path, '/'); - if (parent->name) - parent->name++; +int +afr_mark_new_entry_changelog_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + dict_t *xattr, dict_t *xdata) +{ + int call_count = 0; - parent->inode = inode_ref (child->parent); - parent->parent = inode_parent (parent->inode, 0, NULL); - parent->ino = parent->inode->ino; + call_count = afr_frame_return (frame); + if (call_count == 0) { + AFR_STACK_DESTROY (frame); + } + return 0; +} + +void +afr_mark_new_entry_changelog (call_frame_t *frame, xlator_t *this) +{ + call_frame_t *new_frame = NULL; + afr_local_t *local = NULL; + afr_local_t *new_local = NULL; + afr_private_t *priv = NULL; + dict_t **xattr = NULL; + int32_t **changelog = NULL; + int i = 0; + GF_UNUSED int op_errno = 0; + + local = frame->local; + priv = this->private; + + new_frame = copy_frame (frame); + if (!new_frame) { + goto out; + } + + AFR_LOCAL_ALLOC_OR_GOTO (new_frame->local, out); + new_local = new_frame->local; + changelog = afr_matrix_create (priv->child_count, AFR_NUM_CHANGE_LOGS); + if (!changelog) + goto out; - if (!uuid_is_null (child->pargfid)) - uuid_copy (parent->gfid, child->pargfid); + xattr = GF_CALLOC (priv->child_count, sizeof (*xattr), + 
gf_afr_mt_dict_t); + if (!xattr) + goto out; + for (i = 0; i < priv->child_count; i++) { + if (local->child_errno[i]) + continue; + xattr[i] = dict_new (); + if (!xattr[i]) + goto out; + } + + afr_prepare_new_entry_pending_matrix (changelog, + afr_is_errno_set, + local->child_errno, + &local->cont.dir_fop.buf, + priv->child_count); + + new_local->pending = changelog; + uuid_copy (new_local->loc.gfid, local->cont.dir_fop.buf.ia_gfid); + new_local->loc.inode = inode_ref (local->cont.dir_fop.inode); + new_local->call_count = local->success_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_errno[i]) + continue; + + afr_set_pending_dict (priv, xattr[i], changelog, i, LOCAL_LAST); + STACK_WIND_COOKIE (new_frame, afr_mark_new_entry_changelog_cbk, + (void *) (long) i, priv->children[i], + priv->children[i]->fops->xattrop, + &new_local->loc, GF_XATTROP_ADD_ARRAY, + xattr[i], NULL); + } + new_frame = NULL; +out: + if (new_frame) + AFR_STACK_DESTROY (new_frame); + afr_xattr_array_destroy (xattr, priv->child_count); + return; +} + +gf_boolean_t +afr_is_new_entry_changelog_needed (glusterfs_fop_t fop) +{ + glusterfs_fop_t fops[] = {GF_FOP_CREATE, GF_FOP_MKNOD, GF_FOP_NULL}; + int i = 0; + + for (i = 0; fops[i] != GF_FOP_NULL; i++) { + if (fop == fops[i]) + return _gf_true; + } + return _gf_false; +} + +void +afr_dir_fop_mark_entry_pending_changelog (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + if (local->op_ret < 0) + goto out; + + if (local->success_count == priv->child_count) + goto out; + + if (!afr_is_new_entry_changelog_needed (local->op)) + goto out; + + afr_mark_new_entry_changelog (frame, this); + +out: + return; +} + +void +afr_dir_fop_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + if (local->cont.dir_fop.inode == NULL) + goto 
done; + afr_set_read_ctx_from_policy (this, local->cont.dir_fop.inode, + local->fresh_children, + local->read_child_index, + priv->read_child, + local->cont.dir_fop.buf.ia_gfid); +done: + local->transaction.unwind (frame, this); + afr_dir_fop_mark_entry_pending_changelog (frame, this); + local->transaction.resume (frame, this); } /* {{{ create */ @@ -83,7 +263,6 @@ afr_create_unwind (call_frame_t *frame, xlator_t *this) { call_frame_t *main_frame = NULL; afr_local_t *local = NULL; - struct iatt *unwind_buf = NULL; local = frame->local; @@ -97,18 +276,14 @@ afr_create_unwind (call_frame_t *frame, xlator_t *this) UNLOCK (&frame->lock); if (main_frame) { - if (local->cont.create.read_child_buf.ia_ino) { - unwind_buf = &local->cont.create.read_child_buf; - } else { - unwind_buf = &local->cont.create.buf; - } - AFR_STACK_UNWIND (create, main_frame, local->op_ret, local->op_errno, local->cont.create.fd, - local->cont.create.inode, - unwind_buf, &local->cont.create.preparent, - &local->cont.create.postparent); + local->cont.dir_fop.inode, + &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, + local->xdata_rsp); } return 0; @@ -119,32 +294,24 @@ int afr_create_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode, struct iatt *buf, - struct iatt *preparent, struct iatt *postparent) + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { afr_local_t *local = NULL; - afr_private_t *priv = NULL; uint64_t ctx = 0; afr_fd_ctx_t *fd_ctx = NULL; int ret = 0; int call_count = -1; int child_index = -1; - int32_t *fresh_children = NULL; local = frame->local; - priv = this->private; child_index = (long) cookie; LOCK (&frame->lock); { - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - local->op_ret = op_ret; - + if (op_ret > -1) { ret = afr_fd_ctx_set (this, fd); - if (ret < 0) { gf_log 
(this->name, GF_LOG_ERROR, "could not set ctx on fd=%p", fd); @@ -155,7 +322,6 @@ afr_create_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } ret = fd_ctx_get (fd, this, &ctx); - if (ret < 0) { gf_log (this->name, GF_LOG_ERROR, "could not get fd ctx for fd=%p", fd); @@ -169,23 +335,14 @@ afr_create_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, fd_ctx->opened_on[child_index] = AFR_FD_OPENED; fd_ctx->flags = local->cont.create.flags; - if (local->success_count == 0) - local->cont.create.buf = *buf; - - if (child_index == local->read_child_index) { - local->cont.create.read_child_buf = *buf; - local->cont.create.preparent = *preparent; - local->cont.create.postparent = *postparent; - } - - local->cont.create.inode = inode; - - fresh_children = local->fresh_children; - fresh_children[local->success_count] = child_index; - local->success_count++; + if (local->success_count == 0) { + if (xdata) + local->xdata_rsp = dict_ref(xdata); + } } - - local->op_errno = op_errno; + __dir_entry_fop_common_cbk (frame, child_index, this, + op_ret, op_errno, inode, buf, + preparent, postparent, NULL, NULL); } unlock: @@ -193,15 +350,8 @@ unlock: call_count = afr_frame_return (frame); - if (call_count == 0) { - afr_set_read_ctx_from_policy (this, inode, - local->fresh_children, - local->read_child_index, - priv->read_child); - local->transaction.unwind (frame, this); - - local->transaction.resume (frame, this); - } + if (call_count == 0) + afr_dir_fop_done (frame, this); return 0; } @@ -212,16 +362,14 @@ afr_create_wind (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - afr_internal_lock_t *int_lock = NULL; int call_count = -1; int i = 0; local = frame->local; priv = this->private; - int_lock = &local->internal_lock; - call_count = afr_locked_children_count (int_lock->entry_locked_nodes, - priv->child_count); + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); if 
(call_count == 0) { local->transaction.resume (frame, this); @@ -231,7 +379,7 @@ afr_create_wind (call_frame_t *frame, xlator_t *this) local->call_count = call_count; for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] && int_lock->entry_locked_nodes[i]) { + if (local->transaction.pre_op[i]) { STACK_WIND_COOKIE (frame, afr_create_wind_cbk, (void *) (long) i, priv->children[i], @@ -239,8 +387,9 @@ afr_create_wind (call_frame_t *frame, xlator_t *this) &local->loc, local->cont.create.flags, local->cont.create.mode, + local->umask, local->cont.create.fd, - local->cont.create.params); + local->xdata_req); if (!--call_count) break; } @@ -268,14 +417,14 @@ afr_create_done (call_frame_t *frame, xlator_t *this) int afr_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, mode_t mode, - fd_t *fd, dict_t *params) + mode_t umask, fd_t *fd, dict_t *params) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - int ret = -1; - int op_ret = -1; - int op_errno = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -283,20 +432,20 @@ afr_create (call_frame_t *frame, xlator_t *this, priv = this->private; + QUORUM_CHECK(create,out); + transaction_frame = copy_frame (frame); if (!transaction_frame) { + op_errno = ENOMEM; goto out; } - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); + local = transaction_frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } - - transaction_frame->local = local; loc_copy (&local->loc, loc); @@ -307,33 +456,49 @@ afr_create (call_frame_t *frame, xlator_t *this, } UNLOCK (&priv->read_child_lock); + local->op = 
GF_FOP_CREATE; local->cont.create.flags = flags; local->cont.create.mode = mode; local->cont.create.fd = fd_ref (fd); + local->umask = umask; if (params) - local->cont.create.params = dict_ref (params); - - if (loc->parent) - local->cont.create.parent_ino = loc->parent->ino; + local->xdata_req = dict_ref (params); local->transaction.fop = afr_create_wind; local->transaction.done = afr_create_done; local->transaction.unwind = afr_create_unwind; - afr_build_parent_loc (&local->transaction.parent_loc, loc); + ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, + &op_errno); + if (ret) + goto out; local->transaction.main_frame = frame; local->transaction.basename = AFR_BASENAME (loc->path); + int_lock = &local->internal_lock; + + int_lock->lockee_count = 0; + ret = afr_init_entry_lockee (&int_lock->lockee[0], local, + &local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) + goto out; - afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + int_lock->lockee_count++; + ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { + if (ret < 0) { if (transaction_frame) AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (create, frame, op_ret, op_errno, - NULL, NULL, NULL, NULL, NULL); + AFR_STACK_UNWIND (create, frame, -1, op_errno, + NULL, NULL, NULL, NULL, NULL, NULL); } return 0; @@ -348,7 +513,6 @@ afr_mknod_unwind (call_frame_t *frame, xlator_t *this) { call_frame_t *main_frame = NULL; afr_local_t *local = NULL; - struct iatt *unwind_buf = NULL; local = frame->local; @@ -362,17 +526,13 @@ afr_mknod_unwind (call_frame_t *frame, xlator_t *this) UNLOCK (&frame->lock); if (main_frame) { - if (local->cont.mknod.read_child_buf.ia_ino) { - unwind_buf = &local->cont.mknod.read_child_buf; - } else { - unwind_buf = &local->cont.mknod.buf; - } - AFR_STACK_UNWIND (mknod, main_frame, 
local->op_ret, local->op_errno, - local->cont.mknod.inode, - unwind_buf, &local->cont.mknod.preparent, - &local->cont.mknod.postparent); + local->cont.dir_fop.inode, + &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, + NULL); } return 0; @@ -383,58 +543,25 @@ int afr_mknod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; int call_count = -1; int child_index = -1; - int32_t *fresh_children = NULL; - - local = frame->local; - priv = this->private; child_index = (long) cookie; LOCK (&frame->lock); { - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - local->op_ret = op_ret; - - if (local->success_count == 0) - local->cont.mknod.buf = *buf; - - if (child_index == local->read_child_index) { - local->cont.mknod.read_child_buf = *buf; - local->cont.mknod.preparent = *preparent; - local->cont.mknod.postparent = *postparent; - } - - local->cont.mknod.inode = inode; - - fresh_children = local->fresh_children; - fresh_children[local->success_count] = child_index; - local->success_count++; - } - - local->op_errno = op_errno; + __dir_entry_fop_common_cbk (frame, child_index, this, + op_ret, op_errno, inode, buf, + preparent, postparent, NULL, NULL); } UNLOCK (&frame->lock); call_count = afr_frame_return (frame); - if (call_count == 0) { - afr_set_read_ctx_from_policy (this, inode, - local->fresh_children, - local->read_child_index, - priv->read_child); - local->transaction.unwind (frame, this); - - local->transaction.resume (frame, this); - } + if (call_count == 0) + afr_dir_fop_done (frame, this); return 0; } @@ -445,16 +572,14 @@ afr_mknod_wind (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_private_t *priv 
= NULL; - afr_internal_lock_t *int_lock = NULL; int call_count = -1; int i = 0; local = frame->local; priv = this->private; - int_lock = &local->internal_lock; - call_count = afr_locked_children_count (int_lock->entry_locked_nodes, - priv->child_count); + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); if (call_count == 0) { local->transaction.resume (frame, this); @@ -464,13 +589,14 @@ afr_mknod_wind (call_frame_t *frame, xlator_t *this) local->call_count = call_count; for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] && int_lock->entry_locked_nodes[i]) { + if (local->transaction.pre_op[i]) { STACK_WIND_COOKIE (frame, afr_mknod_wind_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->mknod, &local->loc, local->cont.mknod.mode, local->cont.mknod.dev, - local->cont.mknod.params); + local->umask, + local->xdata_req); if (!--call_count) break; } @@ -495,15 +621,15 @@ afr_mknod_done (call_frame_t *frame, xlator_t *this) int -afr_mknod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dev_t dev, dict_t *params) +afr_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t dev, mode_t umask, dict_t *params) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - int ret = -1; - int op_ret = -1; - int op_errno = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -511,20 +637,20 @@ afr_mknod (call_frame_t *frame, xlator_t *this, priv = this->private; + QUORUM_CHECK(mknod,out); + transaction_frame = copy_frame (frame); if (!transaction_frame) { + op_errno = ENOMEM; goto out; } - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); + local = transaction_frame->local; - ret = 
AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } - - transaction_frame->local = local; loc_copy (&local->loc, loc); @@ -535,32 +661,48 @@ afr_mknod (call_frame_t *frame, xlator_t *this, } UNLOCK (&priv->read_child_lock); + local->op = GF_FOP_MKNOD; local->cont.mknod.mode = mode; local->cont.mknod.dev = dev; + local->umask = umask; if (params) - local->cont.mknod.params = dict_ref (params); - - if (loc->parent) - local->cont.mknod.parent_ino = loc->parent->ino; + local->xdata_req = dict_ref (params); local->transaction.fop = afr_mknod_wind; local->transaction.done = afr_mknod_done; local->transaction.unwind = afr_mknod_unwind; - afr_build_parent_loc (&local->transaction.parent_loc, loc); + ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, + &op_errno); + if (ret) + goto out; local->transaction.main_frame = frame; local->transaction.basename = AFR_BASENAME (loc->path); + int_lock = &local->internal_lock; + + int_lock->lockee_count = 0; + ret = afr_init_entry_lockee (&int_lock->lockee[0], local, + &local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) + goto out; - afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + int_lock->lockee_count++; + ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { + if (ret < 0) { if (transaction_frame) AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (mknod, frame, op_ret, op_errno, - NULL, NULL, NULL, NULL); + AFR_STACK_UNWIND (mknod, frame, -1, op_errno, + NULL, NULL, NULL, NULL, NULL); } return 0; @@ -576,7 +718,6 @@ afr_mkdir_unwind (call_frame_t *frame, xlator_t *this) { call_frame_t *main_frame = NULL; afr_local_t *local = NULL; - struct iatt *unwind_buf = NULL; local = frame->local; @@ -590,17 +731,13 @@ afr_mkdir_unwind (call_frame_t 
*frame, xlator_t *this) UNLOCK (&frame->lock); if (main_frame) { - if (local->cont.mkdir.read_child_buf.ia_ino) { - unwind_buf = &local->cont.mkdir.read_child_buf; - } else { - unwind_buf = &local->cont.mkdir.buf; - } - AFR_STACK_UNWIND (mkdir, main_frame, local->op_ret, local->op_errno, - local->cont.mkdir.inode, - unwind_buf, &local->cont.mkdir.preparent, - &local->cont.mkdir.postparent); + local->cont.dir_fop.inode, + &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, + NULL); } return 0; @@ -611,58 +748,25 @@ int afr_mkdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; int call_count = -1; int child_index = -1; - int32_t *fresh_children = NULL; - - local = frame->local; - priv = this->private; child_index = (long) cookie; LOCK (&frame->lock); { - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - local->op_ret = op_ret; - - if (local->success_count == 0) - local->cont.mkdir.buf = *buf; - - if (child_index == local->read_child_index) { - local->cont.mkdir.read_child_buf = *buf; - local->cont.mkdir.preparent = *preparent; - local->cont.mkdir.postparent = *postparent; - } - - local->cont.mkdir.inode = inode; - - fresh_children = local->fresh_children; - fresh_children[local->success_count] = child_index; - local->success_count++; - } - - local->op_errno = op_errno; + __dir_entry_fop_common_cbk (frame, child_index, this, + op_ret, op_errno, inode, buf, + preparent, postparent, NULL, NULL); } UNLOCK (&frame->lock); call_count = afr_frame_return (frame); - if (call_count == 0) { - afr_set_read_ctx_from_policy (this, inode, - local->fresh_children, - local->read_child_index, - priv->read_child); - local->transaction.unwind 
(frame, this); - - local->transaction.resume (frame, this); - } + if (call_count == 0) + afr_dir_fop_done (frame, this); return 0; } @@ -673,16 +777,14 @@ afr_mkdir_wind (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - afr_internal_lock_t *int_lock = NULL; int call_count = -1; int i = 0; local = frame->local; priv = this->private; - int_lock = &local->internal_lock; - call_count = afr_locked_children_count (int_lock->entry_locked_nodes, - priv->child_count); + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); if (call_count == 0) { local->transaction.resume (frame, this); @@ -692,13 +794,14 @@ afr_mkdir_wind (call_frame_t *frame, xlator_t *this) local->call_count = call_count; for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] && int_lock->entry_locked_nodes[i]) { + if (local->transaction.pre_op[i]) { STACK_WIND_COOKIE (frame, afr_mkdir_wind_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->mkdir, &local->loc, local->cont.mkdir.mode, - local->cont.mkdir.params); + local->umask, + local->xdata_req); if (!--call_count) break; } @@ -722,17 +825,16 @@ afr_mkdir_done (call_frame_t *frame, xlator_t *this) return 0; } - int afr_mkdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dict_t *params) + loc_t *loc, mode_t mode, mode_t umask, dict_t *params) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - int ret = -1; - int op_ret = -1; - int op_errno = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -740,20 +842,20 @@ afr_mkdir (call_frame_t *frame, xlator_t *this, priv = this->private; + QUORUM_CHECK(mkdir,out); + transaction_frame = copy_frame (frame); if (!transaction_frame) { + 
op_errno = ENOMEM; goto out; } - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); + local = transaction_frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } - - transaction_frame->local = local; loc_copy (&local->loc, loc); @@ -765,31 +867,47 @@ afr_mkdir (call_frame_t *frame, xlator_t *this, UNLOCK (&priv->read_child_lock); local->cont.mkdir.mode = mode; + local->umask = umask; if (params) - local->cont.mkdir.params = dict_ref (params); - - if (loc->parent) - local->cont.mkdir.parent_ino = loc->parent->ino; + local->xdata_req = dict_ref (params); + local->op = GF_FOP_MKDIR; local->transaction.fop = afr_mkdir_wind; local->transaction.done = afr_mkdir_done; local->transaction.unwind = afr_mkdir_unwind; - afr_build_parent_loc (&local->transaction.parent_loc, loc); + ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, + &op_errno); + if (ret) + goto out; local->transaction.main_frame = frame; local->transaction.basename = AFR_BASENAME (loc->path); + int_lock = &local->internal_lock; + + int_lock->lockee_count = 0; + ret = afr_init_entry_lockee (&int_lock->lockee[0], local, + &local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) + goto out; - afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + int_lock->lockee_count++; + ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { + if (ret < 0) { if (transaction_frame) AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (mkdir, frame, op_ret, op_errno, - NULL, NULL, NULL, NULL); + AFR_STACK_UNWIND (mkdir, frame, -1, op_errno, + NULL, NULL, NULL, NULL, NULL); } return 0; @@ -805,7 +923,6 @@ afr_link_unwind (call_frame_t *frame, xlator_t *this) { call_frame_t *main_frame 
= NULL; afr_local_t *local = NULL; - struct iatt *unwind_buf = NULL; local = frame->local; @@ -819,17 +936,13 @@ afr_link_unwind (call_frame_t *frame, xlator_t *this) UNLOCK (&frame->lock); if (main_frame) { - if (local->cont.link.read_child_buf.ia_ino) { - unwind_buf = &local->cont.link.read_child_buf; - } else { - unwind_buf = &local->cont.link.buf; - } - AFR_STACK_UNWIND (link, main_frame, local->op_ret, local->op_errno, - local->cont.link.inode, - unwind_buf, &local->cont.link.preparent, - &local->cont.link.postparent); + local->cont.dir_fop.inode, + &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, + NULL); } return 0; @@ -840,59 +953,25 @@ int afr_link_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; int call_count = -1; int child_index = -1; - int32_t *fresh_children = NULL; - - local = frame->local; - priv = this->private; child_index = (long) cookie; LOCK (&frame->lock); { - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - local->op_ret = op_ret; - - if (local->success_count == 0) { - local->cont.link.buf = *buf; - } - - if (child_index == local->read_child_index) { - local->cont.link.read_child_buf = *buf; - local->cont.link.preparent = *preparent; - local->cont.link.postparent = *postparent; - } - - local->cont.link.inode = inode; - - fresh_children = local->fresh_children; - fresh_children[local->success_count] = child_index; - local->success_count++; - } - - local->op_errno = op_errno; + __dir_entry_fop_common_cbk (frame, child_index, this, + op_ret, op_errno, inode, buf, + preparent, postparent, NULL, NULL); } UNLOCK (&frame->lock); call_count = afr_frame_return (frame); - if (call_count == 0) { - 
afr_set_read_ctx_from_policy (this, inode, - local->fresh_children, - local->read_child_index, - priv->read_child); - local->transaction.unwind (frame, this); - - local->transaction.resume (frame, this); - } + if (call_count == 0) + afr_dir_fop_done (frame, this); return 0; } @@ -903,16 +982,14 @@ afr_link_wind (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - afr_internal_lock_t *int_lock = NULL; int call_count = -1; int i = 0; local = frame->local; priv = this->private; - int_lock = &local->internal_lock; - call_count = afr_locked_children_count (int_lock->entry_locked_nodes, - priv->child_count); + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); if (call_count == 0) { local->transaction.resume (frame, this); @@ -922,12 +999,13 @@ afr_link_wind (call_frame_t *frame, xlator_t *this) local->call_count = call_count; for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] && int_lock->entry_locked_nodes[i]) { - STACK_WIND_COOKIE (frame, afr_link_wind_cbk, (void *) (long) i, + if (local->transaction.pre_op[i]) { + STACK_WIND_COOKIE (frame, afr_link_wind_cbk, + (void *) (long) i, priv->children[i], priv->children[i]->fops->link, &local->loc, - &local->newloc); + &local->newloc, local->xdata_req); if (!--call_count) break; @@ -953,14 +1031,14 @@ afr_link_done (call_frame_t *frame, xlator_t *this) int afr_link (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc) + loc_t *oldloc, loc_t *newloc, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - int ret = -1; - int op_ret = -1; - int op_errno = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -968,23 +1046,25 @@ afr_link (call_frame_t *frame, 
xlator_t *this, priv = this->private; + QUORUM_CHECK(link,out); + transaction_frame = copy_frame (frame); if (!transaction_frame) { + op_errno = ENOMEM; goto out; } - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); + local = transaction_frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } - - transaction_frame->local = local; loc_copy (&local->loc, oldloc); loc_copy (&local->newloc, newloc); + if (xdata) + local->xdata_req = dict_ref (xdata); LOCK (&priv->read_child_lock); { @@ -993,30 +1073,41 @@ afr_link (call_frame_t *frame, xlator_t *this, } UNLOCK (&priv->read_child_lock); - local->cont.link.ino = oldloc->inode->ino; - - if (oldloc->parent) - local->cont.link.parent_ino = newloc->parent->ino; - + local->op = GF_FOP_LINK; local->transaction.fop = afr_link_wind; local->transaction.done = afr_link_done; local->transaction.unwind = afr_link_unwind; - afr_build_parent_loc (&local->transaction.parent_loc, oldloc); + ret = afr_build_parent_loc (&local->transaction.parent_loc, newloc, + &op_errno); + if (ret) + goto out; local->transaction.main_frame = frame; - local->transaction.basename = AFR_BASENAME (oldloc->path); - local->transaction.new_basename = AFR_BASENAME (newloc->path); + local->transaction.basename = AFR_BASENAME (newloc->path); + int_lock = &local->internal_lock; - afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + int_lock->lockee_count = 0; + ret = afr_init_entry_lockee (&int_lock->lockee[0], local, + &local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) + goto out; - op_ret = 0; + int_lock->lockee_count++; + ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } + ret = 0; out: - if (op_ret == -1) { + if (ret < 0) { if (transaction_frame) AFR_STACK_DESTROY 
(transaction_frame); - AFR_STACK_UNWIND (link, frame, op_ret, op_errno, - NULL, NULL, NULL, NULL); + AFR_STACK_UNWIND (link, frame, -1, op_errno, + NULL, NULL, NULL, NULL, NULL); } return 0; @@ -1032,7 +1123,6 @@ afr_symlink_unwind (call_frame_t *frame, xlator_t *this) { call_frame_t *main_frame = NULL; afr_local_t *local = NULL; - struct iatt *unwind_buf = NULL; local = frame->local; @@ -1046,17 +1136,13 @@ afr_symlink_unwind (call_frame_t *frame, xlator_t *this) UNLOCK (&frame->lock); if (main_frame) { - if (local->cont.symlink.read_child_buf.ia_ino) { - unwind_buf = &local->cont.symlink.read_child_buf; - } else { - unwind_buf = &local->cont.symlink.buf; - } - AFR_STACK_UNWIND (symlink, main_frame, local->op_ret, local->op_errno, - local->cont.symlink.inode, - unwind_buf, &local->cont.symlink.preparent, - &local->cont.symlink.postparent); + local->cont.dir_fop.inode, + &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, + NULL); } return 0; @@ -1067,58 +1153,25 @@ int afr_symlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; int call_count = -1; int child_index = -1; - int32_t *fresh_children = NULL; - - local = frame->local; - priv = this->private; child_index = (long) cookie; LOCK (&frame->lock); { - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - local->op_ret = op_ret; - - if (local->success_count == 0) - local->cont.symlink.buf = *buf; - - if (child_index == local->read_child_index) { - local->cont.symlink.read_child_buf = *buf; - local->cont.symlink.preparent = *preparent; - local->cont.symlink.postparent = *postparent; - } - - local->cont.symlink.inode = inode; - - fresh_children = local->fresh_children; - 
fresh_children[local->success_count] = child_index; - local->success_count++; - } - - local->op_errno = op_errno; + __dir_entry_fop_common_cbk (frame, child_index, this, + op_ret, op_errno, inode, buf, + preparent, postparent, NULL, NULL); } UNLOCK (&frame->lock); call_count = afr_frame_return (frame); - if (call_count == 0) { - afr_set_read_ctx_from_policy (this, inode, - local->fresh_children, - local->read_child_index, - priv->read_child); - local->transaction.unwind (frame, this); - - local->transaction.resume (frame, this); - } + if (call_count == 0) + afr_dir_fop_done (frame, this); return 0; } @@ -1129,16 +1182,14 @@ afr_symlink_wind (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - afr_internal_lock_t *int_lock = NULL; int call_count = -1; int i = 0; local = frame->local; priv = this->private; - int_lock = &local->internal_lock; - call_count = afr_locked_children_count (int_lock->entry_locked_nodes, - priv->child_count); + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); if (call_count == 0) { local->transaction.resume (frame, this); @@ -1148,14 +1199,15 @@ afr_symlink_wind (call_frame_t *frame, xlator_t *this) local->call_count = call_count; for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] && int_lock->entry_locked_nodes[i]) { + if (local->transaction.pre_op[i]) { STACK_WIND_COOKIE (frame, afr_symlink_wind_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->symlink, local->cont.symlink.linkpath, &local->loc, - local->cont.symlink.params); + local->umask, + local->xdata_req); if (!--call_count) break; @@ -1182,14 +1234,14 @@ afr_symlink_done (call_frame_t *frame, xlator_t *this) int afr_symlink (call_frame_t *frame, xlator_t *this, - const char *linkpath, loc_t *loc, dict_t *params) + const char *linkpath, loc_t *loc, mode_t umask, dict_t *params) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * 
transaction_frame = NULL; - int ret = -1; - int op_ret = -1; - int op_errno = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -1197,20 +1249,20 @@ afr_symlink (call_frame_t *frame, xlator_t *this, priv = this->private; + QUORUM_CHECK(symlink,out); + transaction_frame = copy_frame (frame); if (!transaction_frame) { + op_errno = ENOMEM; goto out; } - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); + local = transaction_frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } - - transaction_frame->local = local; loc_copy (&local->loc, loc); @@ -1222,30 +1274,46 @@ afr_symlink (call_frame_t *frame, xlator_t *this, UNLOCK (&priv->read_child_lock); local->cont.symlink.linkpath = gf_strdup (linkpath); + local->umask = umask; if (params) - local->cont.symlink.params = dict_ref (params); - - if (loc->parent) - local->cont.symlink.parent_ino = loc->parent->ino; + local->xdata_req = dict_ref (params); + local->op = GF_FOP_SYMLINK; local->transaction.fop = afr_symlink_wind; local->transaction.done = afr_symlink_done; local->transaction.unwind = afr_symlink_unwind; - afr_build_parent_loc (&local->transaction.parent_loc, loc); + ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, + &op_errno); + if (ret) + goto out; local->transaction.main_frame = frame; local->transaction.basename = AFR_BASENAME (loc->path); + int_lock = &local->internal_lock; - afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + int_lock->lockee_count = 0; + ret = afr_init_entry_lockee (&int_lock->lockee[0], local, + &local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) + goto out; + + 
int_lock->lockee_count++; + ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { + if (ret < 0) { if (transaction_frame) AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (symlink, frame, op_ret, op_errno, - NULL, NULL, NULL, NULL); + AFR_STACK_UNWIND (symlink, frame, -1, op_errno, + NULL, NULL, NULL, NULL, NULL); } return 0; @@ -1260,7 +1328,6 @@ afr_rename_unwind (call_frame_t *frame, xlator_t *this) { call_frame_t *main_frame = NULL; afr_local_t *local = NULL; - struct iatt *unwind_buf = NULL; local = frame->local; @@ -1274,19 +1341,14 @@ afr_rename_unwind (call_frame_t *frame, xlator_t *this) UNLOCK (&frame->lock); if (main_frame) { - if (local->cont.rename.read_child_buf.ia_ino) { - unwind_buf = &local->cont.rename.read_child_buf; - } else { - unwind_buf = &local->cont.rename.buf; - } - AFR_STACK_UNWIND (rename, main_frame, local->op_ret, local->op_errno, - unwind_buf, - &local->cont.rename.preoldparent, - &local->cont.rename.postoldparent, - &local->cont.rename.prenewparent, - &local->cont.rename.postnewparent); + &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, + &local->cont.dir_fop.prenewparent, + &local->cont.dir_fop.postnewparent, + NULL); } return 0; @@ -1297,7 +1359,8 @@ int afr_rename_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *buf, struct iatt *preoldparent, struct iatt *postoldparent, - struct iatt *prenewparent, struct iatt *postnewparent) + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) { afr_local_t * local = NULL; int call_count = -1; @@ -1311,38 +1374,22 @@ afr_rename_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, { if (afr_fop_failed (op_ret, op_errno) && op_errno != ENOTEMPTY) afr_transaction_fop_failed (frame, this, child_index); + local->op_errno = op_errno; + 
local->child_errno[child_index] = op_errno; - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - - if (buf) { - local->cont.rename.buf = *buf; - } - - local->success_count++; - } - - if (child_index == local->read_child_index) { - local->cont.rename.read_child_buf = *buf; - - local->cont.rename.preoldparent = *preoldparent; - local->cont.rename.postoldparent = *postoldparent; - local->cont.rename.prenewparent = *prenewparent; - local->cont.rename.postnewparent = *postnewparent; - } - } + if (op_ret > -1) + __dir_entry_fop_common_cbk (frame, child_index, this, + op_ret, op_errno, NULL, buf, + preoldparent, postoldparent, + prenewparent, postnewparent); - local->op_errno = op_errno; } UNLOCK (&frame->lock); call_count = afr_frame_return (frame); - if (call_count == 0) { - local->transaction.unwind (frame, this); - local->transaction.resume (frame, this); - } + if (call_count == 0) + afr_dir_fop_done (frame, this); return 0; } @@ -1353,16 +1400,14 @@ afr_rename_wind (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - afr_internal_lock_t *int_lock = NULL; int call_count = -1; int i = 0; local = frame->local; priv = this->private; - int_lock = &local->internal_lock; - call_count = afr_locked_children_count (int_lock->entry_locked_nodes, - priv->child_count); + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); if (call_count == 0) { local->transaction.resume (frame, this); @@ -1372,13 +1417,13 @@ afr_rename_wind (call_frame_t *frame, xlator_t *this) local->call_count = call_count; for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] && int_lock->entry_locked_nodes[i]) { + if (local->transaction.pre_op[i]) { STACK_WIND_COOKIE (frame, afr_rename_wind_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->rename, &local->loc, - &local->newloc); + &local->newloc, NULL); if (!--call_count) break; } @@ -1403,14 +1448,15 @@ 
afr_rename_done (call_frame_t *frame, xlator_t *this) int afr_rename (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc) + loc_t *oldloc, loc_t *newloc, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - int ret = -1; - int op_ret = -1; - int op_errno = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; + int nlockee = 0; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -1418,54 +1464,90 @@ afr_rename (call_frame_t *frame, xlator_t *this, priv = this->private; + QUORUM_CHECK(rename,out); + transaction_frame = copy_frame (frame); if (!transaction_frame) { + op_errno = ENOMEM; goto out; } - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); + local = transaction_frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } - - transaction_frame->local = local; loc_copy (&local->loc, oldloc); loc_copy (&local->newloc, newloc); local->read_child_index = afr_inode_get_read_ctx (this, oldloc->inode, NULL); - local->cont.rename.ino = oldloc->inode->ino; - - if (oldloc->parent) - local->cont.rename.oldparent_ino = oldloc->parent->ino; - if (newloc->parent) - local->cont.rename.newparent_ino = newloc->parent->ino; - + local->op = GF_FOP_RENAME; local->transaction.fop = afr_rename_wind; local->transaction.done = afr_rename_done; local->transaction.unwind = afr_rename_unwind; - afr_build_parent_loc (&local->transaction.parent_loc, oldloc); - afr_build_parent_loc (&local->transaction.new_parent_loc, newloc); + ret = afr_build_parent_loc (&local->transaction.parent_loc, oldloc, + &op_errno); + if (ret) + goto out; + ret = afr_build_parent_loc (&local->transaction.new_parent_loc, newloc, + 
&op_errno); + if (ret) + goto out; local->transaction.main_frame = frame; local->transaction.basename = AFR_BASENAME (oldloc->path); local->transaction.new_basename = AFR_BASENAME (newloc->path); + int_lock = &local->internal_lock; + + int_lock->lockee_count = nlockee = 0; + ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local, + &local->transaction.new_parent_loc, + local->transaction.new_basename, + priv->child_count); + if (ret) + goto out; + + nlockee++; + ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local, + &local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) + goto out; - afr_transaction (transaction_frame, this, AFR_ENTRY_RENAME_TRANSACTION); + nlockee++; + if (local->newloc.inode && IA_ISDIR (local->newloc.inode->ia_type)) { + ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local, + &local->newloc, + NULL, + priv->child_count); + if (ret) + goto out; - op_ret = 0; + nlockee++; + } + qsort (int_lock->lockee, nlockee, sizeof (*int_lock->lockee), + afr_entry_lockee_cmp); + int_lock->lockee_count = nlockee; + + ret = afr_transaction (transaction_frame, this, AFR_ENTRY_RENAME_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + ret = 0; out: - if (op_ret == -1) { + if (ret < 0) { if (transaction_frame) AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (rename, frame, op_ret, op_errno, - NULL, NULL, NULL, NULL, NULL); + AFR_STACK_UNWIND (rename, frame, -1, op_errno, + NULL, NULL, NULL, NULL, NULL, NULL); } return 0; @@ -1495,8 +1577,9 @@ afr_unlink_unwind (call_frame_t *frame, xlator_t *this) if (main_frame) { AFR_STACK_UNWIND (unlink, main_frame, local->op_ret, local->op_errno, - &local->cont.unlink.preparent, - &local->cont.unlink.postparent); + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, + NULL); } return 0; @@ -1506,7 +1589,7 @@ afr_unlink_unwind (call_frame_t *frame, xlator_t *this) int afr_unlink_wind_cbk (call_frame_t *frame, void 
*cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { afr_local_t * local = NULL; int call_count = -1; @@ -1519,36 +1602,15 @@ afr_unlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (child_index == local->read_child_index) { local->read_child_returned = _gf_true; } - - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.unlink.preparent = *preparent; - local->cont.unlink.postparent = *postparent; - } - - if (child_index == local->read_child_index) { - local->cont.unlink.preparent = *preparent; - local->cont.unlink.postparent = *postparent; - } - - local->success_count++; - } - - local->op_errno = op_errno; + __dir_entry_fop_common_cbk (frame, child_index, this, + op_ret, op_errno, NULL, NULL, + preparent, postparent, NULL, NULL); } UNLOCK (&frame->lock); call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.unwind (frame, this); - - local->transaction.resume (frame, this); - } + if (call_count == 0) + afr_dir_fop_done (frame, this); return 0; } @@ -1559,16 +1621,14 @@ afr_unlink_wind (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - afr_internal_lock_t *int_lock = NULL; int call_count = -1; int i = 0; local = frame->local; priv = this->private; - int_lock = &local->internal_lock; - call_count = afr_locked_children_count (int_lock->entry_locked_nodes, - priv->child_count); + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); if (call_count == 0) { local->transaction.resume (frame, this); @@ -1578,12 +1638,13 @@ afr_unlink_wind (call_frame_t *frame, xlator_t *this) local->call_count = call_count; for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] && 
int_lock->entry_locked_nodes[i]) { + if (local->transaction.pre_op[i]) { STACK_WIND_COOKIE (frame, afr_unlink_wind_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->unlink, - &local->loc); + &local->loc, local->xflag, + local->xdata_req); if (!--call_count) break; @@ -1609,14 +1670,14 @@ afr_unlink_done (call_frame_t *frame, xlator_t *this) int32_t afr_unlink (call_frame_t *frame, xlator_t *this, - loc_t *loc) + loc_t *loc, int xflag, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - int ret = -1; - int op_ret = -1; - int op_errno = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -1624,44 +1685,62 @@ afr_unlink (call_frame_t *frame, xlator_t *this, priv = this->private; + QUORUM_CHECK(unlink,out); + transaction_frame = copy_frame (frame); if (!transaction_frame) { + op_errno = ENOMEM; goto out; } - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); + local = transaction_frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } - - transaction_frame->local = local; loc_copy (&local->loc, loc); + local->xflag = xflag; + if (xdata) + local->xdata_req = dict_ref (xdata); - if (loc->parent) - local->cont.unlink.parent_ino = loc->parent->ino; - + local->op = GF_FOP_UNLINK; local->transaction.fop = afr_unlink_wind; local->transaction.done = afr_unlink_done; local->transaction.unwind = afr_unlink_unwind; - afr_build_parent_loc (&local->transaction.parent_loc, loc); + ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, + &op_errno); + if (ret) + goto out; local->transaction.main_frame = frame; local->transaction.basename = 
AFR_BASENAME (loc->path); + int_lock = &local->internal_lock; - afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + int_lock->lockee_count = 0; + ret = afr_init_entry_lockee (&int_lock->lockee[0], local, + &local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) + goto out; - op_ret = 0; + int_lock->lockee_count++; + ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + ret = 0; out: - if (op_ret == -1) { + if (ret < 0) { if (transaction_frame) AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (unlink, frame, op_ret, op_errno, - NULL, NULL); + AFR_STACK_UNWIND (unlink, frame, -1, op_errno, + NULL, NULL, NULL); } return 0; @@ -1693,8 +1772,9 @@ afr_rmdir_unwind (call_frame_t *frame, xlator_t *this) if (main_frame) { AFR_STACK_UNWIND (rmdir, main_frame, local->op_ret, local->op_errno, - &local->cont.rmdir.preparent, - &local->cont.rmdir.postparent); + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, + NULL); } return 0; @@ -1704,7 +1784,7 @@ afr_rmdir_unwind (call_frame_t *frame, xlator_t *this) int afr_rmdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { afr_local_t * local = NULL; int call_count = -1; @@ -1718,36 +1798,22 @@ afr_rmdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (child_index == read_child) { local->read_child_returned = _gf_true; } - if (afr_fop_failed (op_ret, op_errno) && (op_errno != ENOTEMPTY)) afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.rmdir.preparent = *preparent; - local->cont.rmdir.postparent = *postparent; - - } - - if (child_index == read_child) { - local->cont.rmdir.preparent = *preparent; - 
local->cont.rmdir.postparent = *postparent; - } - - local->success_count++; - } - local->op_errno = op_errno; + local->child_errno[child_index] = op_errno; + if (op_ret > -1) + __dir_entry_fop_common_cbk (frame, child_index, this, + op_ret, op_errno, NULL, NULL, + preparent, postparent, NULL, + NULL); + } UNLOCK (&frame->lock); call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.unwind (frame, this); - local->transaction.resume (frame, this); - } + if (call_count == 0) + afr_dir_fop_done (frame, this); return 0; } @@ -1758,16 +1824,14 @@ afr_rmdir_wind (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - afr_internal_lock_t *int_lock = NULL; int call_count = -1; int i = 0; local = frame->local; priv = this->private; - int_lock = &local->internal_lock; - call_count = afr_locked_children_count (int_lock->entry_locked_nodes, - priv->child_count); + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); if (call_count == 0) { local->transaction.resume (frame, this); @@ -1777,12 +1841,13 @@ afr_rmdir_wind (call_frame_t *frame, xlator_t *this) local->call_count = call_count; for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] && int_lock->entry_locked_nodes[i]) { + if (local->transaction.pre_op[i]) { STACK_WIND_COOKIE (frame, afr_rmdir_wind_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->rmdir, - &local->loc, local->cont.rmdir.flags); + &local->loc, local->cont.rmdir.flags, + NULL); if (!--call_count) break; @@ -1808,14 +1873,15 @@ afr_rmdir_done (call_frame_t *frame, xlator_t *this) int afr_rmdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, int flags) + loc_t *loc, int flags, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - int ret = -1; - int op_ret = -1; - int op_errno = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + 
afr_internal_lock_t *int_lock = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; + int nlockee = 0; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -1823,45 +1889,71 @@ afr_rmdir (call_frame_t *frame, xlator_t *this, priv = this->private; + QUORUM_CHECK(rmdir,out); + transaction_frame = copy_frame (frame); if (!transaction_frame) { + op_errno = ENOMEM; goto out; } - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); + local = transaction_frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } - - transaction_frame->local = local; local->cont.rmdir.flags = flags; loc_copy (&local->loc, loc); - if (loc->parent) - local->cont.rmdir.parent_ino = loc->parent->ino; - + local->op = GF_FOP_RMDIR; local->transaction.fop = afr_rmdir_wind; local->transaction.done = afr_rmdir_done; local->transaction.unwind = afr_rmdir_unwind; - afr_build_parent_loc (&local->transaction.parent_loc, loc); + ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, + &op_errno); + if (ret) + goto out; local->transaction.main_frame = frame; local->transaction.basename = AFR_BASENAME (loc->path); + int_lock = &local->internal_lock; - afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + int_lock->lockee_count = nlockee = 0; + ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local, + &local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) + goto out; + + nlockee++; + ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local, + &local->loc, + NULL, + priv->child_count); + if (ret) + goto out; - op_ret = 0; + nlockee++; + qsort (int_lock->lockee, nlockee, sizeof (*int_lock->lockee), + afr_entry_lockee_cmp); + int_lock->lockee_count = nlockee; + + ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + if 
(ret < 0) { + op_errno = -ret; + goto out; + } + + ret = 0; out: - if (op_ret == -1) { + if (ret < 0) { if (transaction_frame) AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (rmdir, frame, op_ret, op_errno, - NULL, NULL); + AFR_STACK_UNWIND (rmdir, frame, -1, op_errno, NULL, NULL, NULL); } return 0; diff --git a/xlators/cluster/afr/src/afr-dir-write.h b/xlators/cluster/afr/src/afr-dir-write.h index 0290c6350..02f0a3682 100644 --- a/xlators/cluster/afr/src/afr-dir-write.h +++ b/xlators/cluster/afr/src/afr-dir-write.h @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ #ifndef __DIR_WRITE_H__ @@ -23,38 +14,34 @@ int32_t afr_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, mode_t mode, - fd_t *fd, dict_t *params); + mode_t umask, fd_t *fd, dict_t *xdata); int32_t afr_mknod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dev_t dev, dict_t *params); + loc_t *loc, mode_t mode, dev_t dev, mode_t umask, dict_t *xdata); int32_t afr_mkdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dict_t *params); + loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata); int32_t afr_unlink (call_frame_t *frame, xlator_t *this, - loc_t *loc); + loc_t *loc, int xflag, dict_t *xdata); int32_t afr_rmdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, int flags); + loc_t *loc, int flags, dict_t *xdata); int32_t afr_link (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc); + loc_t *oldloc, loc_t *newloc, dict_t *xdata); int32_t afr_rename (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc); + loc_t *oldloc, loc_t *newloc, dict_t *xdata); int afr_symlink (call_frame_t *frame, xlator_t *this, - const char *linkpath, loc_t *oldloc, dict_t *params); - -int32_t -afr_setdents (call_frame_t *frame, xlator_t *this, - fd_t *fd, int32_t flags, dir_entry_t *entries, int32_t count); + const char *linkpath, loc_t *oldloc, mode_t umask, dict_t *params); #endif /* __DIR_WRITE_H__ */ diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c index 1258afe09..e06e3b2f2 100644 --- a/xlators/cluster/afr/src/afr-inode-read.c +++ b/xlators/cluster/afr/src/afr-inode-read.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. 
- GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ @@ -58,7 +49,7 @@ int32_t afr_access_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) + xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_private_t * priv = NULL; afr_local_t * local = NULL; @@ -92,12 +83,13 @@ afr_access_cbk (call_frame_t *frame, void *cookie, (void *) (long) read_child, children[next_call_child], children[next_call_child]->fops->access, - &local->loc, local->cont.access.mask); + &local->loc, local->cont.access.mask, + NULL); } out: if (unwind) { - AFR_STACK_UNWIND (access, frame, op_ret, op_errno); + AFR_STACK_UNWIND (access, frame, op_ret, op_errno, xdata); } return 0; @@ -105,15 +97,16 @@ out: int32_t -afr_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask) +afr_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask, + dict_t *xdata) { afr_private_t *priv = NULL; xlator_t **children = NULL; int call_child = 0; afr_local_t *local = NULL; - int32_t op_ret = -1; int32_t op_errno = 0; int32_t read_child = -1; + int ret = -1; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, 
out); @@ -124,14 +117,14 @@ afr_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask) children = priv->children; - ALLOC_OR_GOTO (local, afr_local_t, out); - frame->local = local; + AFR_SBRAIN_CHECK_LOC (loc, out); - op_ret = AFR_LOCAL_INIT (local, priv); - if (op_ret < 0) { - op_errno = -op_ret; + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; + + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } local->fresh_children = afr_children_create (priv->child_count); if (!local->fresh_children) { @@ -142,13 +135,12 @@ afr_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask) read_child = afr_inode_get_read_ctx (this, loc->inode, local->fresh_children); - op_ret = afr_get_call_child (this, local->child_up, read_child, + ret = afr_get_call_child (this, local->child_up, read_child, local->fresh_children, &call_child, &local->cont.access.last_index); - if (op_ret < 0) { - op_errno = -op_ret; - op_ret = -1; + if (ret < 0) { + op_errno = -ret; goto out; } @@ -159,13 +151,12 @@ afr_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask) (void *) (long) call_child, children[call_child], children[call_child]->fops->access, - loc, mask); + loc, mask, xdata); - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (access, frame, op_ret, op_errno); - } + if (ret < 0) + AFR_STACK_UNWIND (access, frame, -1, op_errno, NULL); return 0; } @@ -177,7 +168,7 @@ out: int32_t afr_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iatt *buf) + struct iatt *buf, dict_t *xdata) { afr_private_t * priv = NULL; afr_local_t * local = NULL; @@ -211,12 +202,12 @@ afr_stat_cbk (call_frame_t *frame, void *cookie, (void *) (long) read_child, children[next_call_child], children[next_call_child]->fops->stat, - &local->loc); + &local->loc, NULL); } out: if (unwind) { - AFR_STACK_UNWIND (stat, frame, op_ret, op_errno, buf); + AFR_STACK_UNWIND 
(stat, frame, op_ret, op_errno, buf, xdata); } return 0; @@ -224,15 +215,15 @@ out: int32_t -afr_stat (call_frame_t *frame, xlator_t *this, loc_t *loc) +afr_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; xlator_t **children = NULL; int call_child = 0; - int32_t op_ret = -1; int32_t op_errno = 0; int32_t read_child = -1; + int ret = -1; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -243,13 +234,14 @@ afr_stat (call_frame_t *frame, xlator_t *this, loc_t *loc) children = priv->children; - ALLOC_OR_GOTO (local, afr_local_t, out); - frame->local = local; - op_ret = AFR_LOCAL_INIT (local, priv); - if (op_ret < 0) { - op_errno = -op_ret; + AFR_SBRAIN_CHECK_LOC (loc, out); + + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; + + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } local->fresh_children = afr_children_create (priv->child_count); if (!local->fresh_children) { @@ -259,29 +251,25 @@ afr_stat (call_frame_t *frame, xlator_t *this, loc_t *loc) read_child = afr_inode_get_read_ctx (this, loc->inode, local->fresh_children); - op_ret = afr_get_call_child (this, local->child_up, read_child, + ret = afr_get_call_child (this, local->child_up, read_child, local->fresh_children, &call_child, &local->cont.stat.last_index); - if (op_ret < 0) { - op_errno = -op_ret; - op_ret = -1; + if (ret < 0) { + op_errno = -ret; goto out; } loc_copy (&local->loc, loc); - local->cont.stat.ino = loc->inode->ino; - STACK_WIND_COOKIE (frame, afr_stat_cbk, (void *) (long) call_child, children[call_child], children[call_child]->fops->stat, - loc); + loc, xdata); - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (stat, frame, op_ret, op_errno, NULL); - } + if (ret < 0) + AFR_STACK_UNWIND (stat, frame, -1, op_errno, NULL, NULL); return 0; } @@ -293,7 +281,8 @@ out: int32_t afr_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t 
*this, - int32_t op_ret, int32_t op_errno, struct iatt *buf) + int32_t op_ret, int32_t op_errno, struct iatt *buf, + dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; @@ -327,12 +316,12 @@ afr_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, (void *) (long) read_child, children[next_call_child], children[next_call_child]->fops->fstat, - local->fd); + local->fd, NULL); } out: if (unwind) { - AFR_STACK_UNWIND (fstat, frame, op_ret, op_errno, buf); + AFR_STACK_UNWIND (fstat, frame, op_ret, op_errno, buf, xdata); } return 0; @@ -341,15 +330,15 @@ out: int32_t afr_fstat (call_frame_t *frame, xlator_t *this, - fd_t *fd) + fd_t *fd, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; xlator_t **children = NULL; int call_child = 0; - int32_t op_ret = -1; int32_t op_errno = 0; int32_t read_child = 0; + int ret = -1; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -363,14 +352,14 @@ afr_fstat (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (fd->inode, out); - ALLOC_OR_GOTO (local, afr_local_t, out); - frame->local = local; + AFR_SBRAIN_CHECK_FD (fd, out); - op_ret = AFR_LOCAL_INIT (local, priv); - if (op_ret < 0) { - op_errno = -op_ret; + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; + + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } local->fresh_children = afr_children_create (priv->child_count); if (!local->fresh_children) { @@ -383,35 +372,28 @@ afr_fstat (call_frame_t *frame, xlator_t *this, - op_ret = afr_get_call_child (this, local->child_up, read_child, + ret = afr_get_call_child (this, local->child_up, read_child, local->fresh_children, &call_child, &local->cont.fstat.last_index); - if (op_ret < 0) { - op_errno = -op_ret; - op_ret = -1; + if (ret < 0) { + op_errno = -ret; goto out; } - local->cont.fstat.ino = fd->inode->ino; local->fd = fd_ref (fd); - op_ret = afr_open_fd_fix (frame, this, _gf_false); - if (op_ret) { - op_errno = -op_ret; 
- op_ret = -1; - goto out; - } + afr_open_fd_fix (fd, this); + STACK_WIND_COOKIE (frame, afr_fstat_cbk, (void *) (long) call_child, children[call_child], children[call_child]->fops->fstat, - fd); + fd, xdata); - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (fstat, frame, op_ret, op_errno, NULL); - } + if (ret < 0) + AFR_STACK_UNWIND (fstat, frame, -1, op_errno, NULL, NULL); return 0; } @@ -423,7 +405,7 @@ out: int32_t afr_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - const char *buf, struct iatt *sbuf) + const char *buf, struct iatt *sbuf, dict_t *xdata) { afr_private_t * priv = NULL; afr_local_t * local = NULL; @@ -457,12 +439,13 @@ afr_readlink_cbk (call_frame_t *frame, void *cookie, children[next_call_child], children[next_call_child]->fops->readlink, &local->loc, - local->cont.readlink.size); + local->cont.readlink.size, NULL); } out: if (unwind) { - AFR_STACK_UNWIND (readlink, frame, op_ret, op_errno, buf, sbuf); + AFR_STACK_UNWIND (readlink, frame, op_ret, op_errno, buf, sbuf, + xdata); } return 0; @@ -471,15 +454,15 @@ out: int32_t afr_readlink (call_frame_t *frame, xlator_t *this, - loc_t *loc, size_t size) + loc_t *loc, size_t size, dict_t *xdata) { afr_private_t *priv = NULL; xlator_t **children = NULL; int call_child = 0; afr_local_t *local = NULL; - int32_t op_ret = -1; int32_t op_errno = 0; int32_t read_child = -1; + int ret = -1; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -490,13 +473,14 @@ afr_readlink (call_frame_t *frame, xlator_t *this, children = priv->children; - ALLOC_OR_GOTO (local, afr_local_t, out); - frame->local = local; - op_ret = AFR_LOCAL_INIT (local, priv); - if (op_ret < 0) { - op_errno = -op_ret; + AFR_SBRAIN_CHECK_LOC (loc, out); + + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; + + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } local->fresh_children = afr_children_create 
(priv->child_count); if (!local->fresh_children) { @@ -505,32 +489,29 @@ afr_readlink (call_frame_t *frame, xlator_t *this, } read_child = afr_inode_get_read_ctx (this, loc->inode, local->fresh_children); - op_ret = afr_get_call_child (this, local->child_up, read_child, + ret = afr_get_call_child (this, local->child_up, read_child, local->fresh_children, &call_child, &local->cont.readlink.last_index); - if (op_ret < 0) { - op_errno = -op_ret; - op_ret = -1; + if (ret < 0) { + op_errno = -ret; goto out; } loc_copy (&local->loc, loc); local->cont.readlink.size = size; - local->cont.readlink.ino = loc->inode->ino; STACK_WIND_COOKIE (frame, afr_readlink_cbk, (void *) (long) call_child, children[call_child], children[call_child]->fops->readlink, - loc, size); + loc, size, xdata); - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (readlink, frame, op_ret, op_errno, NULL, NULL); - } + if (ret < 0) + AFR_STACK_UNWIND (readlink, frame, -1, op_errno, NULL, NULL, NULL); return 0; } @@ -545,7 +526,7 @@ struct _xattr_key { }; -void +int __gather_xattr_keys (dict_t *dict, char *key, data_t *value, void *data) { @@ -557,13 +538,14 @@ __gather_xattr_keys (dict_t *dict, char *key, data_t *value, xkey = GF_CALLOC (1, sizeof (*xkey), gf_afr_mt_xattr_key); if (!xkey) - return; + return -1; xkey->key = key; INIT_LIST_HEAD (&xkey->list); list_add_tail (&xkey->list, list); } + return 0; } @@ -593,7 +575,7 @@ __filter_xattrs (dict_t *dict) int32_t afr_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *dict) + dict_t *dict, dict_t *xdata) { afr_private_t * priv = NULL; afr_local_t * local = NULL; @@ -627,7 +609,8 @@ afr_getxattr_cbk (call_frame_t *frame, void *cookie, children[next_call_child], children[next_call_child]->fops->getxattr, &local->loc, - local->cont.getxattr.name); + local->cont.getxattr.name, + NULL); } out: @@ -635,39 +618,620 @@ out: if (op_ret >= 0 && dict) __filter_xattrs (dict); - 
AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict); + AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, xdata); + } + + return 0; +} + +int32_t +afr_getxattr_unwind (call_frame_t *frame, int op_ret, int op_errno, + dict_t *dict, dict_t *xdata) + +{ + AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, xdata); + return 0; +} + +int32_t +afr_fgetxattr_clrlk_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + xlator_t **children = NULL; + dict_t *xattr = NULL; + char *tmp_report = NULL; + char lk_summary[1024] = {0,}; + int serz_len = 0; + int32_t callcnt = 0; + long int cky = 0; + int ret = 0; + + priv = this->private; + children = priv->children; + + local = frame->local; + cky = (long) cookie; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + if (op_ret == -1) + local->child_errno[cky] = op_errno; + + if (!local->dict) + local->dict = dict_new (); + if (local->dict) { + ret = dict_get_str (dict, local->cont.getxattr.name, + &tmp_report); + if (ret) + goto unlock; + ret = dict_set_dynstr (local->dict, + children[cky]->name, + gf_strdup (tmp_report)); + if (ret) + goto unlock; + } + } +unlock: + UNLOCK (&frame->lock); + + if (!callcnt) { + xattr = dict_new (); + if (!xattr) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + ret = dict_serialize_value_with_delim (local->dict, + lk_summary, + &serz_len, '\n'); + if (ret) { + op_ret = -1; + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "Error serializing dictionary"); + goto unwind; + } + if (serz_len == -1) + snprintf (lk_summary, sizeof (lk_summary), + "No locks cleared."); + ret = dict_set_dynstr (xattr, local->cont.getxattr.name, + gf_strdup (lk_summary)); + if (ret) { + op_ret = -1; + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "Error setting dictionary"); + goto unwind; + } + + unwind: + // Updating child_errno 
with more recent 'events' + local->child_errno[cky] = op_errno; + op_errno = afr_resultant_errno_get (NULL, local->child_errno, + priv->child_count); + AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, xattr, + xdata); + + if (xattr) + dict_unref (xattr); + } + + return ret; +} + +int32_t +afr_getxattr_clrlk_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + xlator_t **children = NULL; + dict_t *xattr = NULL; + char *tmp_report = NULL; + char lk_summary[1024] = {0,}; + int serz_len = 0; + int32_t callcnt = 0; + long int cky = 0; + int ret = 0; + + priv = this->private; + children = priv->children; + + local = frame->local; + cky = (long) cookie; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + if (op_ret == -1) + local->child_errno[cky] = op_errno; + + if (!local->dict) + local->dict = dict_new (); + if (local->dict) { + ret = dict_get_str (dict, local->cont.getxattr.name, + &tmp_report); + if (ret) + goto unlock; + ret = dict_set_dynstr (local->dict, + children[cky]->name, + gf_strdup (tmp_report)); + if (ret) + goto unlock; + } + } +unlock: + UNLOCK (&frame->lock); + + if (!callcnt) { + xattr = dict_new (); + if (!xattr) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + ret = dict_serialize_value_with_delim (local->dict, + lk_summary, + &serz_len, '\n'); + if (ret) { + op_ret = -1; + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "Error serializing dictionary"); + goto unwind; + } + if (serz_len == -1) + snprintf (lk_summary, sizeof (lk_summary), + "No locks cleared."); + ret = dict_set_dynstr (xattr, local->cont.getxattr.name, + gf_strdup (lk_summary)); + if (ret) { + op_ret = -1; + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "Error setting dictionary"); + goto unwind; + } + + unwind: + // Updating child_errno with more recent 'events' + local->child_errno[cky] = op_errno; + 
op_errno = afr_resultant_errno_get (NULL, local->child_errno, + priv->child_count); + AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, xattr, xdata); + + if (xattr) + dict_unref (xattr); + } + + return ret; +} + +/** + * node-uuid cbk uses next child querying mechanism + */ +int32_t +afr_getxattr_node_uuid_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + xlator_t **children = NULL; + int unwind = 1; + int curr_call_child = 0; + + priv = this->private; + children = priv->children; + + local = frame->local; + + if (op_ret == -1) { /** query the _next_ child */ + + /** + * _current_ becomes _next_ + * If done with all childs and yet no success; give up ! + */ + curr_call_child = (int) ((long)cookie); + if (++curr_call_child == priv->child_count) + goto unwind; + + gf_log (this->name, GF_LOG_WARNING, + "op_ret (-1): Re-querying afr-child (%d/%d)", + curr_call_child, priv->child_count); + + unwind = 0; + STACK_WIND_COOKIE (frame, afr_getxattr_node_uuid_cbk, + (void *) (long) curr_call_child, + children[curr_call_child], + children[curr_call_child]->fops->getxattr, + &local->loc, + local->cont.getxattr.name, + NULL); } + unwind: + if (unwind) + AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, + NULL); + return 0; } int32_t -afr_getxattr_unwind (call_frame_t *frame, - int op_ret, int op_errno, dict_t *dict) +afr_getxattr_lockinfo_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) +{ + int call_cnt = 0, len = 0; + char *lockinfo_buf = NULL; + dict_t *lockinfo = NULL, *newdict = NULL; + afr_local_t *local = NULL; + + LOCK (&frame->lock); + { + local = frame->local; + + call_cnt = --local->call_count; + + if ((op_ret < 0) || (!dict && !xdata)) { + goto unlock; + } + + if (xdata) { + if (!local->xdata_rsp) { + local->xdata_rsp = dict_new (); + if 
(!local->xdata_rsp) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unlock; + } + } + } + + if (!dict) { + goto unlock; + } + + op_ret = dict_get_ptr_and_len (dict, GF_XATTR_LOCKINFO_KEY, + (void **)&lockinfo_buf, &len); + + if (!lockinfo_buf) { + goto unlock; + } + + if (!local->dict) { + local->dict = dict_new (); + if (!local->dict) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unlock; + } + } + } +unlock: + UNLOCK (&frame->lock); + + if (lockinfo_buf != NULL) { + lockinfo = dict_new (); + if (lockinfo == NULL) { + local->op_ret = -1; + local->op_errno = ENOMEM; + } else { + op_ret = dict_unserialize (lockinfo_buf, len, + &lockinfo); + + if (lockinfo && local->dict) { + dict_copy (lockinfo, local->dict); + } + } + } + + if (xdata && local->xdata_rsp) { + dict_copy (xdata, local->xdata_rsp); + } + + if (!call_cnt) { + newdict = dict_new (); + if (!newdict) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unwind; + } + + len = dict_serialized_length (local->dict); + if (len == 0) { + goto unwind; + } + + lockinfo_buf = GF_CALLOC (1, len, gf_common_mt_char); + if (!lockinfo_buf) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unwind; + } + + op_ret = dict_serialize (local->dict, lockinfo_buf); + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = -op_ret; + } + + op_ret = dict_set_dynptr (newdict, GF_XATTR_LOCKINFO_KEY, + (void *)lockinfo_buf, len); + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = -op_ret; + goto unwind; + } + + unwind: + AFR_STACK_UNWIND (getxattr, frame, op_ret, + op_errno, newdict, + local->xdata_rsp); + } + + dict_unref (lockinfo); + + return 0; +} +int32_t +afr_fgetxattr_lockinfo_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) { - AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict); + int call_cnt = 0, len = 0; + char *lockinfo_buf = NULL; + dict_t *lockinfo = NULL, *newdict = NULL; + afr_local_t 
*local = NULL; + + LOCK (&frame->lock); + { + local = frame->local; + + call_cnt = --local->call_count; + + if ((op_ret < 0) || (!dict && !xdata)) { + goto unlock; + } + + if (xdata) { + if (!local->xdata_rsp) { + local->xdata_rsp = dict_new (); + if (!local->xdata_rsp) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unlock; + } + } + } + + if (!dict) { + goto unlock; + } + + op_ret = dict_get_ptr_and_len (dict, GF_XATTR_LOCKINFO_KEY, + (void **)&lockinfo_buf, &len); + + if (!lockinfo_buf) { + goto unlock; + } + + if (!local->dict) { + local->dict = dict_new (); + if (!local->dict) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unlock; + } + } + } +unlock: + UNLOCK (&frame->lock); + + if (lockinfo_buf != NULL) { + lockinfo = dict_new (); + if (lockinfo == NULL) { + local->op_ret = -1; + local->op_errno = ENOMEM; + } else { + op_ret = dict_unserialize (lockinfo_buf, len, + &lockinfo); + + if (lockinfo && local->dict) { + dict_copy (lockinfo, local->dict); + } + } + } + + if (xdata && local->xdata_rsp) { + dict_copy (xdata, local->xdata_rsp); + } + + if (!call_cnt) { + newdict = dict_new (); + if (!newdict) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unwind; + } + + len = dict_serialized_length (local->dict); + if (len <= 0) { + goto unwind; + } + + lockinfo_buf = GF_CALLOC (1, len, gf_common_mt_char); + if (!lockinfo_buf) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unwind; + } + + op_ret = dict_serialize (local->dict, lockinfo_buf); + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = -op_ret; + } + + op_ret = dict_set_dynptr (newdict, GF_XATTR_LOCKINFO_KEY, + (void *)lockinfo_buf, len); + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = -op_ret; + goto unwind; + } + + unwind: + AFR_STACK_UNWIND (fgetxattr, frame, op_ret, + op_errno, newdict, + local->xdata_rsp); + } + + dict_unref (lockinfo); + return 0; } int32_t +afr_fgetxattr_pathinfo_cbk (call_frame_t *frame, void *cookie, + xlator_t 
*this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) +{ + afr_local_t *local = NULL; + int32_t callcnt = 0; + int ret = 0; + char *xattr = NULL; + char *xattr_serz = NULL; + char xattr_cky[1024] = {0,}; + dict_t *nxattr = NULL; + long cky = 0; + int32_t padding = 0; + int32_t tlen = 0; + + if (!frame || !frame->local || !this) { + gf_log ("", GF_LOG_ERROR, "possible NULL deref"); + goto out; + } + + local = frame->local; + cky = (long) cookie; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (!dict || (op_ret < 0)) + goto out; + + if (!local->dict) + local->dict = dict_new (); + + if (local->dict) { + ret = dict_get_str (dict, + local->cont.getxattr.name, + &xattr); + if (ret) + goto out; + + xattr = gf_strdup (xattr); + + (void)snprintf (xattr_cky, 1024, "%s-%ld", + local->cont.getxattr.name, cky); + ret = dict_set_dynstr (local->dict, + xattr_cky, xattr); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Cannot set xattr cookie key"); + goto out; + } + + local->cont.getxattr.xattr_len + += strlen (xattr) + 1; + } + } +out: + UNLOCK (&frame->lock); + + if (!callcnt) { + if (!local->cont.getxattr.xattr_len) + goto unwind; + + nxattr = dict_new (); + if (!nxattr) + goto unwind; + + /* extra bytes for decorations (brackets and <>'s) */ + padding += strlen (this->name) + + strlen (AFR_PATHINFO_HEADER) + 4; + local->cont.getxattr.xattr_len += (padding + 2); + + xattr_serz = GF_CALLOC (local->cont.getxattr.xattr_len, + sizeof (char), gf_common_mt_char); + + if (!xattr_serz) + goto unwind; + + /* the xlator info */ + (void) sprintf (xattr_serz, "(<"AFR_PATHINFO_HEADER"%s> ", + this->name); + + /* actual series of pathinfo */ + ret = dict_serialize_value_with_delim (local->dict, + xattr_serz + + strlen (xattr_serz), + &tlen, ' '); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Error serializing" + " dictionary"); + goto unwind; + } + + /* closing part */ + *(xattr_serz + padding + tlen) = ')'; + *(xattr_serz + padding + tlen + 
1) = '\0'; + + ret = dict_set_dynstr (nxattr, local->cont.getxattr.name, + xattr_serz); + if (ret) + gf_log (this->name, GF_LOG_ERROR, "Cannot set pathinfo" + " key in dict"); + + unwind: + AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, nxattr, + xdata); + + if (nxattr) + dict_unref (nxattr); + } + + return ret; +} + +int32_t afr_getxattr_pathinfo_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *dict) + dict_t *dict, dict_t *xdata) { - afr_local_t *local = NULL; - int32_t callcnt = 0; - int ret = 0; - char *pathinfo = NULL; - char *pathinfo_serz = NULL; - char pathinfo_cky[1024] = {0,}; - dict_t *xattr = NULL; - long cky = 0; - int32_t padding = 0; - int32_t tlen = 0; + afr_local_t *local = NULL; + int32_t callcnt = 0; + int ret = 0; + char *xattr = NULL; + char *xattr_serz = NULL; + char xattr_cky[1024] = {0,}; + dict_t *nxattr = NULL; + long cky = 0; + int32_t padding = 0; + int32_t tlen = 0; if (!frame || !frame->local || !this) { - gf_log (this->name, GF_LOG_ERROR, "possible NULL deref"); + gf_log ("", GF_LOG_ERROR, "possible NULL deref"); goto out; } @@ -685,90 +1249,222 @@ afr_getxattr_pathinfo_cbk (call_frame_t *frame, void *cookie, local->dict = dict_new (); if (local->dict) { - ret = dict_get_str (dict, GF_XATTR_PATHINFO_KEY, &pathinfo); + ret = dict_get_str (dict, + local->cont.getxattr.name, + &xattr); if (ret) goto out; - pathinfo = gf_strdup (pathinfo); + xattr = gf_strdup (xattr); - snprintf (pathinfo_cky, 1024, "%s-%ld", GF_XATTR_PATHINFO_KEY, cky); - ret = dict_set_dynstr (local->dict, pathinfo_cky, pathinfo); + (void)snprintf (xattr_cky, 1024, "%s-%ld", + local->cont.getxattr.name, cky); + ret = dict_set_dynstr (local->dict, + xattr_cky, xattr); if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Cannot set pathinfo cookie key"); + gf_log (this->name, GF_LOG_ERROR, + "Cannot set xattr cookie key"); goto out; } - local->cont.getxattr.pathinfo_len += strlen (pathinfo) + 1; + 
local->cont.getxattr.xattr_len += strlen (xattr) + 1; } } out: UNLOCK (&frame->lock); if (!callcnt) { - if (!local->cont.getxattr.pathinfo_len) + if (!local->cont.getxattr.xattr_len) goto unwind; - xattr = dict_new (); - if (!xattr) + nxattr = dict_new (); + if (!nxattr) goto unwind; /* extra bytes for decorations (brackets and <>'s) */ - padding = strlen (this->name) + strlen (AFR_PATHINFO_HEADER) + 4; - local->cont.getxattr.pathinfo_len += (padding + 2); + padding += strlen (this->name) + strlen (AFR_PATHINFO_HEADER) + 4; + local->cont.getxattr.xattr_len += (padding + 2); - pathinfo_serz = GF_CALLOC (local->cont.getxattr.pathinfo_len, sizeof (char), - gf_common_mt_char); + xattr_serz = GF_CALLOC (local->cont.getxattr.xattr_len, + sizeof (char), gf_common_mt_char); - if (!pathinfo_serz) + if (!xattr_serz) goto unwind; /* the xlator info */ - sprintf (pathinfo_serz, "(<"AFR_PATHINFO_HEADER"%s> ", this->name); + (void) sprintf (xattr_serz, "(<"AFR_PATHINFO_HEADER"%s> ", + this->name); /* actual series of pathinfo */ - ret = dict_serialize_value_with_delim (local->dict, pathinfo_serz + strlen (pathinfo_serz), + ret = dict_serialize_value_with_delim (local->dict, + xattr_serz + strlen (xattr_serz), &tlen, ' '); if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Error serializing dictionary"); + gf_log (this->name, GF_LOG_ERROR, "Error serializing" + " dictionary"); goto unwind; } /* closing part */ - *(pathinfo_serz + padding + tlen) = ')'; - *(pathinfo_serz + padding + tlen + 1) = '\0'; + *(xattr_serz + padding + tlen) = ')'; + *(xattr_serz + padding + tlen + 1) = '\0'; - ret = dict_set_dynstr (xattr, GF_XATTR_PATHINFO_KEY, pathinfo_serz); + ret = dict_set_dynstr (nxattr, local->cont.getxattr.name, + xattr_serz); if (ret) - gf_log (this->name, GF_LOG_ERROR, "Cannot set pathinfo key in dict"); + gf_log (this->name, GF_LOG_ERROR, "Cannot set pathinfo" + " key in dict"); unwind: - AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, xattr); + AFR_STACK_UNWIND (getxattr, 
frame, op_ret, op_errno, nxattr, + xdata); - if (local->dict) - dict_unref (local->dict); - - if (xattr) - dict_unref (xattr); + if (nxattr) + dict_unref (nxattr); } return ret; } +static int +afr_aggregate_stime_xattr (dict_t *this, char *key, data_t *value, void *data) +{ + int ret = 0; + + if (fnmatch (GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0) + ret = gf_get_min_stime (THIS, data, key, value); + + return ret; +} + int32_t -afr_getxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name) +afr_common_getxattr_stime_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) +{ + afr_local_t *local = NULL; + int32_t callcnt = 0; + + if (!frame || !frame->local || !this) { + gf_log ("", GF_LOG_ERROR, "possible NULL deref"); + goto out; + } + + local = frame->local; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (!dict || (op_ret < 0)) { + local->op_errno = op_errno; + goto cleanup; + } + + if (!local->dict) + local->dict = dict_copy_with_ref (dict, NULL); + else + dict_foreach (dict, afr_aggregate_stime_xattr, + local->dict); + local->op_ret = 0; + } + +cleanup: + UNLOCK (&frame->lock); + + if (!callcnt) { + AFR_STACK_UNWIND (getxattr, frame, local->op_ret, + local->op_errno, local->dict, xdata); + } + +out: + return 0; +} + + +static gf_boolean_t +afr_is_special_xattr (const char *name, fop_getxattr_cbk_t *cbk, + gf_boolean_t is_fgetxattr) +{ + gf_boolean_t is_spl = _gf_true; + + GF_ASSERT (cbk); + if (!cbk) { + is_spl = _gf_false; + goto out; + } + + if (!strcmp (name, GF_XATTR_PATHINFO_KEY)) { + if (is_fgetxattr) { + *cbk = afr_fgetxattr_pathinfo_cbk; + } else { + *cbk = afr_getxattr_pathinfo_cbk; + } + } else if (!strncmp (name, GF_XATTR_CLRLK_CMD, + strlen (GF_XATTR_CLRLK_CMD))) { + if (is_fgetxattr) { + *cbk = afr_fgetxattr_clrlk_cbk; + } else { + *cbk = afr_getxattr_clrlk_cbk; + } + } else if (!strncmp (name, GF_XATTR_LOCKINFO_KEY, + strlen 
(GF_XATTR_LOCKINFO_KEY))) { + if (is_fgetxattr) { + *cbk = afr_fgetxattr_lockinfo_cbk; + } else { + *cbk = afr_getxattr_lockinfo_cbk; + } + } else if (fnmatch (GF_XATTR_STIME_PATTERN, name, FNM_NOESCAPE) == 0) { + *cbk = afr_common_getxattr_stime_cbk; + } else { + is_spl = _gf_false; + } + +out: + return is_spl; +} + +static void +afr_getxattr_frm_all_children (xlator_t *this, call_frame_t *frame, + const char *name, loc_t *loc, + fop_getxattr_cbk_t cbk) { - afr_private_t *priv = NULL; - xlator_t **children = NULL; - int call_child = 0; - afr_local_t *local = NULL; - xlator_list_t *trav = NULL; - xlator_t **sub_volumes = NULL; - int i = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - int32_t read_child = -1; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + xlator_t **children = NULL; + int i = 0; + priv = this->private; + children = priv->children; + + local = frame->local; + local->call_count = priv->child_count; + + for (i = 0; i < priv->child_count; i++) { + STACK_WIND_COOKIE (frame, cbk, + (void *) (long) i, + children[i], children[i]->fops->getxattr, + loc, name, NULL); + } + return; +} + +int32_t +afr_getxattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *name, dict_t *xdata) +{ + afr_private_t *priv = NULL; + xlator_t **children = NULL; + int call_child = 0; + afr_local_t *local = NULL; + xlator_list_t *trav = NULL; + xlator_t **sub_volumes = NULL; + int i = 0; + int32_t op_errno = 0; + int32_t read_child = -1; + int ret = -1; + fop_getxattr_cbk_t cbk = NULL; + int afr_xtime_gauge[MCNT_MAX] = {0,}; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -779,48 +1475,108 @@ afr_getxattr (call_frame_t *frame, xlator_t *this, children = priv->children; - ALLOC_OR_GOTO (local, afr_local_t, out); - frame->local = local; + AFR_SBRAIN_CHECK_LOC (loc, out); - op_ret = AFR_LOCAL_INIT (local, priv); - if (op_ret < 0) { - op_errno = -op_ret; + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; + + ret = 
afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } loc_copy (&local->loc, loc); - if (name) - local->cont.getxattr.name = gf_strdup (name); + if (!name) + goto no_name; + + local->cont.getxattr.name = gf_strdup (name); + + if (!strncmp (name, AFR_XATTR_PREFIX, + strlen (AFR_XATTR_PREFIX))) { + gf_log (this->name, GF_LOG_INFO, + "%s: no data present for key %s", + loc->path, name); + op_errno = ENODATA; + goto out; + } + if ((strcmp (GF_XATTR_MARKER_KEY, name) == 0) + && (GF_CLIENT_PID_GSYNCD == frame->root->pid)) { + + local->marker.call_count = priv->child_count; + sub_volumes = alloca ( priv->child_count * sizeof (xlator_t *)); + for (i = 0, trav = this->children; trav ; + trav = trav->next, i++) { + + *(sub_volumes + i) = trav->xlator; + } + + if (cluster_getmarkerattr (frame, this, loc, name, + local, afr_getxattr_unwind, + sub_volumes, + priv->child_count, + MARKER_UUID_TYPE, + marker_uuid_default_gauge, + priv->vol_uuid)) { - if (name) { - if (!strncmp (name, AFR_XATTR_PREFIX, - strlen (AFR_XATTR_PREFIX))) { gf_log (this->name, GF_LOG_INFO, - "%s: no data present for key %s", + "%s: failed to get marker attr (%s)", loc->path, name); - op_errno = ENODATA; + op_errno = EINVAL; goto out; } - if ((strcmp (GF_XATTR_MARKER_KEY, name) == 0) - && (-1 == frame->root->pid)) { + return 0; + } + + /* + * if we are doing getxattr with pathinfo as the key then we + * collect information from all childs + */ + if (afr_is_special_xattr (name, &cbk, 0)) { + afr_getxattr_frm_all_children (this, frame, name, + loc, cbk); + return 0; + } + + if (XATTR_IS_NODE_UUID (name)) { + i = 0; + STACK_WIND_COOKIE (frame, afr_getxattr_node_uuid_cbk, + (void *) (long) i, + children[i], + children[i]->fops->getxattr, + loc, name, xdata); + return 0; + } + + if (*priv->vol_uuid) { + if ((match_uuid_local (name, priv->vol_uuid) == 0) + && (GF_CLIENT_PID_GSYNCD == frame->root->pid)) { local->marker.call_count = priv->child_count; - sub_volumes = alloca ( priv->child_count * 
sizeof (xlator_t *)); + sub_volumes = alloca ( priv->child_count + * sizeof (xlator_t *)); for (i = 0, trav = this->children; trav ; trav = trav->next, i++) { *(sub_volumes + i) = trav->xlator; + } - if (cluster_getmarkerattr (frame, this, loc, name, - local, afr_getxattr_unwind, + /* don't err out on getting ENOTCONN (brick down) + * from a subset of the bricks + */ + memcpy (afr_xtime_gauge, marker_xtime_default_gauge, + sizeof (afr_xtime_gauge)); + afr_xtime_gauge[MCNT_NOTFOUND] = 0; + afr_xtime_gauge[MCNT_ENOTCONN] = 0; + if (cluster_getmarkerattr (frame, this, loc, + name, local, + afr_getxattr_unwind, sub_volumes, priv->child_count, - MARKER_UUID_TYPE, + MARKER_XTIME_TYPE, + afr_xtime_gauge, priv->vol_uuid)) { - gf_log (this->name, GF_LOG_INFO, "%s: failed to get marker attr (%s)", loc->path, name); @@ -830,65 +1586,187 @@ afr_getxattr (call_frame_t *frame, xlator_t *this, return 0; } + } - /* - * if we are doing getxattr with pathinfo as the key then we - * collect information from all childs - */ - if (strncmp (name, GF_XATTR_PATHINFO_KEY, - strlen (GF_XATTR_PATHINFO_KEY)) == 0) { - - local->call_count = priv->child_count; - for (i = 0; i < priv->child_count; i++) { - STACK_WIND_COOKIE (frame, afr_getxattr_pathinfo_cbk, - (void *) (long) i, - children[i], children[i]->fops->getxattr, - loc, name); - } +no_name: + local->fresh_children = afr_children_create (priv->child_count); + if (!local->fresh_children) { + op_errno = ENOMEM; + goto out; + } - return 0; - } + read_child = afr_inode_get_read_ctx (this, loc->inode, + local->fresh_children); + ret = afr_get_call_child (this, local->child_up, read_child, + local->fresh_children, + &call_child, + &local->cont.getxattr.last_index); + if (ret < 0) { + op_errno = -ret; + goto out; + } - if (*priv->vol_uuid) { - if ((match_uuid_local (name, priv->vol_uuid) == 0) - && (-1 == frame->root->pid)) { + STACK_WIND_COOKIE (frame, afr_getxattr_cbk, + (void *) (long) call_child, + children[call_child], + 
children[call_child]->fops->getxattr, + loc, name, xdata); - local->marker.call_count = priv->child_count; + ret = 0; +out: + if (ret < 0) + AFR_STACK_UNWIND (getxattr, frame, -1, op_errno, NULL, NULL); + return 0; +} - sub_volumes = alloca ( priv->child_count * sizeof (xlator_t *)); - for (i = 0, trav = this->children; trav ; - trav = trav->next, i++) { +/* {{{ fgetxattr */ - *(sub_volumes + i) = trav->xlator; - } +int32_t +afr_fgetxattr_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + xlator_t ** children = NULL; + int unwind = 1; + int32_t *last_index = NULL; + int32_t next_call_child = -1; + int32_t read_child = -1; + int32_t *fresh_children = NULL; - if (cluster_getmarkerattr (frame, this, loc, - name, local, - afr_getxattr_unwind, - sub_volumes, - priv->child_count, - MARKER_XTIME_TYPE, - priv->vol_uuid)) { - gf_log (this->name, GF_LOG_INFO, - "%s: failed to get marker attr (%s)", - loc->path, name); - op_errno = EINVAL; - goto out; - } + priv = this->private; + children = priv->children; - return 0; - } - } + local = frame->local; + + read_child = (long) cookie; + + if (op_ret == -1) { + last_index = &local->cont.getxattr.last_index; + fresh_children = local->fresh_children; + next_call_child = afr_next_call_child (fresh_children, + local->child_up, + priv->child_count, + last_index, read_child); + if (next_call_child < 0) + goto out; + + unwind = 0; + STACK_WIND_COOKIE (frame, afr_fgetxattr_cbk, + (void *) (long) read_child, + children[next_call_child], + children[next_call_child]->fops->fgetxattr, + local->fd, + local->cont.getxattr.name, + NULL); + } + +out: + if (unwind) { + if (op_ret >= 0 && dict) + __filter_xattrs (dict); + + AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, dict, + xdata); + } + + return 0; +} + +int32_t +afr_fgetxattr_unwind (call_frame_t *frame, + int op_ret, int op_errno, dict_t *dict, 
dict_t *xdata) + +{ + AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, dict, xdata); + return 0; +} + +static void +afr_fgetxattr_frm_all_children (xlator_t *this, call_frame_t *frame, + const char *name, fd_t *fd, + fop_fgetxattr_cbk_t cbk) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + xlator_t **children = NULL; + int i = 0; + + priv = this->private; + children = priv->children; + + local = frame->local; + local->call_count = priv->child_count; + + for (i = 0; i < priv->child_count; i++) { + STACK_WIND_COOKIE (frame, cbk, + (void *) (long) i, + children[i], children[i]->fops->fgetxattr, + fd, name, NULL); } + return; +} + +int32_t +afr_fgetxattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *name, dict_t *xdata) +{ + afr_private_t *priv = NULL; + xlator_t **children = NULL; + int call_child = 0; + afr_local_t *local = NULL; + int32_t op_ret = -1; + int32_t op_errno = 0; + int32_t read_child = -1; + fop_fgetxattr_cbk_t cbk = NULL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + VALIDATE_OR_GOTO (priv->children, out); + + children = priv->children; + + AFR_SBRAIN_CHECK_FD (fd, out); + + AFR_LOCAL_ALLOC_OR_GOTO (local, out); + frame->local = local; + + op_ret = afr_local_init (local, priv, &op_errno); + if (op_ret < 0) { + op_errno = -op_ret; + goto out; + } + + local->fd = fd_ref (fd); + if (name) + local->cont.getxattr.name = gf_strdup (name); + + /* pathinfo gets handled only in getxattr(), but we need to handle + * lockinfo. + * If we are doing fgetxattr with lockinfo as the key then we + * collect information from all children. 
+ */ + if (afr_is_special_xattr (name, &cbk, 1)) { + afr_fgetxattr_frm_all_children (this, frame, name, + fd, cbk); + return 0; + } + + local->fresh_children = afr_children_create (priv->child_count); if (!local->fresh_children) { op_errno = ENOMEM; goto out; } - read_child = afr_inode_get_read_ctx (this, loc->inode, local->fresh_children); + read_child = afr_inode_get_read_ctx (this, fd->inode, + local->fresh_children); op_ret = afr_get_call_child (this, local->child_up, read_child, local->fresh_children, &call_child, @@ -899,16 +1777,17 @@ afr_getxattr (call_frame_t *frame, xlator_t *this, goto out; } - STACK_WIND_COOKIE (frame, afr_getxattr_cbk, + STACK_WIND_COOKIE (frame, afr_fgetxattr_cbk, (void *) (long) call_child, children[call_child], - children[call_child]->fops->getxattr, - loc, name); + children[call_child]->fops->fgetxattr, + fd, name, xdata); op_ret = 0; out: if (op_ret == -1) { - AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, NULL); + AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, NULL, + NULL); } return 0; } @@ -934,7 +1813,7 @@ int32_t afr_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iovec *vector, int32_t count, struct iatt *buf, - struct iobref *iobref) + struct iobref *iobref, dict_t *xdata) { afr_private_t * priv = NULL; afr_local_t * local = NULL; @@ -975,13 +1854,15 @@ afr_readv_cbk (call_frame_t *frame, void *cookie, children[next_call_child], children[next_call_child]->fops->readv, local->fd, local->cont.readv.size, - local->cont.readv.offset); + local->cont.readv.offset, + local->cont.readv.flags, + NULL); } out: if (unwind) { AFR_STACK_UNWIND (readv, frame, op_ret, op_errno, - vector, count, buf, iobref); + vector, count, buf, iobref, xdata); } return 0; @@ -990,15 +1871,15 @@ out: int32_t afr_readv (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset) + fd_t *fd, size_t size, off_t offset, uint32_t flags, dict_t *xdata) { afr_private_t * priv = 
NULL; afr_local_t * local = NULL; xlator_t ** children = NULL; int call_child = 0; - int32_t op_ret = -1; int32_t op_errno = 0; int32_t read_child = -1; + int ret = -1; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -1008,13 +1889,14 @@ afr_readv (call_frame_t *frame, xlator_t *this, priv = this->private; children = priv->children; - ALLOC_OR_GOTO (local, afr_local_t, out); - frame->local = local; - op_ret = AFR_LOCAL_INIT (local, priv); - if (op_ret < 0) { - op_errno = -op_ret; + AFR_SBRAIN_CHECK_FD (fd, out); + + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; + + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } local->fresh_children = afr_children_create (priv->child_count); if (!local->fresh_children) { @@ -1023,39 +1905,34 @@ afr_readv (call_frame_t *frame, xlator_t *this, } read_child = afr_inode_get_read_ctx (this, fd->inode, local->fresh_children); - op_ret = afr_get_call_child (this, local->child_up, read_child, + ret = afr_get_call_child (this, local->child_up, read_child, local->fresh_children, &call_child, &local->cont.readv.last_index); - if (op_ret < 0) { - op_errno = -op_ret; - op_ret = -1; + if (ret < 0) { + op_errno = -ret; goto out; } local->fd = fd_ref (fd); - local->cont.readv.ino = fd->inode->ino; local->cont.readv.size = size; local->cont.readv.offset = offset; + local->cont.readv.flags = flags; + + afr_open_fd_fix (fd, this); - op_ret = afr_open_fd_fix (frame, this, _gf_false); - if (op_ret) { - op_errno = -op_ret; - op_ret = -1; - goto out; - } STACK_WIND_COOKIE (frame, afr_readv_cbk, (void *) (long) call_child, children[call_child], children[call_child]->fops->readv, - fd, size, offset); + fd, size, offset, flags, xdata); - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (readv, frame, op_ret, op_errno, NULL, 0, NULL, - NULL); + if (ret < 0) { + AFR_STACK_UNWIND (readv, frame, -1, op_errno, NULL, 0, NULL, + NULL, NULL); } return 0; } diff --git 
a/xlators/cluster/afr/src/afr-inode-read.h b/xlators/cluster/afr/src/afr-inode-read.h index 5479cfbd5..e4091a793 100644 --- a/xlators/cluster/afr/src/afr-inode-read.h +++ b/xlators/cluster/afr/src/afr-inode-read.h @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ #ifndef __INODE_READ_H__ @@ -22,26 +13,30 @@ int32_t afr_access (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t mask); + loc_t *loc, int32_t mask, dict_t *xdata); int32_t afr_stat (call_frame_t *frame, xlator_t *this, - loc_t *loc); + loc_t *loc, dict_t *xdata); int32_t afr_fstat (call_frame_t *frame, xlator_t *this, - fd_t *fd); + fd_t *fd, dict_t *xdata); int32_t afr_readlink (call_frame_t *frame, xlator_t *this, - loc_t *loc, size_t size); + loc_t *loc, size_t size, dict_t *xdata); int32_t afr_readv (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset); + fd_t *fd, size_t size, off_t offset, uint32_t flags, dict_t *xdata); int32_t afr_getxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name); + loc_t *loc, const char *name, dict_t *xdata); + +int32_t +afr_fgetxattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *name, dict_t *xdata); #endif /* __INODE_READ_H__ */ diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index c6d7b5f22..c1ec69a55 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. 
+ This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ @@ -48,45 +39,151 @@ #include "afr-transaction.h" #include "afr-self-heal-common.h" +void +__inode_write_fop_cbk (call_frame_t *frame, int child_index, int read_child, + xlator_t *this, int32_t *op_ret, int32_t *op_errno, + struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) +{ + afr_local_t *local = NULL; + + local = frame->local; + + if (afr_fop_failed (*op_ret, *op_errno)) { + local->child_errno[child_index] = *op_errno; + + switch (local->op) { + case GF_FOP_TRUNCATE: + case GF_FOP_FTRUNCATE: + if (*op_errno != EFBIG) + afr_transaction_fop_failed (frame, this, + child_index); + break; + default: + afr_transaction_fop_failed (frame, this, child_index); + break; + } + local->op_errno = *op_errno; + goto out; + } + + if ((local->success_count == 0) || (read_child == child_index)) { + local->op_ret = *op_ret; + if (prebuf) + local->cont.inode_wfop.prebuf = *prebuf; + if (postbuf) + local->cont.inode_wfop.postbuf = *postbuf; + } + + local->success_count++; +out: + return; +} + /* {{{ writev */ -int +void +afr_writev_copy_outvars (call_frame_t *src_frame, call_frame_t *dst_frame) +{ + afr_local_t *src_local = NULL; + afr_local_t *dst_local = NULL; + + src_local = src_frame->local; + dst_local = dst_frame->local; + + dst_local->op_ret = src_local->op_ret; + dst_local->op_errno = src_local->op_errno; + dst_local->cont.inode_wfop.prebuf = src_local->cont.inode_wfop.prebuf; + dst_local->cont.inode_wfop.postbuf = src_local->cont.inode_wfop.postbuf; +} + +void afr_writev_unwind (call_frame_t *frame, xlator_t *this) { afr_local_t * local = NULL; - call_frame_t *main_frame = NULL; + local = frame->local; + + AFR_STACK_UNWIND (writev, frame, + local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + 
&local->cont.inode_wfop.postbuf, + NULL); +} + +call_frame_t* +afr_transaction_detach_fop_frame (call_frame_t *frame) +{ + afr_local_t * local = NULL; + call_frame_t *fop_frame = NULL; local = frame->local; LOCK (&frame->lock); { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; + fop_frame = local->transaction.main_frame; local->transaction.main_frame = NULL; } UNLOCK (&frame->lock); - if (main_frame) { - AFR_STACK_UNWIND (writev, main_frame, - local->op_ret, local->op_errno, - &local->cont.writev.prebuf, - &local->cont.writev.postbuf); + return fop_frame; +} + +int +afr_transaction_writev_unwind (call_frame_t *frame, xlator_t *this) +{ + call_frame_t *fop_frame = NULL; + + fop_frame = afr_transaction_detach_fop_frame (frame); + + if (fop_frame) { + afr_writev_copy_outvars (frame, fop_frame); + afr_writev_unwind (fop_frame, this); } return 0; } +static void +afr_writev_handle_short_writes (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + + local = frame->local; + priv = this->private; + /* + * We already have the best case result of the writev calls staged + * as the return value. Any writev that returns some value less + * than the best case is now out of sync, so mark the fop as + * failed. Note that fops that have returned with errors have + * already been marked as failed. 
+ */ + for (i = 0; i < priv->child_count; i++) { + if ((!local->replies[i].valid) || + (local->replies[i].op_ret == -1)) + continue; + + if (local->replies[i].op_ret < local->op_ret) + afr_transaction_fop_failed(frame, this, i); + } +} int afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) + struct iatt *postbuf, dict_t *xdata) { afr_local_t * local = NULL; + afr_private_t *priv = NULL; + call_frame_t *fop_frame = NULL; int child_index = (long) cookie; int call_count = -1; int read_child = 0; + int ret = 0; + uint32_t open_fd_count = 0; + uint32_t write_is_append = 0; local = frame->local; + priv = this->private; read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); @@ -96,32 +193,81 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->read_child_returned = _gf_true; } - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.writev.prebuf = *prebuf; - local->cont.writev.postbuf = *postbuf; - } - - if (child_index == read_child) { - local->cont.writev.prebuf = *prebuf; - local->cont.writev.postbuf = *postbuf; + __inode_write_fop_cbk (frame, child_index, read_child, this, + &op_ret, &op_errno, prebuf, postbuf, + xdata); + + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + + + /* stage the best case return value for unwind */ + if ((local->success_count == 0) || (op_ret > local->op_ret)) { + local->op_ret = op_ret; + local->op_errno = op_errno; + } + + if (op_ret != -1) { + if (xdata) { + ret = dict_get_uint32 (xdata, + GLUSTERFS_OPEN_FD_COUNT, + &open_fd_count); + if ((ret == 0) && + (open_fd_count > local->open_fd_count)) { + local->open_fd_count = open_fd_count; + local->update_open_fd_count = 
_gf_true; + } + + write_is_append = 0; + ret = dict_get_uint32 (xdata, + GLUSTERFS_WRITE_IS_APPEND, + &write_is_append); + if (ret || !write_is_append) + local->append_write = _gf_false; } - } - local->op_errno = op_errno; + } } UNLOCK (&frame->lock); call_count = afr_frame_return (frame); if (call_count == 0) { - local->transaction.unwind (frame, this); - local->transaction.resume (frame, this); + if (local->update_open_fd_count) + afr_handle_open_fd_count (frame, this); + + if (!local->stable_write && !local->append_write) + /* An appended write removes the necessity to + fsync() the file. This is because self-heal + has the logic to check for larger file when + the xattrs are not reliably pointing at + a stale file. + */ + afr_fd_report_unstable_write (this, local->fd); + + afr_writev_handle_short_writes (frame, this); + if (afr_any_fops_failed (local, priv)) { + //Don't unwind until post-op is complete + local->transaction.resume (frame, this); + } else { + /* + * Generally inode-write fops do transaction.unwind then + * transaction.resume, but writev needs to make sure that + * delayed post-op frame is placed in fdctx before unwind + * happens. This prevents the race of flush doing the + * changelog wakeup first in fuse thread and then this + * writev placing its delayed post-op frame in fdctx. + * This helps flush make sure all the delayed post-ops are + * completed. 
+ */ + + fop_frame = afr_transaction_detach_fop_frame (frame); + afr_writev_copy_outvars (frame, fop_frame); + local->transaction.resume (frame, this); + afr_writev_unwind (fop_frame, this); + } } return 0; } @@ -131,16 +277,16 @@ afr_writev_wind (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - afr_internal_lock_t *int_lock = NULL; int i = 0; int call_count = -1; + dict_t *xdata = NULL; + GF_UNUSED int ret = 0; local = frame->local; priv = this->private; - int_lock = &local->internal_lock; - call_count = afr_locked_children_count (int_lock->inode_locked_nodes, - priv->child_count); + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); if (call_count == 0) { local->transaction.resume (frame, this); @@ -148,9 +294,31 @@ afr_writev_wind (call_frame_t *frame, xlator_t *this) } local->call_count = call_count; + local->replies = GF_CALLOC(priv->child_count, sizeof(*local->replies), + gf_afr_mt_reply_t); + if (!local->replies) { + local->op_ret = -1; + local->op_errno = ENOMEM; + local->transaction.unwind(frame, this); + local->transaction.resume(frame, this); + return 0; + } + + xdata = dict_new (); + if (xdata) { + ret = dict_set_uint32 (xdata, GLUSTERFS_OPEN_FD_COUNT, + sizeof (uint32_t)); + ret = dict_set_uint32 (xdata, GLUSTERFS_WRITE_IS_APPEND, + 0); + /* Set append_write to be true speculatively. If on any + server it turns not be true, we unset it in the + callback. 
+ */ + local->append_write = _gf_true; + } for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] && int_lock->inode_locked_nodes[i]) { + if (local->transaction.pre_op[i]) { STACK_WIND_COOKIE (frame, afr_writev_wind_cbk, (void *) (long) i, priv->children[i], @@ -159,13 +327,18 @@ afr_writev_wind (call_frame_t *frame, xlator_t *this) local->cont.writev.vector, local->cont.writev.count, local->cont.writev.offset, - local->cont.writev.iobref); + local->cont.writev.flags, + local->cont.writev.iobref, + xdata); if (!--call_count) break; } } + if (xdata) + dict_unref (xdata); + return 0; } @@ -205,7 +378,7 @@ afr_do_writev (call_frame_t *frame, xlator_t *this) } transaction_frame->local = local; - frame->local = NULL; + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); local->op = GF_FOP_WRITE; @@ -213,10 +386,17 @@ afr_do_writev (call_frame_t *frame, xlator_t *this) local->transaction.fop = afr_writev_wind; local->transaction.done = afr_writev_done; - local->transaction.unwind = afr_writev_unwind; + local->transaction.unwind = afr_transaction_writev_unwind; local->transaction.main_frame = frame; if (local->fd->flags & O_APPEND) { + /* + * Backend vfs ignores the 'offset' for append mode fd so + * locking just the region provided for the writev does not + * give consistency gurantee. The actual write may happen at a + * completely different range than the one provided by the + * offset, len in the fop. So lock the entire file. 
+ */ local->transaction.start = 0; local->transaction.len = 0; } else { @@ -225,156 +405,91 @@ afr_do_writev (call_frame_t *frame, xlator_t *this) local->cont.writev.count); } - afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + op_ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + if (op_ret < 0) { + op_errno = -op_ret; + goto out; + } op_ret = 0; out: - if (op_ret == -1) { + if (op_ret < 0) { if (transaction_frame) AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (writev, frame, op_ret, op_errno, NULL, NULL); + AFR_STACK_UNWIND (writev, frame, op_ret, op_errno, NULL, NULL, NULL); } return 0; } -static int -afr_prepare_loc (call_frame_t *frame, fd_t *fd) +static void +afr_trigger_open_fd_self_heal (fd_t *fd, xlator_t *this) { - afr_local_t *local = NULL; - char *name = NULL; - char *path = NULL; - int ret = 0; - - if ((!fd) || (!fd->inode)) - return -1; - - local = frame->local; - ret = inode_path (fd->inode, NULL, (char **)&path); - if (ret <= 0) { - gf_log (frame->this->name, GF_LOG_DEBUG, - "Unable to get path for gfid: %s", - uuid_utoa (fd->inode->gfid)); - return -1; - } - - if (local->loc.path) { - if (strcmp (path, local->loc.path)) - gf_log (frame->this->name, GF_LOG_DEBUG, - "overwriting old loc->path %s with %s", - local->loc.path, path); - GF_FREE ((char *)local->loc.path); - } - local->loc.path = path; - - name = strrchr (local->loc.path, '/'); - if (name) - name++; - local->loc.name = name; - - if (local->loc.inode) { - inode_unref (local->loc.inode); - } - local->loc.inode = inode_ref (fd->inode); - - if (local->loc.parent) { - inode_unref (local->loc.parent); + call_frame_t *frame = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + char *reason = NULL; + int32_t op_errno = 0; + int ret = 0; + + if (!fd || !fd->inode || uuid_is_null (fd->inode->gfid)) { + gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid args: " + "fd: %p, inode: %p", fd, + fd ? 
fd->inode : NULL); + goto out; } - local->loc.parent = inode_parent (local->loc.inode, 0, NULL); - - return 0; -} - -afr_fd_paused_call_t* -afr_paused_call_create (call_frame_t *frame) -{ - afr_local_t *local = NULL; - afr_fd_paused_call_t *paused_call = NULL; + frame = create_frame (this, this->ctx->pool); + if (!frame) + goto out; + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); local = frame->local; - GF_ASSERT (local->fop_call_continue); - - paused_call = GF_CALLOC (1, sizeof (*paused_call), - gf_afr_fd_paused_call_t); - if (paused_call) { - INIT_LIST_HEAD (&paused_call->call_list); - paused_call->frame = frame; - } - - return paused_call; -} - -static int -afr_pause_fd_fop (call_frame_t *frame, xlator_t *this, afr_fd_ctx_t *fd_ctx) -{ - afr_fd_paused_call_t *paused_call = NULL; - int ret = 0; - - paused_call = afr_paused_call_create (frame); - if (paused_call) - list_add (&paused_call->call_list, &fd_ctx->paused_calls); - else - ret = -ENOMEM; - - return ret; -} + ret = afr_local_init (local, this->private, &op_errno); + if (ret < 0) + goto out; -static void -afr_trigger_open_fd_self_heal (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - inode_t *inode = NULL; - char *reason = NULL; + local->loc.inode = inode_ref (fd->inode); + ret = loc_path (&local->loc, NULL); + if (ret < 0) + goto out; - local = frame->local; sh = &local->self_heal; - inode = local->fd->inode; - - sh->do_missing_entry_self_heal = _gf_true; - sh->do_gfid_self_heal = _gf_true; - sh->do_data_self_heal = _gf_true; + sh->do_metadata_self_heal = _gf_true; + if (fd->inode->ia_type == IA_IFREG) + sh->do_data_self_heal = _gf_true; + else if (fd->inode->ia_type == IA_IFDIR) + sh->do_entry_self_heal = _gf_true; reason = "subvolume came online"; - afr_launch_self_heal (frame, this, inode, _gf_true, inode->ia_type, - reason, NULL, NULL); + afr_launch_self_heal (frame, this, fd->inode, _gf_true, + fd->inode->ia_type, reason, NULL, NULL); + return; +out: 
+ AFR_STACK_DESTROY (frame); } -int -afr_open_fd_fix (call_frame_t *frame, xlator_t *this, gf_boolean_t pause_fop) -{ - int ret = 0; - int i = 0; - afr_fd_ctx_t *fd_ctx = NULL; - gf_boolean_t need_self_heal = _gf_false; - int *need_open = NULL; - int need_open_count = 0; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - gf_boolean_t fop_continue = _gf_true; +void +afr_open_fd_fix (fd_t *fd, xlator_t *this) +{ + int ret = 0; + int i = 0; + afr_fd_ctx_t *fd_ctx = NULL; + gf_boolean_t need_self_heal = _gf_false; + int *need_open = NULL; + size_t need_open_count = 0; + afr_private_t *priv = NULL; - local = frame->local; priv = this->private; - GF_ASSERT (local->fd); - if (pause_fop) - GF_ASSERT (local->fop_call_continue); - - ret = afr_prepare_loc (frame, local->fd); - if (ret < 0) { - //File does not exist we cant open it. - ret = 0; + if (!afr_is_fd_fixable (fd)) goto out; - } - fd_ctx = afr_fd_ctx_get (local->fd, this); - if (!fd_ctx) { - ret = -EINVAL; - goto unlock; - } + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) + goto out; - LOCK (&local->fd->lock); + LOCK (&fd->lock); { if (fd_ctx->up_count < priv->up_count) { need_self_heal = _gf_true; @@ -382,67 +497,44 @@ afr_open_fd_fix (call_frame_t *frame, xlator_t *this, gf_boolean_t pause_fop) fd_ctx->down_count = priv->down_count; } + need_open = alloca (priv->child_count * sizeof (*need_open)); for (i = 0; i < priv->child_count; i++) { - if ((fd_ctx->opened_on[i] == AFR_FD_NOT_OPENED) && - local->child_up[i]) { - fd_ctx->opened_on[i] = AFR_FD_OPENING; - if (!need_open) - need_open = GF_CALLOC (priv->child_count, - sizeof (*need_open), - gf_afr_mt_int32_t); - need_open[i] = 1; - need_open_count++; - } else if (pause_fop && local->child_up[i] && - (fd_ctx->opened_on[i] == AFR_FD_OPENING)) { - local->fop_paused = _gf_true; - } - } + need_open[i] = 0; + if (fd_ctx->opened_on[i] != AFR_FD_NOT_OPENED) + continue; + + if (!priv->child_up[i]) + continue; - if (local->fop_paused) { - GF_ASSERT (pause_fop); 
- gf_log (this->name, GF_LOG_INFO, "Pause fd %p", - local->fd); - ret = afr_pause_fd_fop (frame, this, fd_ctx); - if (ret) - goto unlock; - fop_continue = _gf_false; + fd_ctx->opened_on[i] = AFR_FD_OPENING; + + need_open[i] = 1; + need_open_count++; } } -unlock: - UNLOCK (&local->fd->lock); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Failed to fix fd for %s", - local->loc.path); - fop_continue = _gf_false; + UNLOCK (&fd->lock); + if (ret) goto out; - } if (need_self_heal) - afr_trigger_open_fd_self_heal (frame, this); + afr_trigger_open_fd_self_heal (fd, this); if (!need_open_count) goto out; - gf_log (this->name, GF_LOG_INFO, "Opening fd %p", local->fd); - afr_fix_open (frame, this, fd_ctx, need_open_count, need_open); - fop_continue = _gf_false; + afr_fix_open (this, fd, need_open_count, need_open); out: - if (need_open) - GF_FREE (need_open); - if (fop_continue && local->fop_call_continue) - local->fop_call_continue (frame, this); - return ret; + return; } int afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, int32_t count, off_t offset, - struct iobref *iobref) + uint32_t flags, struct iobref *iobref, dict_t *xdata) { afr_private_t * priv = NULL; afr_local_t * local = NULL; int ret = -1; - int op_ret = -1; int op_errno = 0; VALIDATE_OR_GOTO (frame, out); @@ -451,36 +543,41 @@ afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + if (afr_is_split_brain (this, fd->inode)) { + op_errno = EIO; goto out; } - frame->local = local; + QUORUM_CHECK(writev,out); + + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; + + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) + goto out; local->cont.writev.vector = iov_dup (vector, count); local->cont.writev.count = count; local->cont.writev.offset = offset; - local->cont.writev.ino = fd->inode->ino; + 
local->cont.writev.flags = flags; local->cont.writev.iobref = iobref_ref (iobref); local->fd = fd_ref (fd); - local->fop_call_continue = afr_do_writev; - ret = afr_open_fd_fix (frame, this, _gf_true); - if (ret) { - op_errno = -ret; - goto out; - } + /* detect here, but set it in writev_wind_cbk *after* the unstable + write is performed + */ + local->stable_write = !!((fd->flags|flags)&(O_SYNC|O_DSYNC)); - op_ret = 0; + afr_open_fd_fix (fd, this); + + afr_do_writev (frame, this); + + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (writev, frame, op_ret, op_errno, NULL, NULL); - } + if (ret < 0) + AFR_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL); return 0; } @@ -509,8 +606,9 @@ afr_truncate_unwind (call_frame_t *frame, xlator_t *this) if (main_frame) { AFR_STACK_UNWIND (truncate, main_frame, local->op_ret, local->op_errno, - &local->cont.truncate.prebuf, - &local->cont.truncate.postbuf); + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, + NULL); } return 0; @@ -520,17 +618,14 @@ afr_truncate_unwind (call_frame_t *frame, xlator_t *this) int afr_truncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) + struct iatt *postbuf, dict_t *xdata) { afr_local_t * local = NULL; - afr_private_t * priv = NULL; int child_index = (long) cookie; int read_child = 0; int call_count = -1; - int need_unwind = 0; local = frame->local; - priv = this->private; read_child = afr_inode_get_read_ctx (this, local->loc.inode, NULL); @@ -540,38 +635,22 @@ afr_truncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->read_child_returned = _gf_true; } - if (afr_fop_failed (op_ret, op_errno) && op_errno != EFBIG) - afr_transaction_fop_failed (frame, this, child_index); - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.truncate.prebuf = *prebuf; - local->cont.truncate.postbuf = *postbuf; - } - - if 
(child_index == read_child) { - local->cont.truncate.prebuf = *prebuf; - local->cont.truncate.postbuf = *postbuf; - } - - local->success_count++; - - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } + if (prebuf->ia_size != postbuf->ia_size) + local->stable_write = _gf_false; } - local->op_errno = op_errno; + __inode_write_fop_cbk (frame, child_index, read_child, this, + &op_ret, &op_errno, prebuf, postbuf, + xdata); } UNLOCK (&frame->lock); - if (need_unwind) - local->transaction.unwind (frame, this); - call_count = afr_frame_return (frame); if (call_count == 0) { + if (local->stable_write && afr_txn_nothing_failed (frame, this)) + local->transaction.unwind (frame, this); + local->transaction.resume (frame, this); } @@ -584,16 +663,14 @@ afr_truncate_wind (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - afr_internal_lock_t *int_lock = NULL; int call_count = -1; int i = 0; local = frame->local; priv = this->private; - int_lock = &local->internal_lock; - call_count = afr_locked_children_count (int_lock->inode_locked_nodes, - priv->child_count); + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); if (call_count == 0) { local->transaction.resume (frame, this); @@ -601,15 +678,17 @@ afr_truncate_wind (call_frame_t *frame, xlator_t *this) } local->call_count = call_count; + local->stable_write = _gf_true; for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] && int_lock->inode_locked_nodes[i]) { + if (local->transaction.pre_op[i]) { STACK_WIND_COOKIE (frame, afr_truncate_wind_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->truncate, &local->loc, - local->cont.truncate.offset); + local->cont.truncate.offset, + NULL); if (!--call_count) break; @@ -637,13 +716,12 @@ afr_truncate_done (call_frame_t *frame, xlator_t *this) int afr_truncate (call_frame_t *frame, xlator_t *this, - loc_t *loc, off_t 
offset) + loc_t *loc, off_t offset, dict_t *xdata) { afr_private_t * priv = NULL; afr_local_t * local = NULL; call_frame_t *transaction_frame = NULL; int ret = -1; - int op_ret = -1; int op_errno = 0; VALIDATE_OR_GOTO (frame, out); @@ -652,25 +730,22 @@ afr_truncate (call_frame_t *frame, xlator_t *this, priv = this->private; + QUORUM_CHECK(truncate,out); + transaction_frame = copy_frame (frame); if (!transaction_frame) { + op_errno = ENOMEM; goto out; } - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); + local = transaction_frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } - - transaction_frame->local = local; - - local->op_ret = -1; local->cont.truncate.offset = offset; - local->cont.truncate.ino = loc->inode->ino; local->transaction.fop = afr_truncate_wind; local->transaction.done = afr_truncate_done; @@ -682,14 +757,18 @@ afr_truncate (call_frame_t *frame, xlator_t *this, local->transaction.start = offset; local->transaction.len = 0; - afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { + if (ret < 0) { if (transaction_frame) AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (truncate, frame, op_ret, op_errno, NULL, NULL); + AFR_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL, NULL); } return 0; @@ -720,8 +799,9 @@ afr_ftruncate_unwind (call_frame_t *frame, xlator_t *this) if (main_frame) { AFR_STACK_UNWIND (ftruncate, main_frame, local->op_ret, local->op_errno, - &local->cont.ftruncate.prebuf, - &local->cont.ftruncate.postbuf); + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, + NULL); } return 0; } @@ -730,17 +810,14 @@ afr_ftruncate_unwind (call_frame_t *frame, xlator_t 
*this) int afr_ftruncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) + struct iatt *postbuf, dict_t *xdata) { afr_local_t * local = NULL; - afr_private_t * priv = NULL; int child_index = (long) cookie; int call_count = -1; - int need_unwind = 0; int read_child = 0; local = frame->local; - priv = this->private; read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); @@ -750,38 +827,22 @@ afr_ftruncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->read_child_returned = _gf_true; } - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.ftruncate.prebuf = *prebuf; - local->cont.ftruncate.postbuf = *postbuf; - } - - if (child_index == read_child) { - local->cont.ftruncate.prebuf = *prebuf; - local->cont.ftruncate.postbuf = *postbuf; - } - - local->success_count++; - - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } + if (prebuf->ia_size != postbuf->ia_size) + local->stable_write = _gf_false; } - local->op_errno = op_errno; + __inode_write_fop_cbk (frame, child_index, read_child, this, + &op_ret, &op_errno, prebuf, postbuf, + xdata); } UNLOCK (&frame->lock); - if (need_unwind) - local->transaction.unwind (frame, this); - call_count = afr_frame_return (frame); if (call_count == 0) { + if (local->stable_write && afr_txn_nothing_failed (frame, this)) + local->transaction.unwind (frame, this); + local->transaction.resume (frame, this); } @@ -794,16 +855,14 @@ afr_ftruncate_wind (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - afr_internal_lock_t *int_lock = NULL; int call_count = -1; int i = 0; local = frame->local; priv = this->private; - int_lock = &local->internal_lock; - call_count = 
afr_locked_children_count (int_lock->inode_locked_nodes, - priv->child_count); + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); if (call_count == 0) { local->transaction.resume (frame, this); @@ -811,14 +870,17 @@ afr_ftruncate_wind (call_frame_t *frame, xlator_t *this) } local->call_count = call_count; + local->stable_write = _gf_true; for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] && int_lock->inode_locked_nodes[i]) { + if (local->transaction.pre_op[i]) { STACK_WIND_COOKIE (frame, afr_ftruncate_wind_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->ftruncate, - local->fd, local->cont.ftruncate.offset); + local->fd, + local->cont.ftruncate.offset, + NULL); if (!--call_count) break; @@ -873,14 +935,19 @@ afr_do_ftruncate (call_frame_t *frame, xlator_t *this) local->transaction.start = local->cont.ftruncate.offset; local->transaction.len = 0; - afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + op_ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + if (op_ret < 0) { + op_errno = -op_ret; + goto out; + } op_ret = 0; out: - if (op_ret == -1) { + if (op_ret < 0) { if (transaction_frame) AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (ftruncate, frame, op_ret, op_errno, NULL, NULL); + AFR_STACK_UNWIND (ftruncate, frame, op_ret, op_errno, NULL, + NULL, NULL); } return 0; @@ -889,13 +956,12 @@ out: int afr_ftruncate (call_frame_t *frame, xlator_t *this, - fd_t *fd, off_t offset) + fd_t *fd, off_t offset, dict_t *xdata) { afr_private_t * priv = NULL; afr_local_t * local = NULL; call_frame_t *transaction_frame = NULL; int ret = -1; - int op_ret = -1; int op_errno = 0; VALIDATE_OR_GOTO (frame, out); @@ -904,34 +970,33 @@ afr_ftruncate (call_frame_t *frame, xlator_t *this, priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); - ret = AFR_LOCAL_INIT (local, priv); - - if (ret < 0) { - op_errno = -ret; + if (afr_is_split_brain (this, 
fd->inode)) { + op_errno = EIO; goto out; } + QUORUM_CHECK(ftruncate,out); - frame->local = local; + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; + + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) + goto out; local->cont.ftruncate.offset = offset; - local->cont.ftruncate.ino = fd->inode->ino; local->fd = fd_ref (fd); - local->fop_call_continue = afr_do_ftruncate; - ret = afr_open_fd_fix (frame, this, _gf_true); - if (ret) { - op_errno = -ret; - goto out; - } + afr_open_fd_fix (fd, this); - op_ret = 0; + afr_do_ftruncate (frame, this); + + ret = 0; out: - if (op_ret == -1) { + if (ret < 0) { if (transaction_frame) AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (ftruncate, frame, op_ret, op_errno, NULL, NULL); + AFR_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL, NULL); } return 0; @@ -960,8 +1025,9 @@ afr_setattr_unwind (call_frame_t *frame, xlator_t *this) if (main_frame) { AFR_STACK_UNWIND (setattr, main_frame, local->op_ret, local->op_errno, - &local->cont.setattr.preop_buf, - &local->cont.setattr.postop_buf); + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, + NULL); } return 0; @@ -971,7 +1037,7 @@ afr_setattr_unwind (call_frame_t *frame, xlator_t *this) int afr_setattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop) + struct iatt *preop, struct iatt *postop, dict_t *xdata) { afr_local_t * local = NULL; afr_private_t * priv = NULL; @@ -991,29 +1057,14 @@ afr_setattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->read_child_returned = _gf_true; } - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.setattr.preop_buf = *preop; - local->cont.setattr.postop_buf = *postop; - } - - if (child_index == read_child) { - 
local->cont.setattr.preop_buf = *preop; - local->cont.setattr.postop_buf = *postop; - } + __inode_write_fop_cbk (frame, child_index, read_child, this, + &op_ret, &op_errno, preop, postop, + xdata); - local->success_count++; - - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } + if ((local->success_count >= priv->wait_count) + && local->read_child_returned) { + need_unwind = 1; } - local->op_errno = op_errno; } UNLOCK (&frame->lock); @@ -1035,16 +1086,14 @@ afr_setattr_wind (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - afr_internal_lock_t *int_lock = NULL; int call_count = -1; int i = 0; local = frame->local; priv = this->private; - int_lock = &local->internal_lock; - call_count = afr_locked_children_count (int_lock->inode_locked_nodes, - priv->child_count); + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); if (call_count == 0) { local->transaction.resume (frame, this); @@ -1054,14 +1103,15 @@ afr_setattr_wind (call_frame_t *frame, xlator_t *this) local->call_count = call_count; for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] && int_lock->inode_locked_nodes[i]) { + if (local->transaction.pre_op[i]) { STACK_WIND_COOKIE (frame, afr_setattr_wind_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->setattr, &local->loc, &local->cont.setattr.in_buf, - local->cont.setattr.valid); + local->cont.setattr.valid, + NULL); if (!--call_count) break; @@ -1089,13 +1139,12 @@ afr_setattr_done (call_frame_t *frame, xlator_t *this) int afr_setattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, struct iatt *buf, int32_t valid) + loc_t *loc, struct iatt *buf, int32_t valid, dict_t *xdata) { afr_private_t * priv = NULL; afr_local_t * local = NULL; call_frame_t *transaction_frame = NULL; int ret = -1; - int op_ret = -1; int op_errno = 0; VALIDATE_OR_GOTO (frame, out); @@ -1104,24 +1153,20 @@ 
afr_setattr (call_frame_t *frame, xlator_t *this, priv = this->private; + QUORUM_CHECK(setattr,out); + transaction_frame = copy_frame (frame); if (!transaction_frame) { + op_errno = ENOMEM; goto out; } - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); + local = transaction_frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } - - transaction_frame->local = local; - - local->op_ret = -1; - - local->cont.setattr.ino = loc->inode->ino; local->cont.setattr.in_buf = *buf; local->cont.setattr.valid = valid; @@ -1136,14 +1181,18 @@ afr_setattr (call_frame_t *frame, xlator_t *this, local->transaction.start = LLONG_MAX - 1; local->transaction.len = 0; - afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { + if (ret < 0) { if (transaction_frame) AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (setattr, frame, op_ret, op_errno, NULL, NULL); + AFR_STACK_UNWIND (setattr, frame, -1, op_errno, NULL, NULL, NULL); } return 0; @@ -1170,8 +1219,9 @@ afr_fsetattr_unwind (call_frame_t *frame, xlator_t *this) if (main_frame) { AFR_STACK_UNWIND (fsetattr, main_frame, local->op_ret, local->op_errno, - &local->cont.fsetattr.preop_buf, - &local->cont.fsetattr.postop_buf); + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, + NULL); } return 0; @@ -1181,7 +1231,7 @@ afr_fsetattr_unwind (call_frame_t *frame, xlator_t *this) int afr_fsetattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop) + struct iatt *preop, struct iatt *postop, dict_t *xdata) { afr_local_t * local = NULL; afr_private_t * priv = NULL; @@ -1201,29 +1251,14 
@@ afr_fsetattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->read_child_returned = _gf_true; } - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.fsetattr.preop_buf = *preop; - local->cont.fsetattr.postop_buf = *postop; - } - - if (child_index == read_child) { - local->cont.fsetattr.preop_buf = *preop; - local->cont.fsetattr.postop_buf = *postop; - } + __inode_write_fop_cbk (frame, child_index, read_child, this, + &op_ret, &op_errno, preop, postop, + xdata); - local->success_count++; - - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } + if ((local->success_count >= priv->wait_count) + && local->read_child_returned) { + need_unwind = 1; } - local->op_errno = op_errno; } UNLOCK (&frame->lock); @@ -1245,16 +1280,14 @@ afr_fsetattr_wind (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - afr_internal_lock_t *int_lock = NULL; int call_count = -1; int i = 0; local = frame->local; priv = this->private; - int_lock = &local->internal_lock; - call_count = afr_locked_children_count (int_lock->inode_locked_nodes, - priv->child_count); + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); if (call_count == 0) { local->transaction.resume (frame, this); @@ -1264,14 +1297,15 @@ afr_fsetattr_wind (call_frame_t *frame, xlator_t *this) local->call_count = call_count; for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] && int_lock->inode_locked_nodes[i]) { + if (local->transaction.pre_op[i]) { STACK_WIND_COOKIE (frame, afr_fsetattr_wind_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->fsetattr, local->fd, &local->cont.fsetattr.in_buf, - local->cont.fsetattr.valid); + local->cont.fsetattr.valid, + NULL); if (!--call_count) break; @@ -1298,13 
+1332,12 @@ afr_fsetattr_done (call_frame_t *frame, xlator_t *this) int afr_fsetattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, struct iatt *buf, int32_t valid) + fd_t *fd, struct iatt *buf, int32_t valid, dict_t *xdata) { afr_private_t * priv = NULL; afr_local_t * local = NULL; call_frame_t *transaction_frame = NULL; int ret = -1; - int op_ret = -1; int op_errno = 0; VALIDATE_OR_GOTO (frame, out); @@ -1313,24 +1346,26 @@ afr_fsetattr (call_frame_t *frame, xlator_t *this, priv = this->private; + if (afr_is_split_brain (this, fd->inode)) { + op_errno = EIO; + goto out; + } + + QUORUM_CHECK(fsetattr,out); + transaction_frame = copy_frame (frame); if (!transaction_frame) { op_errno = ENOMEM; goto out; } - ALLOC_OR_GOTO (local, afr_local_t, out); - transaction_frame->local = local; + AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); + local = transaction_frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } - local->op_ret = -1; - - local->cont.fsetattr.ino = fd->inode->ino; local->cont.fsetattr.in_buf = *buf; local->cont.fsetattr.valid = valid; @@ -1340,25 +1375,24 @@ afr_fsetattr (call_frame_t *frame, xlator_t *this, local->fd = fd_ref (fd); - op_ret = afr_open_fd_fix (transaction_frame, this, _gf_false); - if (ret) { - op_errno = -op_ret; - op_ret = -1; - goto out; - } + afr_open_fd_fix (fd, this); local->transaction.main_frame = frame; local->transaction.start = LLONG_MAX - 1; local->transaction.len = 0; - afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { + if (ret < 0) { if (transaction_frame) AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (fsetattr, frame, op_ret, op_errno, NULL, NULL); + AFR_STACK_UNWIND (fsetattr, frame, -1, 
op_errno, NULL, NULL, NULL); } return 0; @@ -1386,38 +1420,34 @@ afr_setxattr_unwind (call_frame_t *frame, xlator_t *this) if (main_frame) { AFR_STACK_UNWIND (setxattr, main_frame, - local->op_ret, local->op_errno) - } + local->op_ret, local->op_errno, + NULL); + } return 0; } int afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int call_count = -1; - int need_unwind = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int need_unwind = 0; + int child_index = (long) cookie; local = frame->local; priv = this->private; LOCK (&frame->lock); { - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - } - local->success_count++; - - if (local->success_count == priv->child_count) { - need_unwind = 1; - } + __inode_write_fop_cbk (frame, child_index, -1, this, + &op_ret, &op_errno, NULL, NULL, + xdata); + if (local->success_count == priv->child_count) { + need_unwind = 1; } - - local->op_errno = op_errno; } UNLOCK (&frame->lock); @@ -1437,18 +1467,16 @@ afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int afr_setxattr_wind (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_internal_lock_t *int_lock = NULL; - int call_count = -1; - int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int i = 0; local = frame->local; priv = this->private; - int_lock = &local->internal_lock; - call_count = afr_locked_children_count (int_lock->inode_locked_nodes, - priv->child_count); + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); if (call_count == 0) { local->transaction.resume (frame, this); @@ -1458,14 +1486,15 @@ afr_setxattr_wind (call_frame_t *frame, xlator_t *this) local->call_count = 
call_count; for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] && int_lock->inode_locked_nodes[i]) { + if (local->transaction.pre_op[i]) { STACK_WIND_COOKIE (frame, afr_setxattr_wind_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->setxattr, &local->loc, local->cont.setxattr.dict, - local->cont.setxattr.flags); + local->cont.setxattr.flags, + NULL); if (!--call_count) break; @@ -1479,7 +1508,7 @@ afr_setxattr_wind (call_frame_t *frame, xlator_t *this) int afr_setxattr_done (call_frame_t *frame, xlator_t *this) { - afr_local_t * local = frame->local; + afr_local_t *local = frame->local; local->transaction.unwind (frame, this); @@ -1490,37 +1519,40 @@ afr_setxattr_done (call_frame_t *frame, xlator_t *this) int afr_setxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *dict, int32_t flags) + loc_t *loc, dict_t *dict, int32_t flags, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_ret = -1; - int op_errno = 0; + int ret = -1; + int op_errno = EINVAL; - VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - priv = this->private; + GF_IF_INTERNAL_XATTR_GOTO ("trusted.afr.*", dict, + op_errno, out); - ALLOC_OR_GOTO (local, afr_local_t, out); + GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.afr.*", dict, + op_errno, out); - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + QUORUM_CHECK(setxattr,out); transaction_frame = copy_frame (frame); if (!transaction_frame) { + op_errno = ENOMEM; goto out; } - transaction_frame->local = local; + AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); + local = transaction_frame->local; - local->op_ret = -1; + ret = afr_local_init (local, priv, 
&op_errno); + if (ret < 0) + goto out; local->cont.setxattr.dict = dict_ref (dict); local->cont.setxattr.flags = flags; @@ -1535,14 +1567,211 @@ afr_setxattr (call_frame_t *frame, xlator_t *this, local->transaction.start = LLONG_MAX - 1; local->transaction.len = 0; - afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - op_ret = 0; + ret = 0; +out: + if (ret < 0) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); + } + + return 0; +} + +/* {{{ fsetxattr */ + + +int +afr_fsetxattr_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + AFR_STACK_UNWIND (fsetxattr, main_frame, + local->op_ret, local->op_errno, + NULL); + } + return 0; +} + + +int +afr_fsetxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int need_unwind = 0; + int child_index = (long) cookie; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + + __inode_write_fop_cbk (frame, child_index, -1, this, + &op_ret, &op_errno, NULL, NULL, + xdata); + if (local->success_count == priv->child_count) { + need_unwind = 1; + } + } + UNLOCK (&frame->lock); + + if (need_unwind) + local->transaction.unwind (frame, this); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_fsetxattr_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t 
*local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i]) { + STACK_WIND_COOKIE (frame, afr_fsetxattr_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->fsetxattr, + local->fd, + local->cont.fsetxattr.dict, + local->cont.fsetxattr.flags, + NULL); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_fsetxattr_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + +int +afr_fsetxattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, dict_t *dict, int32_t flags, dict_t *xdata) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = EINVAL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + GF_IF_INTERNAL_XATTR_GOTO ("trusted.afr.*", dict, + op_errno, out); + + GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.afr.*", dict, + op_errno, out); + + priv = this->private; + + if (afr_is_split_brain (this, fd->inode)) { + op_errno = EIO; + goto out; + } + + QUORUM_CHECK(fsetxattr,out); + + AFR_LOCAL_ALLOC_OR_GOTO (local, out); + + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) + goto out; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + goto out; + } + + transaction_frame->local = local; + + local->op_ret = -1; + + local->cont.fsetxattr.dict = dict_ref (dict); + local->cont.fsetxattr.flags = flags; + + local->transaction.fop = afr_fsetxattr_wind; + 
local->transaction.done = afr_fsetxattr_done; + local->transaction.unwind = afr_fsetxattr_unwind; + + local->fd = fd_ref (fd); + + local->transaction.main_frame = frame; + local->transaction.start = LLONG_MAX - 1; + local->transaction.len = 0; + + ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + ret = 0; out: - if (op_ret == -1) { + if (ret < 0) { if (transaction_frame) AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (setxattr, frame, op_ret, op_errno); + AFR_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL); } return 0; @@ -1550,6 +1779,7 @@ out: /* }}} */ + /* {{{ removexattr */ @@ -1571,38 +1801,34 @@ afr_removexattr_unwind (call_frame_t *frame, xlator_t *this) if (main_frame) { AFR_STACK_UNWIND (removexattr, main_frame, - local->op_ret, local->op_errno) - } + local->op_ret, local->op_errno, + NULL); + } return 0; } int afr_removexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int call_count = -1; - int need_unwind = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int need_unwind = 0; + int child_index = (long) cookie; local = frame->local; priv = this->private; LOCK (&frame->lock); { - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - } - local->success_count++; - - if (local->success_count == priv->wait_count) { - need_unwind = 1; - } + __inode_write_fop_cbk (frame, child_index, -1, this, + &op_ret, &op_errno, NULL, NULL, + xdata); + if (local->success_count == priv->wait_count) { + need_unwind = 1; } - - local->op_errno = op_errno; } UNLOCK (&frame->lock); @@ -1624,16 +1850,14 @@ afr_removexattr_wind (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - afr_internal_lock_t *int_lock = 
NULL; int call_count = -1; int i = 0; local = frame->local; priv = this->private; - int_lock = &local->internal_lock; - call_count = afr_locked_children_count (int_lock->inode_locked_nodes, - priv->child_count); + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); if (call_count == 0) { local->transaction.resume (frame, this); @@ -1643,13 +1867,14 @@ afr_removexattr_wind (call_frame_t *frame, xlator_t *this) local->call_count = call_count; for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] && int_lock->inode_locked_nodes[i]) { + if (local->transaction.pre_op[i]) { STACK_WIND_COOKIE (frame, afr_removexattr_wind_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->removexattr, &local->loc, - local->cont.removexattr.name); + local->cont.removexattr.name, + NULL); if (!--call_count) break; @@ -1675,7 +1900,192 @@ afr_removexattr_done (call_frame_t *frame, xlator_t *this) int afr_removexattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name) + loc_t *loc, const char *name, dict_t *xdata) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (this, out); + + GF_IF_NATIVE_XATTR_GOTO ("trusted.afr.*", + name, op_errno, out); + + GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.afr.*", + name, op_errno, out); + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this->private, out); + VALIDATE_OR_GOTO (loc, out); + + priv = this->private; + + QUORUM_CHECK(removexattr,out); + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + op_errno = ENOMEM; + goto out; + } + + AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); + local = transaction_frame->local; + + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) + goto out; + + local->cont.removexattr.name = gf_strdup (name); + + local->transaction.fop = afr_removexattr_wind; + local->transaction.done = 
afr_removexattr_done; + local->transaction.unwind = afr_removexattr_unwind; + + loc_copy (&local->loc, loc); + + local->transaction.main_frame = frame; + local->transaction.start = LLONG_MAX - 1; + local->transaction.len = 0; + + ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + ret = 0; +out: + if (ret < 0) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (removexattr, frame, -1, op_errno, NULL); + } + + return 0; +} + +/* ffremovexattr */ +int +afr_fremovexattr_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + AFR_STACK_UNWIND (fremovexattr, main_frame, + local->op_ret, local->op_errno, + NULL); + } + return 0; +} + + +int +afr_fremovexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + int call_count = -1; + int need_unwind = 0; + int child_index = (long) cookie; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + __inode_write_fop_cbk (frame, child_index, -1, this, + &op_ret, &op_errno, NULL, NULL, + xdata); + + if (local->success_count == priv->wait_count) { + need_unwind = 1; + } + } + UNLOCK (&frame->lock); + + if (need_unwind) + local->transaction.unwind (frame, this); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + + +int32_t +afr_fremovexattr_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int i = 0; + + local = frame->local; 
+ priv = this->private; + + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i]) { + STACK_WIND_COOKIE (frame, afr_fremovexattr_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->fremovexattr, + local->fd, + local->cont.removexattr.name, + NULL); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_fremovexattr_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_fremovexattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *name, dict_t *xdata) { afr_private_t * priv = NULL; afr_local_t * local = NULL; @@ -1684,21 +2094,33 @@ afr_removexattr (call_frame_t *frame, xlator_t *this, int op_ret = -1; int op_errno = 0; - VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); + + GF_IF_NATIVE_XATTR_GOTO ("trusted.afr.*", + name, op_errno, out); + + GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.afr.*", + name, op_errno, out); + + VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (loc, out); priv = this->private; + if (afr_is_split_brain (this, fd->inode)) { + op_errno = EIO; + goto out; + } + + QUORUM_CHECK(fremovexattr, out); transaction_frame = copy_frame (frame); if (!transaction_frame) { goto out; } - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (local, out); - ret = AFR_LOCAL_INIT (local, priv); + ret = afr_local_init (local, priv, &op_errno); if (ret < 0) { op_errno = -ret; goto out; @@ -1710,25 +2132,730 @@ afr_removexattr (call_frame_t *frame, xlator_t *this, local->cont.removexattr.name = gf_strdup (name); - local->transaction.fop = afr_removexattr_wind; - 
local->transaction.done = afr_removexattr_done; - local->transaction.unwind = afr_removexattr_unwind; + local->transaction.fop = afr_fremovexattr_wind; + local->transaction.done = afr_fremovexattr_done; + local->transaction.unwind = afr_fremovexattr_unwind; - loc_copy (&local->loc, loc); + local->fd = fd_ref (fd); local->transaction.main_frame = frame; local->transaction.start = LLONG_MAX - 1; local->transaction.len = 0; - afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + op_ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + if (op_ret < 0) { + op_errno = -op_ret; + goto out; + } + + op_ret = 0; +out: + if (op_ret < 0) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (fremovexattr, frame, op_ret, op_errno, NULL); + } + + return 0; +} + +static int +afr_fallocate_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + AFR_STACK_UNWIND (fallocate, main_frame, local->op_ret, + local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, + NULL); + } + return 0; +} + +static int +afr_fallocate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + int child_index = (long) cookie; + int call_count = -1; + int need_unwind = 0; + int read_child = 0; + + local = frame->local; + priv = this->private; + + read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); + + LOCK (&frame->lock); + { + if (child_index == read_child) { + local->read_child_returned = _gf_true; + } + + __inode_write_fop_cbk (frame, 
child_index, read_child, this, + &op_ret, &op_errno, prebuf, postbuf, + xdata); + + if ((local->success_count >= priv->wait_count) + && local->read_child_returned) { + need_unwind = 1; + } + } + UNLOCK (&frame->lock); + + if (need_unwind) + local->transaction.unwind (frame, this); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + +static int +afr_fallocate_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i]) { + STACK_WIND_COOKIE (frame, afr_fallocate_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->fallocate, + local->fd, + local->cont.fallocate.mode, + local->cont.fallocate.offset, + local->cont.fallocate.len, + NULL); + + if (!--call_count) + break; + } + } + + return 0; +} + +static int +afr_fallocate_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + +static int +afr_do_fallocate (call_frame_t *frame, xlator_t *this) +{ + call_frame_t * transaction_frame = NULL; + afr_local_t * local = NULL; + int op_ret = -1; + int op_errno = 0; + + local = frame->local; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + goto out; + } + + transaction_frame->local = local; + frame->local = NULL; + + local->op = GF_FOP_FALLOCATE; + + local->transaction.fop = afr_fallocate_wind; + local->transaction.done = afr_fallocate_done; + local->transaction.unwind = 
afr_fallocate_unwind; + + local->transaction.main_frame = frame; + + local->transaction.start = local->cont.fallocate.offset; + local->transaction.len = 0; + + /* fallocate can modify the file size */ + op_ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + if (op_ret < 0) { + op_errno = -op_ret; + goto out; + } + + op_ret = 0; +out: + if (op_ret < 0) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (fallocate, frame, op_ret, op_errno, NULL, + NULL, NULL); + } + + return 0; +} + +int +afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + if (afr_is_split_brain (this, fd->inode)) { + op_errno = EIO; + goto out; + } + QUORUM_CHECK(fallocate,out); + + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; + + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) + goto out; + + local->cont.fallocate.mode = mode; + local->cont.fallocate.offset = offset; + local->cont.fallocate.len = len; + + local->fd = fd_ref (fd); + + afr_open_fd_fix (fd, this); + + afr_do_fallocate (frame, this); + + ret = 0; +out: + if (ret < 0) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL); + } + + return 0; +} + +/* }}} */ + +/* {{{ discard */ + +static int +afr_discard_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + 
if (main_frame) { + AFR_STACK_UNWIND (discard, main_frame, local->op_ret, + local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, + NULL); + } + return 0; +} + +static int +afr_discard_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + int child_index = (long) cookie; + int call_count = -1; + int need_unwind = 0; + int read_child = 0; + + local = frame->local; + priv = this->private; + + read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); + + LOCK (&frame->lock); + { + if (child_index == read_child) { + local->read_child_returned = _gf_true; + } + + __inode_write_fop_cbk (frame, child_index, read_child, this, + &op_ret, &op_errno, prebuf, postbuf, + xdata); + + if ((local->success_count >= priv->wait_count) + && local->read_child_returned) { + need_unwind = 1; + } + } + UNLOCK (&frame->lock); + + if (need_unwind) + local->transaction.unwind (frame, this); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + +static int +afr_discard_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i]) { + STACK_WIND_COOKIE (frame, afr_discard_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->discard, + local->fd, + local->cont.discard.offset, + local->cont.discard.len, + NULL); + + if (!--call_count) + break; + } + } + + 
return 0; +} + +static int +afr_discard_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + +static int +afr_do_discard (call_frame_t *frame, xlator_t *this) +{ + call_frame_t * transaction_frame = NULL; + afr_local_t * local = NULL; + int op_ret = -1; + int op_errno = 0; + + local = frame->local; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + goto out; + } + + transaction_frame->local = local; + frame->local = NULL; + + local->op = GF_FOP_DISCARD; + + local->transaction.fop = afr_discard_wind; + local->transaction.done = afr_discard_done; + local->transaction.unwind = afr_discard_unwind; + + local->transaction.main_frame = frame; + + local->transaction.start = local->cont.discard.offset; + local->transaction.len = 0; + + op_ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + if (op_ret < 0) { + op_errno = -op_ret; + goto out; + } op_ret = 0; out: - if (op_ret == -1) { + if (op_ret < 0) { if (transaction_frame) AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (removexattr, frame, op_ret, op_errno); + AFR_STACK_UNWIND (discard, frame, op_ret, op_errno, NULL, + NULL, NULL); + } + + return 0; +} + +int +afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + if (afr_is_split_brain (this, fd->inode)) { + op_errno = EIO; + goto out; + } + QUORUM_CHECK(discard, out); + + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; + + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) + goto out; + + local->cont.discard.offset = 
offset; + local->cont.discard.len = len; + + local->fd = fd_ref (fd); + + afr_open_fd_fix (fd, this); + + afr_do_discard(frame, this); + + ret = 0; +out: + if (ret < 0) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL); + } + + return 0; +} + + +/* {{{ zerofill */ + +static int +afr_zerofill_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) { + main_frame = local->transaction.main_frame; + } + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + AFR_STACK_UNWIND (zerofill, main_frame, local->op_ret, + local->op_errno, + &local->cont.zerofill.prebuf, + &local->cont.zerofill.postbuf, + NULL); + } + return 0; +} + +static int +afr_zerofill_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int child_index = (long) cookie; + int call_count = -1; + int need_unwind = 0; + int read_child = 0; + + local = frame->local; + priv = this->private; + + read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); + + LOCK (&frame->lock); + { + if (child_index == read_child) { + local->read_child_returned = _gf_true; + } + + if (afr_fop_failed (op_ret, op_errno)) { + afr_transaction_fop_failed (frame, this, child_index); + } + + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + local->cont.zerofill.prebuf = *prebuf; + local->cont.zerofill.postbuf = *postbuf; + } + + if (child_index == read_child) { + local->cont.zerofill.prebuf = *prebuf; + local->cont.zerofill.postbuf = *postbuf; + } + + local->success_count++; + + if ((local->success_count >= priv->wait_count) + && local->read_child_returned) { + 
need_unwind = 1; + } + } + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + if (need_unwind) { + local->transaction.unwind (frame, this); + } + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + +static int +afr_zerofill_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i]) { + STACK_WIND_COOKIE (frame, afr_zerofill_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->zerofill, + local->fd, + local->cont.zerofill.offset, + local->cont.zerofill.len, + NULL); + + if (!--call_count) + break; + } + } + + return 0; +} + +static int +afr_zerofill_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + +static int +afr_do_zerofill(call_frame_t *frame, xlator_t *this) +{ + call_frame_t *transaction_frame = NULL; + afr_local_t *local = NULL; + int op_ret = -1; + int op_errno = 0; + + local = frame->local; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + goto out; + } + + transaction_frame->local = local; + frame->local = NULL; + + local->op = GF_FOP_ZEROFILL; + + local->transaction.fop = afr_zerofill_wind; + local->transaction.done = afr_zerofill_done; + local->transaction.unwind = afr_zerofill_unwind; + + local->transaction.main_frame = frame; + + local->transaction.start = local->cont.zerofill.offset; + local->transaction.len = 0; + + op_ret = afr_transaction 
(transaction_frame, this, + AFR_DATA_TRANSACTION); + if (op_ret < 0) { + op_errno = -op_ret; + goto out; + } + + op_ret = 0; +out: + if (op_ret < 0) { + if (transaction_frame) { + AFR_STACK_DESTROY (transaction_frame); + } + AFR_STACK_UNWIND (zerofill, frame, op_ret, op_errno, NULL, + NULL, NULL); } return 0; } + +int +afr_zerofill (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + if (afr_is_split_brain (this, fd->inode)) { + op_errno = EIO; + goto out; + } + QUORUM_CHECK(zerofill, out); + + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; + + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) { + goto out; + } + local->cont.zerofill.offset = offset; + local->cont.zerofill.len = len; + + local->fd = fd_ref (fd); + + afr_open_fd_fix (fd, this); + + afr_do_zerofill(frame, this); + + ret = 0; +out: + if (ret < 0) { + if (transaction_frame) { + AFR_STACK_DESTROY (transaction_frame); + } + AFR_STACK_UNWIND (zerofill, frame, -1, op_errno, NULL, + NULL, NULL); + } + + return 0; +} + +/* }}} */ + + diff --git a/xlators/cluster/afr/src/afr-inode-write.h b/xlators/cluster/afr/src/afr-inode-write.h index f9aa7bd36..8e93ca44a 100644 --- a/xlators/cluster/afr/src/afr-inode-write.h +++ b/xlators/cluster/afr/src/afr-inode-write.h @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. 
- - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef __INODE_WRITE_H__ @@ -22,51 +13,70 @@ int32_t afr_chmod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode); + loc_t *loc, mode_t mode, dict_t *xdata); int32_t afr_chown (call_frame_t *frame, xlator_t *this, - loc_t *loc, uid_t uid, gid_t gid); + loc_t *loc, uid_t uid, gid_t gid, dict_t *xdata); int afr_fchown (call_frame_t *frame, xlator_t *this, - fd_t *fd, uid_t uid, gid_t gid); + fd_t *fd, uid_t uid, gid_t gid, dict_t *xdata); int32_t afr_fchmod (call_frame_t *frame, xlator_t *this, - fd_t *fd, mode_t mode); + fd_t *fd, mode_t mode, dict_t *xdata); int32_t -afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, +afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, int32_t count, off_t offset, - struct iobref *iobref); + uint32_t flags, struct iobref *iobref, dict_t *xdata); int32_t afr_truncate (call_frame_t *frame, xlator_t *this, - loc_t *loc, off_t offset); + loc_t *loc, off_t offset, dict_t *xdata); int32_t afr_ftruncate (call_frame_t *frame, xlator_t *this, - fd_t *fd, off_t offset); + fd_t *fd, off_t offset, dict_t *xdata); int32_t afr_utimens (call_frame_t *frame, xlator_t *this, - loc_t *loc, struct timespec tv[2]); + loc_t *loc, struct timespec tv[2], dict_t *xdata); 
int afr_setattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, struct iatt *buf, int32_t valid); + loc_t *loc, struct iatt *buf, int32_t valid, dict_t *xdata); int afr_fsetattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, struct iatt *buf, int32_t valid); + fd_t *fd, struct iatt *buf, int32_t valid, dict_t *xdata); int32_t afr_setxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *dict, int32_t flags); + loc_t *loc, dict_t *dict, int32_t flags, dict_t *xdata); + +int32_t +afr_fsetxattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, dict_t *dict, int32_t flags, dict_t *xdata); int32_t afr_removexattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name); + loc_t *loc, const char *name, dict_t *xdata); + +int32_t +afr_fremovexattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *name, dict_t *xdata); +int +afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata); + +int +afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata); + +int +afr_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata); #endif /* __INODE_WRITE_H__ */ diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c index c8dd8b635..060d78f35 100644 --- a/xlators/cluster/afr/src/afr-lk-common.c +++ b/xlators/cluster/afr/src/afr-lk-common.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. 
- - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #include "dict.h" @@ -31,8 +22,69 @@ #define LOCKED_YES 0x1 /* for DATA, METADATA, ENTRY and higher_path */ #define LOCKED_LOWER 0x2 /* for lower path */ +#define AFR_TRACE_INODELK_IN(frame, this, params ...) \ + do { \ + afr_private_t *_priv = this->private; \ + if (!_priv->inodelk_trace) \ + break; \ + afr_trace_inodelk_in (frame, this, params); \ + } while (0); + +#define AFR_TRACE_INODELK_OUT(frame, this, params ...) \ + do { \ + afr_private_t *_priv = this->private; \ + if (!_priv->inodelk_trace) \ + break; \ + afr_trace_inodelk_out (frame, this, params); \ + } while (0); + +#define AFR_TRACE_ENTRYLK_IN(frame, this, params ...) \ + do { \ + afr_private_t *_priv = this->private; \ + if (!_priv->entrylk_trace) \ + break; \ + afr_trace_entrylk_in (frame, this, params); \ + } while (0); + +#define AFR_TRACE_ENTRYLK_OUT(frame, this, params ...) 
\ + do { \ + afr_private_t *_priv = this->private; \ + if (!_priv->entrylk_trace) \ + break; \ + afr_trace_entrylk_out (frame, this, params); \ + } while (0); + int -afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index); +afr_entry_lockee_cmp (const void *l1, const void *l2) +{ + const afr_entry_lockee_t *r1 = l1; + const afr_entry_lockee_t *r2 = l2; + int ret = 0; + uuid_t gfid1 = {0}; + uuid_t gfid2 = {0}; + + loc_gfid ((loc_t*)&r1->loc, gfid1); + loc_gfid ((loc_t*)&r2->loc, gfid2); + ret = uuid_compare (gfid1, gfid2); + /*Entrylks with NULL basename are the 'smallest'*/ + if (ret == 0) { + if (!r1->basename) + return -1; + if (!r2->basename) + return 1; + ret = strcmp (r1->basename, r2->basename); + } + + if (ret <= 0) + return -1; + else + return 1; +} + +int afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index); + +static int +afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this); static uint64_t afr_lock_number = 1; @@ -57,12 +109,13 @@ afr_set_lock_number (call_frame_t *frame, xlator_t *this) } void -afr_set_lk_owner (call_frame_t *frame, xlator_t *this) +afr_set_lk_owner (call_frame_t *frame, xlator_t *this, void *lk_owner) { gf_log (this->name, GF_LOG_TRACE, "Setting lk-owner=%llu", - (unsigned long long) (unsigned long)frame->root); - frame->root->lk_owner = (uint64_t) (unsigned long)frame->root; + (unsigned long long) (unsigned long)lk_owner); + + set_lk_owner_from_ptr (&frame->root->lk_owner, lk_owner); } static int @@ -98,16 +151,9 @@ internal_lock_count (call_frame_t *frame, xlator_t *this) local = frame->local; priv = this->private; - if (local->fd) { - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] && local->fd_open_on[i]) - ++call_count; - } - } else { - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) - ++call_count; - } + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) + ++call_count; } return call_count; @@ -115,7 +161,7 @@ internal_lock_count 
(call_frame_t *frame, xlator_t *this) static void afr_print_inodelk (char *str, int size, int cmd, - struct gf_flock *flock, uint64_t owner) + struct gf_flock *flock, gf_lkowner_t *owner) { char *cmd_str = NULL; char *type_str = NULL; @@ -163,11 +209,11 @@ afr_print_inodelk (char *str, int size, int cmd, } snprintf (str, size, "lock=INODELK, cmd=%s, type=%s, " - "start=%llu, len=%llu, pid=%llu, lk-owner=%llu", + "start=%llu, len=%llu, pid=%llu, lk-owner=%s", cmd_str, type_str, (unsigned long long) flock->l_start, (unsigned long long) flock->l_len, (unsigned long long) flock->l_pid, - (unsigned long long) owner); + lkowner_utoa (owner)); } @@ -183,11 +229,11 @@ afr_print_lockee (char *str, int size, loc_t *loc, fd_t *fd, void afr_print_entrylk (char *str, int size, const char *basename, - uint64_t owner) + gf_lkowner_t *owner) { - snprintf (str, size, "Basename=%s, lk-owner=%llu", + snprintf (str, size, "Basename=%s, lk-owner=%s", basename ? basename : "<nul>", - (unsigned long long)owner); + lkowner_utoa (owner)); } static void @@ -241,27 +287,20 @@ afr_set_lock_call_type (afr_lock_call_type_t lock_call_type, } static void -afr_trace_inodelk_out (call_frame_t *frame, afr_lock_call_type_t lock_call_type, +afr_trace_inodelk_out (call_frame_t *frame, xlator_t *this, + afr_lock_call_type_t lock_call_type, afr_lock_op_type_t lk_op_type, struct gf_flock *flock, int op_ret, int op_errno, int32_t child_index) { - xlator_t *this = NULL; afr_internal_lock_t *int_lock = NULL; afr_local_t *local = NULL; - afr_private_t *priv = NULL; char lockee[256]; char lock_call_type_str[256]; char verdict[16]; - this = THIS; local = frame->local; int_lock = &local->internal_lock; - priv = this->private; - - if (!priv->inodelk_trace) { - return; - } afr_print_lockee (lockee, 256, &local->loc, local->fd, child_index); @@ -270,39 +309,31 @@ afr_trace_inodelk_out (call_frame_t *frame, afr_lock_call_type_t lock_call_type, afr_print_verdict (op_ret, op_errno, verdict); gf_log (this->name, 
GF_LOG_INFO, - "[%s %s] [%s] Lockee={%s} Number={%llu}", + "[%s %s] [%s] lk-owner=%s Lockee={%s} Number={%llu}", lock_call_type_str, lk_op_type == AFR_LOCK_OP ? "LOCK REPLY" : "UNLOCK REPLY", - verdict, - lockee, + verdict, lkowner_utoa (&frame->root->lk_owner), lockee, (unsigned long long) int_lock->lock_number); } static void -afr_trace_inodelk_in (call_frame_t *frame, afr_lock_call_type_t lock_call_type, +afr_trace_inodelk_in (call_frame_t *frame, xlator_t *this, + afr_lock_call_type_t lock_call_type, afr_lock_op_type_t lk_op_type, struct gf_flock *flock, int32_t cmd, int32_t child_index) { - xlator_t *this = NULL; afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; - afr_private_t *priv = NULL; char lock[256]; char lockee[256]; char lock_call_type_str[256]; - this = THIS; local = frame->local; int_lock = &local->internal_lock; - priv = this->private; - if (!priv->inodelk_trace) { - return; - } - - afr_print_inodelk (lock, 256, cmd, flock, frame->root->lk_owner); + afr_print_inodelk (lock, 256, cmd, flock, &frame->root->lk_owner); afr_print_lockee (lockee, 256, &local->loc, local->fd, child_index); afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock); @@ -317,20 +348,21 @@ afr_trace_inodelk_in (call_frame_t *frame, afr_lock_call_type_t lock_call_type, } static void -afr_trace_entrylk_in (call_frame_t *frame, afr_lock_call_type_t lock_call_type, +afr_trace_entrylk_in (call_frame_t *frame, xlator_t *this, + afr_lock_call_type_t lock_call_type, afr_lock_op_type_t lk_op_type, const char *basename, - int32_t child_index) + int32_t cookie) { - xlator_t *this = NULL; afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; afr_private_t *priv = NULL; + int child_index = 0; + int lockee_no = 0; char lock[256]; char lockee[256]; char lock_call_type_str[256]; - this = THIS; local = frame->local; int_lock = &local->internal_lock; priv = this->private; @@ -338,36 +370,41 @@ afr_trace_entrylk_in (call_frame_t *frame, afr_lock_call_type_t 
lock_call_type, if (!priv->entrylk_trace) { return; } + lockee_no = cookie / priv->child_count; + child_index = cookie % priv->child_count; - afr_print_entrylk (lock, 256, basename, frame->root->lk_owner); - afr_print_lockee (lockee, 256, &local->loc, local->fd, child_index); + afr_print_entrylk (lock, 256, basename, &frame->root->lk_owner); + afr_print_lockee (lockee, 256, &int_lock->lockee[lockee_no].loc, local->fd, + child_index); afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock); gf_log (this->name, GF_LOG_INFO, - "[%s %s] Lock={%s} Lockee={%s} Number={%llu}", + "[%s %s] Lock={%s} Lockee={%s} Number={%llu}, Cookie={%d}", lock_call_type_str, lk_op_type == AFR_LOCK_OP ? "LOCK REQUEST" : "UNLOCK REQUEST", lock, lockee, - (unsigned long long) int_lock->lock_number); + (unsigned long long) int_lock->lock_number, + cookie); } static void -afr_trace_entrylk_out (call_frame_t *frame, afr_lock_call_type_t lock_call_type, - afr_lock_op_type_t lk_op_type, const char *basename, int op_ret, - int op_errno, int32_t child_index) +afr_trace_entrylk_out (call_frame_t *frame, xlator_t *this, + afr_lock_call_type_t lock_call_type, + afr_lock_op_type_t lk_op_type, const char *basename, + int op_ret, int op_errno, int32_t cookie) { - xlator_t *this = NULL; afr_internal_lock_t *int_lock = NULL; afr_local_t *local = NULL; afr_private_t *priv = NULL; + int lockee_no = 0; + int child_index = 0; char lock[256]; char lockee[256]; char lock_call_type_str[256]; char verdict[16]; - this = THIS; local = frame->local; int_lock = &local->internal_lock; priv = this->private; @@ -375,20 +412,25 @@ afr_trace_entrylk_out (call_frame_t *frame, afr_lock_call_type_t lock_call_type, if (!priv->entrylk_trace) { return; } + lockee_no = cookie / priv->child_count; + child_index = cookie % priv->child_count; - afr_print_lockee (lockee, 256, &local->loc, local->fd, child_index); + afr_print_entrylk (lock, 256, basename, &frame->root->lk_owner); + afr_print_lockee (lockee, 256, 
&int_lock->lockee[lockee_no].loc, local->fd, + child_index); afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock); afr_print_verdict (op_ret, op_errno, verdict); gf_log (this->name, GF_LOG_INFO, - "[%s %s] [%s] Lock={%s} Lockee={%s} Number={%llu}", + "[%s %s] [%s] Lock={%s} Lockee={%s} Number={%llu} Cookie={%d}", lock_call_type_str, lk_op_type == AFR_LOCK_OP ? "LOCK REPLY" : "UNLOCK REPLY", verdict, lock, lockee, - (unsigned long long) int_lock->lock_number); + (unsigned long long) int_lock->lock_number, + cookie); } @@ -441,6 +483,47 @@ is_afr_lock_transaction (afr_local_t *local) return ret; } +int +afr_init_entry_lockee (afr_entry_lockee_t *lockee, afr_local_t *local, + loc_t *loc, char *basename, int child_count) +{ + int ret = -1; + + loc_copy (&lockee->loc, loc); + lockee->basename = (basename)? gf_strdup (basename): NULL; + if (basename && !lockee->basename) + goto out; + + lockee->locked_count = 0; + lockee->locked_nodes = GF_CALLOC (child_count, + sizeof (*lockee->locked_nodes), + gf_afr_mt_afr_node_character); + + if (!lockee->locked_nodes) + goto out; + + ret = 0; +out: + return ret; + +} + +void +afr_entry_lockee_cleanup (afr_internal_lock_t *int_lock) +{ + int i = 0; + + for (i = 0; i < int_lock->lockee_count; i++) { + loc_wipe (&int_lock->lockee[i].loc); + if (int_lock->lockee[i].basename) + GF_FREE (int_lock->lockee[i].basename); + if (int_lock->lockee[i].locked_nodes) + GF_FREE (int_lock->lockee[i].locked_nodes); + } + + return; +} + static int initialize_entrylk_variables (call_frame_t *frame, xlator_t *this) { @@ -458,8 +541,13 @@ initialize_entrylk_variables (call_frame_t *frame, xlator_t *this) int_lock->lock_op_ret = -1; int_lock->lock_op_errno = 0; - for (i = 0; i < priv->child_count; i++) { - int_lock->entry_locked_nodes[i] = 0; + for (i = 0; i < AFR_LOCKEE_COUNT_MAX; i++) { + if (!int_lock->lockee[i].locked_nodes) + break; + int_lock->lockee[i].locked_count = 0; + memset (int_lock->lockee[i].locked_nodes, 0, + sizeof 
(*int_lock->lockee[i].locked_nodes) * + priv->child_count); } return 0; @@ -471,19 +559,23 @@ initialize_inodelk_variables (call_frame_t *frame, xlator_t *this) afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; afr_private_t *priv = NULL; - int i = 0; + afr_inodelk_t *inodelk = NULL; priv = this->private; local = frame->local; int_lock = &local->internal_lock; - int_lock->inodelk_lock_count = 0; - int_lock->lock_op_ret = -1; - int_lock->lock_op_errno = 0; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); - for (i = 0; i < priv->child_count; i++) { - int_lock->inode_locked_nodes[i] = 0; - } + inodelk->lock_count = 0; + int_lock->lk_attempted_count = 0; + int_lock->lock_op_ret = -1; + int_lock->lock_op_errno = 0; + + memset (inodelk->locked_nodes, 0, + sizeof (*inodelk->locked_nodes) * priv->child_count); + memset (int_lock->locked_nodes, 0, + sizeof (*int_lock->locked_nodes) * priv->child_count); return 0; } @@ -493,7 +585,7 @@ lower_path (loc_t *l1, const char *b1, loc_t *l2, const char *b2) { int ret = 0; - ret = strcmp (l1->path, l2->path); + ret = uuid_compare (l1->inode->gfid, l2->inode->gfid); if (ret == 0) ret = strcmp (b1, b2); @@ -505,6 +597,18 @@ lower_path (loc_t *l1, const char *b1, loc_t *l2, const char *b2) } int +afr_lockee_locked_nodes_count (afr_internal_lock_t *int_lock) +{ + int call_count = 0; + int i = 0; + + for (i = 0; i < int_lock->lockee_count; i++) + call_count += int_lock->lockee[i].locked_count; + + return call_count; +} + +int afr_locked_nodes_count (unsigned char *locked_nodes, int child_count) { @@ -522,7 +626,7 @@ afr_locked_nodes_count (unsigned char *locked_nodes, int child_count) /* FIXME: What if UNLOCK fails */ static int32_t afr_unlock_common_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; @@ -548,33 +652,37 @@ afr_unlock_common_cbk (call_frame_t 
*frame, void *cookie, xlator_t *this, static int32_t afr_unlock_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; int32_t child_index = (long)cookie; + afr_private_t *priv = NULL; local = frame->local; int_lock = &local->internal_lock; - afr_trace_inodelk_out (frame, AFR_INODELK_TRANSACTION, + AFR_TRACE_INODELK_OUT (frame, this, AFR_INODELK_TRANSACTION, AFR_UNLOCK_OP, NULL, op_ret, op_errno, child_index); + priv = this->private; + if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) { - gf_log (this->name, GF_LOG_ERROR, - "%s: unlock failed on %d, reason: %s", - local->loc.path, child_index, strerror (op_errno)); + gf_log (this->name, GF_LOG_INFO, "%s: unlock failed on subvolume %s " + "with lock owner %s", local->loc.path, + priv->children[child_index]->name, + lkowner_utoa (&frame->root->lk_owner)); } - int_lock->inode_locked_nodes[child_index] &= LOCKED_NO; - - if (op_ret == 1) { + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + inodelk->locked_nodes[child_index] &= LOCKED_NO; + if (local->transaction.eager_lock) local->transaction.eager_lock[child_index] = 0; - } - afr_unlock_common_cbk (frame, cookie, this, op_ret, op_errno); + afr_unlock_common_cbk (frame, cookie, this, op_ret, op_errno, xdata); return 0; @@ -584,11 +692,12 @@ static int afr_unlock_inodelk (call_frame_t *frame, xlator_t *this) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; afr_private_t *priv = NULL; struct gf_flock flock = {0,}; struct gf_flock full_flock = {0,}; - struct gf_flock *flock_use = &flock; + struct gf_flock *flock_use = NULL; int call_count = 0; int i = 0; int piggyback = 0; @@ -599,18 +708,14 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this) int_lock = &local->internal_lock; priv = this->private; - 
flock.l_start = int_lock->lk_flock.l_start; - flock.l_len = int_lock->lk_flock.l_len; - flock.l_type = F_UNLCK; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); - - gf_log (this->name, GF_LOG_DEBUG, "attempting data unlock range %"PRIu64 - " %"PRIu64" by %"PRIu64, flock.l_start, flock.l_len, - frame->root->lk_owner); + flock.l_start = inodelk->flock.l_start; + flock.l_len = inodelk->flock.l_len; + flock.l_type = F_UNLCK; full_flock.l_type = F_UNLCK; - - call_count = afr_locked_nodes_count (int_lock->inode_locked_nodes, + call_count = afr_locked_nodes_count (inodelk->locked_nodes, priv->child_count); int_lock->lk_call_count = call_count; @@ -626,60 +731,64 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this) fd_ctx = afr_fd_ctx_get (local->fd, this); for (i = 0; i < priv->child_count; i++) { - if ((int_lock->inode_locked_nodes[i] & LOCKED_YES) - != LOCKED_YES) + if ((inodelk->locked_nodes[i] & LOCKED_YES) != LOCKED_YES) continue; if (local->fd) { + flock_use = &flock; if (!local->transaction.eager_lock[i]) { goto wind; } piggyback = 0; - flock_use = &full_flock; LOCK (&local->fd->lock); { if (fd_ctx->lock_piggyback[i]) { fd_ctx->lock_piggyback[i]--; piggyback = 1; + } else { + fd_ctx->lock_acquired[i]--; } } UNLOCK (&local->fd->lock); if (piggyback) { afr_unlock_inodelk_cbk (frame, (void *) (long) i, - this, 1, 0); + this, 1, 0, NULL); if (!--call_count) break; continue; } - fd_ctx->lock_acquired[i]--; + flock_use = &full_flock; wind: - afr_trace_inodelk_in (frame, AFR_INODELK_TRANSACTION, - AFR_UNLOCK_OP, &flock, F_SETLK, i); + AFR_TRACE_INODELK_IN (frame, this, + AFR_INODELK_TRANSACTION, + AFR_UNLOCK_OP, flock_use, F_SETLK, + i); STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk, (void *) (long)i, priv->children[i], priv->children[i]->fops->finodelk, - this->name, local->fd, - F_SETLK, &flock); + int_lock->domain, local->fd, + F_SETLK, flock_use, NULL); if (!--call_count) break; } else { - afr_trace_inodelk_in (frame, AFR_INODELK_TRANSACTION, + 
AFR_TRACE_INODELK_IN (frame, this, + AFR_INODELK_TRANSACTION, AFR_UNLOCK_OP, &flock, F_SETLK, i); STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk, (void *) (long)i, priv->children[i], priv->children[i]->fops->inodelk, - this->name, &local->loc, - F_SETLK, &flock); + int_lock->domain, &local->loc, + F_SETLK, &flock, NULL); if (!--call_count) break; @@ -691,24 +800,34 @@ out: static int32_t afr_unlock_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t *local = NULL; - int32_t child_index = (long)cookie; + afr_private_t *priv = NULL; + afr_internal_lock_t *int_lock = NULL; + int32_t child_index = 0; + int lockee_no = 0; + + priv = this->private; + lockee_no = (int)((long) cookie) / priv->child_count; + child_index = (int) ((long) cookie) % priv->child_count; local = frame->local; + int_lock = &local->internal_lock; - afr_trace_entrylk_out (frame, AFR_ENTRYLK_TRANSACTION, - AFR_UNLOCK_OP, NULL, op_ret, - op_errno, child_index); + AFR_TRACE_ENTRYLK_OUT (frame, this, AFR_ENTRYLK_TRANSACTION, + AFR_UNLOCK_OP, + int_lock->lockee[lockee_no].basename, op_ret, + op_errno, (int) ((long)cookie)); - if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) { + if (op_ret < 0) { gf_log (this->name, GF_LOG_ERROR, "%s: unlock failed on %d, reason: %s", local->loc.path, child_index, strerror (op_errno)); } - afr_unlock_common_cbk (frame, cookie, this, op_ret, op_errno); + int_lock->lockee[lockee_no].locked_nodes[child_index] &= LOCKED_NO; + afr_unlock_common_cbk (frame, cookie, this, op_ret, op_errno, NULL); return 0; } @@ -716,24 +835,22 @@ afr_unlock_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, static int afr_unlock_entrylk (call_frame_t *frame, xlator_t *this) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - const char *basename = NULL; - loc_t *loc = NULL; - int call_count = 0; - int i = 
-1; + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int index = 0; + int lockee_no = 0; + int copies = 0; + int i = -1; local = frame->local; int_lock = &local->internal_lock; priv = this->private; + copies = priv->child_count; - basename = int_lock->lk_basename; - if (int_lock->lk_loc) - loc = int_lock->lk_loc; + call_count = afr_lockee_locked_nodes_count (int_lock); - call_count = afr_locked_nodes_count (int_lock->entry_locked_nodes, - priv->child_count); int_lock->lk_call_count = call_count; if (!call_count){ @@ -743,18 +860,23 @@ afr_unlock_entrylk (call_frame_t *frame, xlator_t *this) goto out; } - for (i = 0; i < priv->child_count; i++) { - if (int_lock->entry_locked_nodes[i] & LOCKED_YES) { - afr_trace_entrylk_in (frame, AFR_ENTRYLK_NB_TRANSACTION, - AFR_UNLOCK_OP, basename, i); + for (i = 0; i < int_lock->lockee_count * priv->child_count; i++) { + lockee_no = i / copies; + index = i % copies; + if (int_lock->lockee[lockee_no].locked_nodes[index] & LOCKED_YES) { + AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_NB_TRANSACTION, + AFR_UNLOCK_OP, + int_lock->lockee[lockee_no].basename, + i); STACK_WIND_COOKIE (frame, afr_unlock_entrylk_cbk, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->entrylk, - this->name, - loc, basename, - ENTRYLK_UNLOCK, ENTRYLK_WRLCK); + priv->children[index], + priv->children[index]->fops->entrylk, + int_lock->domain, + &int_lock->lockee[lockee_no].loc, + int_lock->lockee[lockee_no].basename, + ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL); if (!--call_count) break; @@ -768,15 +890,22 @@ out: static int32_t afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - int child_index = (long) cookie; + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = 
NULL; + int cky = (long) cookie; + int child_index = 0; + int lockee_no = 0; + priv = this->private; local = frame->local; int_lock = &local->internal_lock; + child_index = ((int)cky) % priv->child_count; + lockee_no = ((int)cky) / priv->child_count; + LOCK (&frame->lock); { if (op_ret == -1) { @@ -789,10 +918,11 @@ afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int_lock->lock_op_ret = op_ret; } - local->child_up[child_index] = 0; local->op_errno = op_errno; int_lock->lock_op_errno = op_errno; } + + int_lock->lk_attempted_count++; } UNLOCK (&frame->lock); @@ -801,10 +931,17 @@ afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, afr_unlock (frame, this); } else { if (op_ret == 0) { - int_lock->locked_nodes[child_index] |= LOCKED_YES; - int_lock->lock_count++; + if (local->transaction.type == AFR_ENTRY_TRANSACTION || + local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) { + int_lock->lockee[lockee_no].locked_nodes[child_index] |= LOCKED_YES; + int_lock->lockee[lockee_no].locked_count++; + int_lock->entrylk_lock_count++; + } else { + int_lock->locked_nodes[child_index] |= LOCKED_YES; + int_lock->lock_count++; + } } - afr_lock_blocking (frame, this, child_index + 1); + afr_lock_blocking (frame, this, cky + 1); } return 0; @@ -812,99 +949,26 @@ afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, static int32_t afr_blocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_trace_inodelk_out (frame, AFR_INODELK_TRANSACTION, + AFR_TRACE_INODELK_OUT (frame, this, AFR_INODELK_TRANSACTION, AFR_LOCK_OP, NULL, op_ret, op_errno, (long) cookie); - afr_lock_cbk (frame, cookie, this, op_ret, op_errno); + afr_lock_cbk (frame, cookie, this, op_ret, op_errno, xdata); return 0; } static int32_t -afr_lock_lower_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_internal_lock_t 
*int_lock = NULL; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - loc_t *lower = NULL; - loc_t *higher = NULL; - const char *higher_name = NULL; - int child_index = (long) cookie; - - priv = this->private; - local = frame->local; - int_lock = &local->internal_lock; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - if (op_errno == ENOSYS) { - /* return ENOTSUP */ - - gf_log (this->name, GF_LOG_ERROR, - "subvolume does not support locking. " - "please load features/locks xlator on server"); - - local->op_ret = op_ret; - } - - local->child_up[child_index] = 0; - local->op_errno = op_errno; - } - } - UNLOCK (&frame->lock); - - if (op_ret != 0) { - afr_unlock (frame, this); - goto out; - } else { - int_lock->lower_locked_nodes[child_index] |= LOCKED_LOWER; - int_lock->lock_count++; - } - - /* The lower path has been locked. Now lock the higher path */ - - lower = lower_path (&local->transaction.parent_loc, - local->transaction.basename, - &local->transaction.new_parent_loc, - local->transaction.new_basename); - - higher = (lower == &local->transaction.parent_loc ? - &local->transaction.new_parent_loc : - &local->transaction.parent_loc); - - higher_name = (higher == &local->transaction.parent_loc ? 
- local->transaction.basename : - local->transaction.new_basename); - - afr_trace_entrylk_in (frame, AFR_ENTRYLK_TRANSACTION, - AFR_LOCK_OP, higher_name, child_index); - - - STACK_WIND_COOKIE (frame, afr_lock_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->entrylk, - this->name, higher, higher_name, - ENTRYLK_LOCK, ENTRYLK_WRLCK); - -out: - return 0; -} - -static int32_t afr_blocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_trace_entrylk_out (frame, AFR_ENTRYLK_TRANSACTION, + AFR_TRACE_ENTRYLK_OUT (frame, this, AFR_ENTRYLK_TRANSACTION, AFR_LOCK_OP, NULL, op_ret, op_errno, (long)cookie); - afr_lock_cbk (frame, cookie, this, op_ret, op_errno); + afr_lock_cbk (frame, cookie, this, op_ret, op_errno, xdata); return 0; } @@ -912,6 +976,7 @@ static int afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; afr_private_t *priv = NULL; @@ -922,18 +987,16 @@ afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this) switch (local->transaction.type) { case AFR_DATA_TRANSACTION: case AFR_METADATA_TRANSACTION: - memcpy (int_lock->inode_locked_nodes, - int_lock->locked_nodes, - priv->child_count); - int_lock->inodelk_lock_count = int_lock->lock_count; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + memcpy (inodelk->locked_nodes, int_lock->locked_nodes, + sizeof (*inodelk->locked_nodes) * priv->child_count); + inodelk->lock_count = int_lock->lock_count; break; case AFR_ENTRY_RENAME_TRANSACTION: case AFR_ENTRY_TRANSACTION: - memcpy (int_lock->entry_locked_nodes, - int_lock->locked_nodes, - priv->child_count); - int_lock->entrylk_lock_count = int_lock->lock_count; + /*entrylk_count is being used in both non-blocking and blocking + * modes */ break; } @@ -941,25 +1004,67 @@ afr_copy_locked_nodes 
(call_frame_t *frame, xlator_t *this) } +static inline gf_boolean_t +afr_is_entrylk (afr_internal_lock_t *int_lock, + afr_transaction_type trans_type) +{ + gf_boolean_t is_entrylk = _gf_false; + + if ((int_lock->transaction_lk_type == AFR_SELFHEAL_LK) && + int_lock->selfheal_lk_type == AFR_ENTRY_SELF_HEAL_LK) { + + is_entrylk = _gf_true; + + } else if ((int_lock->transaction_lk_type == AFR_TRANSACTION_LK) && + (trans_type == AFR_ENTRY_TRANSACTION || + trans_type == AFR_ENTRY_RENAME_TRANSACTION)) { + + is_entrylk = _gf_true; + + } else { + is_entrylk = _gf_false; + } + + return is_entrylk; +} + +static gf_boolean_t +_is_lock_wind_needed (afr_local_t *local, int child_index) +{ + if (!local->child_up[child_index]) + return _gf_false; + + return _gf_true; +} + int -afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index) +afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; afr_private_t *priv = NULL; - loc_t *lower = NULL; - const char *lower_name = NULL; struct gf_flock flock = {0,}; uint64_t ctx = 0; int ret = 0; + int child_index = 0; + int lockee_no = 0; + gf_boolean_t is_entrylk = _gf_false; - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; + local = frame->local; + int_lock = &local->internal_lock; + priv = this->private; + child_index = cookie % priv->child_count; + lockee_no = cookie / priv->child_count; + is_entrylk = afr_is_entrylk (int_lock, local->transaction.type); - flock.l_start = int_lock->lk_flock.l_start; - flock.l_len = int_lock->lk_flock.l_len; - flock.l_type = int_lock->lk_flock.l_type; + + if (!is_entrylk) { + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + flock.l_start = inodelk->flock.l_start; + flock.l_len = inodelk->flock.l_len; + flock.l_type = inodelk->flock.l_type; + } if (local->fd) { ret = fd_ctx_get (local->fd, this, &ctx); @@ -978,42 +1083,26 @@ 
afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index) return 0; } - - /* skip over children that or down - or don't have the fd open */ - - while ((child_index < priv->child_count) - && (!local->child_up[child_index] || - !local->fd_open_on[child_index])) - - child_index++; - } else { - /* skip over children that are down */ - while ((child_index < priv->child_count) - && !local->child_up[child_index]) - child_index++; } - if ((child_index == priv->child_count) && - int_lock->lock_count == 0) { - - gf_log (this->name, GF_LOG_INFO, - "unable to lock on even one child"); - - local->op_ret = -1; - int_lock->lock_op_ret = -1; + if (int_lock->lk_expected_count == int_lock->lk_attempted_count) { + if ((is_entrylk && int_lock->entrylk_lock_count == 0) || + (!is_entrylk && int_lock->lock_count == 0)) { + gf_log (this->name, GF_LOG_INFO, + "unable to lock on even one child"); - afr_copy_locked_nodes (frame, this); + local->op_ret = -1; + int_lock->lock_op_ret = -1; - afr_unlock(frame, this); + afr_copy_locked_nodes (frame, this); - return 0; + afr_unlock(frame, this); + return 0; + } } - if ((child_index == priv->child_count) - || (int_lock->lock_count == int_lock->lk_expected_count)) { - + if (int_lock->lk_expected_count == int_lock->lk_attempted_count) { /* we're done locking */ gf_log (this->name, GF_LOG_DEBUG, @@ -1026,12 +1115,18 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index) return 0; } + if (!_is_lock_wind_needed (local, child_index)) { + afr_lock_blocking (frame, this, cookie + 1); + return 0; + } + switch (local->transaction.type) { case AFR_DATA_TRANSACTION: case AFR_METADATA_TRANSACTION: if (local->fd) { - afr_trace_inodelk_in (frame, AFR_INODELK_TRANSACTION, + AFR_TRACE_INODELK_IN (frame, this, + AFR_INODELK_TRANSACTION, AFR_LOCK_OP, &flock, F_SETLKW, child_index); @@ -1039,11 +1134,12 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index) (void *) (long) child_index, 
priv->children[child_index], priv->children[child_index]->fops->finodelk, - this->name, local->fd, - F_SETLKW, &flock); + int_lock->domain, local->fd, + F_SETLKW, &flock, NULL); } else { - afr_trace_inodelk_in (frame, AFR_INODELK_TRANSACTION, + AFR_TRACE_INODELK_IN (frame, this, + AFR_INODELK_TRANSACTION, AFR_LOCK_OP, &flock, F_SETLKW, child_index); @@ -1051,63 +1147,44 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index) (void *) (long) child_index, priv->children[child_index], priv->children[child_index]->fops->inodelk, - this->name, &local->loc, - F_SETLKW, &flock); + int_lock->domain, &local->loc, + F_SETLKW, &flock, NULL); } break; case AFR_ENTRY_RENAME_TRANSACTION: - { - lower = lower_path (&local->transaction.parent_loc, - local->transaction.basename, - &local->transaction.new_parent_loc, - local->transaction.new_basename); - - lower_name = (lower == &local->transaction.parent_loc ? - local->transaction.basename : - local->transaction.new_basename); - - afr_trace_entrylk_in (frame, AFR_ENTRYLK_TRANSACTION, - AFR_LOCK_OP, lower_name, child_index); - - - STACK_WIND_COOKIE (frame, afr_lock_lower_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->entrylk, - this->name, lower, lower_name, - ENTRYLK_LOCK, ENTRYLK_WRLCK); - - break; - } - case AFR_ENTRY_TRANSACTION: + /*Accounting for child_index increments on 'down' + *and 'fd-less' children */ + if (local->fd) { - afr_trace_entrylk_in (frame, AFR_ENTRYLK_TRANSACTION, - AFR_LOCK_OP, local->transaction.basename, - child_index); + AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_TRANSACTION, + AFR_LOCK_OP, + int_lock->lockee[lockee_no].basename, + cookie); STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk, - (void *) (long) child_index, + (void *) (long) cookie, priv->children[child_index], priv->children[child_index]->fops->fentrylk, - this->name, local->fd, - local->transaction.basename, - ENTRYLK_LOCK, ENTRYLK_WRLCK); + int_lock->domain, 
local->fd, + int_lock->lockee[lockee_no].basename, + ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); } else { - afr_trace_entrylk_in (frame, AFR_ENTRYLK_TRANSACTION, + AFR_TRACE_ENTRYLK_IN (frame, this, + AFR_ENTRYLK_TRANSACTION, AFR_LOCK_OP, local->transaction.basename, child_index); STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk, - (void *) (long) child_index, + (void *) (long) cookie, priv->children[child_index], priv->children[child_index]->fops->entrylk, - this->name, - &local->transaction.parent_loc, - local->transaction.basename, - ENTRYLK_LOCK, ENTRYLK_WRLCK); + int_lock->domain, + &int_lock->lockee[lockee_no].loc, + int_lock->lockee[lockee_no].basename, + ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); } break; @@ -1135,11 +1212,12 @@ afr_blocking_lock (call_frame_t *frame, xlator_t *this) break; case AFR_ENTRY_RENAME_TRANSACTION: + case AFR_ENTRY_TRANSACTION: up_count = afr_up_children_count (local->child_up, priv->child_count); - int_lock->lk_expected_count = 2 * up_count; - //fallthrough - case AFR_ENTRY_TRANSACTION: + int_lock->lk_call_count = int_lock->lk_expected_count + = (int_lock->lockee_count * + up_count); initialize_entrylk_variables (frame, this); break; } @@ -1151,48 +1229,60 @@ afr_blocking_lock (call_frame_t *frame, xlator_t *this) static int32_t afr_nonblocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_internal_lock_t *int_lock = NULL; afr_local_t *local = NULL; int call_count = 0; int child_index = (long) cookie; + int copies = 0; + int index = 0; + int lockee_no = 0; + afr_private_t *priv = NULL; + + priv = this->private; + + copies = priv->child_count; + index = child_index % copies; + lockee_no = child_index / copies; local = frame->local; int_lock = &local->internal_lock; - afr_trace_entrylk_out (frame, AFR_ENTRYLK_TRANSACTION, - AFR_LOCK_OP, NULL, op_ret, + AFR_TRACE_ENTRYLK_OUT (frame, this, AFR_ENTRYLK_TRANSACTION, + AFR_LOCK_OP, + 
int_lock->lockee[lockee_no].basename, op_ret, op_errno, (long) cookie); - LOCK (&frame->lock); - { - call_count = --int_lock->lk_call_count; - } - UNLOCK (&frame->lock); - - if (op_ret < 0 ) { - if (op_errno == ENOSYS) { + LOCK (&frame->lock); + { + if (op_ret < 0 ) { + if (op_errno == ENOSYS) { /* return ENOTSUP */ - gf_log (this->name, GF_LOG_ERROR, - "subvolume does not support locking. " - "please load features/locks xlator on server"); - local->op_ret = op_ret; - int_lock->lock_op_ret = op_ret; + gf_log (this->name, GF_LOG_ERROR, + "subvolume does not support locking. " + "please load features/locks xlator on server"); + local->op_ret = op_ret; + int_lock->lock_op_ret = op_ret; + + int_lock->lock_op_errno = op_errno; + local->op_errno = op_errno; + } + } else if (op_ret == 0) { + int_lock->lockee[lockee_no].locked_nodes[index] |= \ + LOCKED_YES; + int_lock->lockee[lockee_no].locked_count++; + int_lock->entrylk_lock_count++; + } - local->child_up[child_index] = 0; - int_lock->lock_op_errno = op_errno; - local->op_errno = op_errno; - } - } else if (op_ret == 0) { - int_lock->entry_locked_nodes[child_index] |= LOCKED_YES; - int_lock->entrylk_lock_count++; + call_count = --int_lock->lk_call_count; } + UNLOCK (&frame->lock); if (call_count == 0) { gf_log (this->name, GF_LOG_TRACE, "Last locking reply received"); - /* all locks successfull. Proceed to call FOP */ + /* all locks successful. Proceed to call FOP */ if (int_lock->entrylk_lock_count == int_lock->lk_expected_count) { gf_log (this->name, GF_LOG_TRACE, @@ -1200,7 +1290,7 @@ afr_nonblocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int_lock->lock_op_ret = 0; int_lock->lock_cbk (frame, this); } - /* Not all locks were successfull. Unlock and try locking + /* Not all locks were successful. 
Unlock and try locking again, this time with serially blocking locks */ else { gf_log (this->name, GF_LOG_TRACE, @@ -1214,42 +1304,26 @@ afr_nonblocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, return 0; } -void -afr_mark_fd_open_on (afr_local_t *local, afr_fd_ctx_t *fd_ctx, - size_t child_count) -{ - int i = 0; - - GF_ASSERT (local->fd_open_on); - - memset (local->fd_open_on, 0, sizeof (*local->fd_open_on)*child_count); - for (i = 0; i < child_count; i++) - if (fd_ctx->opened_on[i] == AFR_FD_OPENED) - local->fd_open_on[i] = 1; -} - int afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_fd_ctx_t *fd_ctx = NULL; - const char *basename = NULL; - loc_t *loc = NULL; - int32_t call_count = 0; + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + int copies = 0; + int index = 0; + int lockee_no = 0; + int32_t call_count = 0; int i = 0; local = frame->local; int_lock = &local->internal_lock; priv = this->private; + copies = priv->child_count; initialize_entrylk_variables (frame, this); - basename = int_lock->lk_basename; - if (int_lock->lk_loc) - loc = int_lock->lk_loc; - if (local->fd) { fd_ctx = afr_fd_ctx_get (local->fd, this); if (!fd_ctx) { @@ -1262,11 +1336,11 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this) local->op_errno = EINVAL; int_lock->lock_op_errno = EINVAL; + afr_unlock (frame, this); return -1; } - afr_mark_fd_open_on (local, fd_ctx, priv->child_count); - call_count = internal_lock_count (frame, this); + call_count = int_lock->lockee_count * internal_lock_count (frame, this); int_lock->lk_call_count = call_count; int_lock->lk_expected_count = call_count; @@ -1279,42 +1353,52 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this) /* Send non-blocking entrylk calls only on up children and where the fd has been opened 
*/ - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] && local->fd_open_on[i]) { - afr_trace_entrylk_in (frame, AFR_ENTRYLK_NB_TRANSACTION, - AFR_LOCK_OP, basename, i); + for (i = 0; i < int_lock->lockee_count*priv->child_count; i++) { + index = i%copies; + lockee_no = i/copies; + if (local->child_up[index]) { + AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_NB_TRANSACTION, + AFR_LOCK_OP, + int_lock->lockee[lockee_no].basename, + i); STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fentrylk, + priv->children[index], + priv->children[index]->fops->fentrylk, this->name, local->fd, - basename, - ENTRYLK_LOCK_NB, ENTRYLK_WRLCK); + int_lock->lockee[lockee_no].basename, + ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, + NULL); + if (!--call_count) + break; } } } else { - GF_ASSERT (loc); - - call_count = internal_lock_count (frame, this); + call_count = int_lock->lockee_count * internal_lock_count (frame, this); int_lock->lk_call_count = call_count; int_lock->lk_expected_count = call_count; - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - afr_trace_entrylk_in (frame, AFR_ENTRYLK_NB_TRANSACTION, - AFR_LOCK_OP, basename, i); + for (i = 0; i < int_lock->lockee_count*priv->child_count; i++) { + index = i%copies; + lockee_no = i/copies; + if (local->child_up[index]) { + AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_NB_TRANSACTION, + AFR_LOCK_OP, + int_lock->lockee[lockee_no].basename, + i); STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->entrylk, - this->name, loc, basename, - ENTRYLK_LOCK_NB, ENTRYLK_WRLCK); + priv->children[index], + priv->children[index]->fops->entrylk, + this->name, &int_lock->lockee[lockee_no].loc, + int_lock->lockee[lockee_no].basename, + ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, + NULL); if (!--call_count) break; - } } } @@ -1324,77 +1408,75 @@ out: int32_t afr_nonblocking_inodelk_cbk 
(call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; int call_count = 0; int child_index = (long) cookie; afr_fd_ctx_t *fd_ctx = NULL; - afr_private_t *priv = NULL; - priv = this->private; local = frame->local; int_lock = &local->internal_lock; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); - afr_trace_inodelk_out (frame, AFR_INODELK_NB_TRANSACTION, + AFR_TRACE_INODELK_OUT (frame, this, AFR_INODELK_NB_TRANSACTION, AFR_LOCK_OP, NULL, op_ret, op_errno, (long) cookie); + if (local->fd) + fd_ctx = afr_fd_ctx_get (local->fd, this); + LOCK (&frame->lock); { + if (op_ret < 0) { + if (op_errno == ENOSYS) { + /* return ENOTSUP */ + gf_log (this->name, GF_LOG_ERROR, + "subvolume does not support locking. " + "please load features/locks xlator on " + "server"); + local->op_ret = op_ret; + int_lock->lock_op_ret = op_ret; + int_lock->lock_op_errno = op_errno; + local->op_errno = op_errno; + } + if (local->transaction.eager_lock) + local->transaction.eager_lock[child_index] = 0; + } else { + inodelk->locked_nodes[child_index] |= LOCKED_YES; + inodelk->lock_count++; + + if (local->transaction.eager_lock && + local->transaction.eager_lock[child_index] && + local->fd) { + /* piggybacked */ + if (op_ret == 1) { + /* piggybacked */ + } else if (op_ret == 0) { + /* lock acquired from server */ + fd_ctx->lock_acquired[child_index]++; + } + } + } + call_count = --int_lock->lk_call_count; } UNLOCK (&frame->lock); - if (op_ret < 0) { - if (op_errno == ENOSYS) { - /* return ENOTSUP */ - gf_log (this->name, GF_LOG_ERROR, - "subvolume does not support locking. 
" - "please load features/locks xlator on server"); - local->op_ret = op_ret; - int_lock->lock_op_ret = op_ret; - local->child_up[child_index] = 0; - int_lock->lock_op_errno = op_errno; - local->op_errno = op_errno; - } - } else { - int_lock->inode_locked_nodes[child_index] - |= LOCKED_YES; - int_lock->inodelk_lock_count++; - - if (priv->eager_lock && local->fd) { - fd_ctx = afr_fd_ctx_get (local->fd, this); - local->transaction.eager_lock[child_index] = 1; - /* piggybacked */ - - if (op_ret == 1) { - /* piggybacked */ - } else if (op_ret == 0) { - /* lock acquired from server */ - LOCK (&local->fd->lock); - { - fd_ctx->lock_acquired[child_index]++; - } - UNLOCK (&local->fd->lock); - } - } - } - if (call_count == 0) { gf_log (this->name, GF_LOG_TRACE, "Last inode locking reply received"); - /* all locks successfull. Proceed to call FOP */ - if (int_lock->inodelk_lock_count == - int_lock->lk_expected_count) { + /* all locks successful. Proceed to call FOP */ + if (inodelk->lock_count == int_lock->lk_expected_count) { gf_log (this->name, GF_LOG_TRACE, "All servers locked. Calling the cbk"); int_lock->lock_op_ret = 0; int_lock->lock_cbk (frame, this); } - /* Not all locks were successfull. Unlock and try locking + /* Not all locks were successful. 
Unlock and try locking again, this time with serially blocking locks */ else { gf_log (this->name, GF_LOG_TRACE, @@ -1412,30 +1494,29 @@ int afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; afr_private_t *priv = NULL; afr_fd_ctx_t *fd_ctx = NULL; - int32_t call_count = 0; - int i = 0; - int ret = 0; - struct gf_flock flock = {0,}; - struct gf_flock full_flock = {0,}; - struct gf_flock *flock_use = &flock; - int piggyback = 0; + int32_t call_count = 0; + int i = 0; + int ret = 0; + struct gf_flock flock = {0,}; + struct gf_flock full_flock = {0,}; + struct gf_flock *flock_use = NULL; + int piggyback = 0; local = frame->local; int_lock = &local->internal_lock; priv = this->private; - flock.l_start = int_lock->lk_flock.l_start; - flock.l_len = int_lock->lk_flock.l_len; - flock.l_type = int_lock->lk_flock.l_type; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); - gf_log (this->name, GF_LOG_DEBUG, "attempting data lock range %"PRIu64 - " %"PRIu64" by %"PRIu64, flock.l_start, flock.l_len, - frame->root->lk_owner); + flock.l_start = inodelk->flock.l_start; + flock.l_len = inodelk->flock.l_len; + flock.l_type = inodelk->flock.l_type; - full_flock.l_type = int_lock->lk_flock.l_type; + full_flock.l_type = inodelk->flock.l_type; initialize_inodelk_variables (frame, this); @@ -1451,11 +1532,11 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) local->op_errno = EINVAL; int_lock->lock_op_errno = EINVAL; + afr_unlock (frame, this); ret = -1; goto out; } - afr_mark_fd_open_on (local, fd_ctx, priv->child_count); call_count = internal_lock_count (frame, this); int_lock->lk_call_count = call_count; int_lock->lk_expected_count = call_count; @@ -1470,14 +1551,18 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) /* Send non-blocking inodelk calls only on up children and where the fd has been opened */ for (i = 0; i < priv->child_count; 
i++) { - if (!local->child_up[i] || !local->fd_open_on[i]) + if (!local->child_up[i]) continue; - if (!priv->eager_lock) + flock_use = &flock; + if (!local->transaction.eager_lock_on) { goto wind; + } - flock_use = &full_flock; piggyback = 0; + local->transaction.eager_lock[i] = 1; + + afr_set_delayed_post_op (frame, this); LOCK (&local->fd->lock); { @@ -1491,21 +1576,23 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) if (piggyback) { /* (op_ret == 1) => indicate piggybacked lock */ afr_nonblocking_inodelk_cbk (frame, (void *) (long) i, - this, 1, 0); + this, 1, 0, NULL); if (!--call_count) break; continue; } + flock_use = &full_flock; wind: - afr_trace_inodelk_in (frame, AFR_INODELK_NB_TRANSACTION, + AFR_TRACE_INODELK_IN (frame, this, + AFR_INODELK_NB_TRANSACTION, AFR_LOCK_OP, flock_use, F_SETLK, i); STACK_WIND_COOKIE (frame, afr_nonblocking_inodelk_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->finodelk, - this->name, local->fd, - F_SETLK, flock_use); + int_lock->domain, local->fd, + F_SETLK, flock_use, NULL); if (!--call_count) break; @@ -1518,15 +1605,16 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) for (i = 0; i < priv->child_count; i++) { if (!local->child_up[i]) continue; - afr_trace_inodelk_in (frame, AFR_INODELK_NB_TRANSACTION, + AFR_TRACE_INODELK_IN (frame, this, + AFR_INODELK_NB_TRANSACTION, AFR_LOCK_OP, &flock, F_SETLK, i); STACK_WIND_COOKIE (frame, afr_nonblocking_inodelk_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->inodelk, - this->name, &local->loc, - F_SETLK, &flock); + int_lock->domain, &local->loc, + F_SETLK, &flock, NULL); if (!--call_count) break; @@ -1536,200 +1624,6 @@ out: return ret; } -static int -__is_lower_locked (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int count = 0; - int i = 0; - - local = frame->local; - int_lock = &local->internal_lock; - priv = 
this->private; - - for (i = 0; i < priv->child_count; i++) { - if (int_lock->lower_locked_nodes[i] & LOCKED_LOWER) - count++; - } - - return count; - -} - -static int -__is_higher_locked (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int count = 0; - int i = 0; - - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; - - for (i = 0; i < priv->child_count; i++) { - if (int_lock->locked_nodes[i] & LOCKED_YES) - count++; - } - - return count; - -} - -static int -afr_unlock_lower_entrylk (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - const char *basename = NULL; - loc_t *loc = NULL; - int call_count = 0; - int i = -1; - - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; - - basename = int_lock->lk_basename; - if (int_lock->lk_loc) - loc = int_lock->lk_loc; - - call_count = __is_lower_locked (frame, this); - int_lock->lk_call_count = call_count; - - if (!call_count){ - gf_log (this->name, GF_LOG_TRACE, - "No internal locks unlocked"); - int_lock->lock_cbk (frame, this); - goto out; - } - - for (i = 0; i < priv->child_count; i++) { - if (int_lock->lower_locked_nodes[i] & LOCKED_LOWER) { - afr_trace_entrylk_in (frame, AFR_ENTRYLK_NB_TRANSACTION, - AFR_UNLOCK_OP, basename, i); - - STACK_WIND_COOKIE (frame, afr_unlock_entrylk_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->entrylk, - this->name, - loc, basename, - ENTRYLK_UNLOCK, ENTRYLK_WRLCK); - - if (!--call_count) - break; - - } - } - -out: - return 0; - -} - - -static int -afr_post_unlock_higher_cbk (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - - local = frame->local; - - local->transaction.done (frame, this); - return 0; -} - -static int -afr_post_unlock_lower_cbk (call_frame_t *frame, xlator_t *this) -{ - 
afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - loc_t *lower = NULL; - loc_t *higher = NULL; - const char *higher_name = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - - lower = lower_path (&local->transaction.parent_loc, - local->transaction.basename, - &local->transaction.new_parent_loc, - local->transaction.new_basename); - - higher = (lower == &local->transaction.parent_loc ? - &local->transaction.new_parent_loc : - &local->transaction.parent_loc); - - higher_name = (higher == &local->transaction.parent_loc ? - local->transaction.basename : - local->transaction.new_basename); - - if (__is_higher_locked (frame, this)) { - gf_log (this->name, GF_LOG_DEBUG, - "unlocking higher"); - int_lock->lk_basename = higher_name; - int_lock->lk_loc = higher; - int_lock->lock_cbk = afr_post_unlock_higher_cbk; - - afr_unlock_entrylk (frame, this); - } else - local->transaction.done (frame, this); - - return 0; -} - -static int -afr_rename_unlock (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - loc_t *lower = NULL; - const char *lower_name = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - - lower = lower_path (&local->transaction.parent_loc, - local->transaction.basename, - &local->transaction.new_parent_loc, - local->transaction.new_basename); - - lower_name = (lower == &local->transaction.parent_loc ? 
- local->transaction.basename : - local->transaction.new_basename); - - if (__is_lower_locked (frame, this)) { - gf_log (this->name, GF_LOG_DEBUG, - "unlocking lower"); - int_lock->lk_basename = lower_name; - int_lock->lk_loc = lower; - int_lock->lock_cbk = afr_post_unlock_lower_cbk; - - afr_unlock_lower_entrylk (frame, this); - } else - afr_post_unlock_lower_cbk (frame, this); - - return 0; -} - -static int -afr_rename_transaction (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - - local = frame->local; - - return (local->transaction.type == - AFR_ENTRY_RENAME_TRANSACTION); - -} - int32_t afr_unlock (call_frame_t *frame, xlator_t *this) { @@ -1741,10 +1635,8 @@ afr_unlock (call_frame_t *frame, xlator_t *this) if (is_afr_lock_transaction (local)) afr_unlock_inodelk (frame, this); else - if (!afr_rename_transaction (frame, this)) - afr_unlock_entrylk (frame, this); - else - afr_rename_unlock (frame, this); + afr_unlock_entrylk (frame, this); + } else { if (is_afr_lock_selfheal (local)) afr_unlock_inodelk (frame, this); @@ -1913,10 +1805,12 @@ out: int32_t afr_get_locks_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct gf_flock *lock); + int32_t op_ret, int32_t op_errno, struct gf_flock *lock, + dict_t *xdata); int32_t afr_recover_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct gf_flock *lock) + int32_t op_ret, int32_t op_errno, struct gf_flock *lock, + dict_t *xdata) { afr_local_t *local = NULL; afr_private_t *priv = NULL; @@ -1940,7 +1834,7 @@ afr_recover_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, (void *) (long) source_child, priv->children[source_child], priv->children[source_child]->fops->lk, - local->fd, F_GETLK_FD, &flock); + local->fd, F_GETLK_FD, &flock, NULL); return 0; @@ -1968,7 +1862,7 @@ afr_recover_lock (call_frame_t *frame, xlator_t *this, (void *) (long) lock_recovery_child, 
priv->children[lock_recovery_child], priv->children[lock_recovery_child]->fops->lk, - local->fd, F_SETLK, flock); + local->fd, F_SETLK, flock, NULL); return 0; } @@ -1986,7 +1880,8 @@ is_afr_lock_eol (struct gf_flock *lock) int32_t afr_get_locks_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct gf_flock *lock) + int32_t op_ret, int32_t op_errno, struct gf_flock *lock, + dict_t *xdata) { if (op_ret) { gf_log (this->name, GF_LOG_INFO, @@ -2046,7 +1941,7 @@ afr_lock_recovery (call_frame_t *frame, xlator_t *this) (void *) (long) source_child, priv->children[source_child], priv->children[source_child]->fops->lk, - local->fd, F_GETLK_FD, &flock); + local->fd, F_GETLK_FD, &flock, NULL); out: return ret; @@ -2074,7 +1969,8 @@ out: int32_t afr_lock_recovery_preopen_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd) + int32_t op_ret, int32_t op_errno, fd_t *fd, + dict_t *xdata) { int32_t child_index = (long )cookie; int ret = 0; @@ -2126,7 +2022,12 @@ afr_lock_recovery_preopen (call_frame_t *frame, xlator_t *this) GF_ASSERT (local && local->fd); ret = fd_ctx_get (local->fd, this, &tmp); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to get the context of fd", + uuid_utoa (local->fd->inode->gfid)); fdctx = (afr_fd_ctx_t *) (long) tmp; + /* TODO: instead we should return from the function */ GF_ASSERT (fdctx); child_index = local->lock_recovery_child; @@ -2141,8 +2042,7 @@ afr_lock_recovery_preopen (call_frame_t *frame, xlator_t *this) (void *)(long) child_index, priv->children[child_index], priv->children[child_index]->fops->open, - &loc, fdctx->flags, local->fd, - fdctx->wbflags); + &loc, fdctx->flags, local->fd, NULL); return 0; } @@ -2170,13 +2070,14 @@ out: int afr_attempt_lock_recovery (xlator_t *this, int32_t child_index) { - call_frame_t *frame = NULL; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_locked_fd_t *locked_fd = NULL; + 
call_frame_t *frame = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_locked_fd_t *locked_fd = NULL; afr_locked_fd_t *tmp = NULL; - int ret = 0; + int ret = -1; struct list_head locks_list = {0,}; + int32_t op_errno = 0; priv = this->private; @@ -2190,15 +2091,10 @@ afr_attempt_lock_recovery (xlator_t *this, int32_t child_index) goto out; } - local = GF_CALLOC (1, sizeof (*local), - gf_afr_mt_afr_local_t); - if (!local) { - ret = -1; - goto out; - } - - AFR_LOCAL_INIT (local, priv); - if (!local) { + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) { ret = -1; goto out; } @@ -2236,5 +2132,43 @@ afr_attempt_lock_recovery (xlator_t *this, int32_t child_index) } out: + if ((ret < 0) && frame) + AFR_STACK_DESTROY (frame); + return ret; +} + +int +afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom, + unsigned int child_count) +{ + afr_local_t *dst_local = NULL; + afr_local_t *src_local = NULL; + afr_internal_lock_t *dst_lock = NULL; + afr_internal_lock_t *src_lock = NULL; + afr_inodelk_t *dst_inodelk = NULL; + afr_inodelk_t *src_inodelk = NULL; + int ret = -1; + + src_local = src->local; + src_lock = &src_local->internal_lock; + src_inodelk = afr_get_inodelk (src_lock, dom); + dst_local = dst->local; + dst_lock = &dst_local->internal_lock; + dst_inodelk = afr_get_inodelk (dst_lock, dom); + if (!dst_inodelk || !src_inodelk) + goto out; + if (src_inodelk->locked_nodes) { + memcpy (dst_inodelk->locked_nodes, src_inodelk->locked_nodes, + sizeof (*dst_inodelk->locked_nodes) * child_count); + memset (src_inodelk->locked_nodes, 0, + sizeof (*src_inodelk->locked_nodes) * child_count); + } + + dst_lock->transaction_lk_type = src_lock->transaction_lk_type; + dst_lock->selfheal_lk_type = src_lock->selfheal_lk_type; + dst_inodelk->lock_count = src_inodelk->lock_count; + src_inodelk->lock_count = 0; + ret = 0; +out: return ret; } diff --git 
a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h index ebe189c35..73594f265 100644 --- a/xlators/cluster/afr/src/afr-mem-types.h +++ b/xlators/cluster/afr/src/afr-mem-types.h @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ @@ -26,7 +17,6 @@ enum gf_afr_mem_types_ { gf_afr_mt_iovec = gf_common_mt_end + 1, gf_afr_mt_afr_fd_ctx_t, - gf_afr_mt_afr_local_t, gf_afr_mt_afr_private_t, gf_afr_mt_int32_t, gf_afr_mt_char, @@ -44,8 +34,17 @@ enum gf_afr_mem_types_ { gf_afr_mt_locked_fd, gf_afr_mt_inode_ctx_t, gf_afr_fd_paused_call_t, - gf_afr_mt_afr_crawl_data_t, - gf_afr_mt_afr_brick_pos_t, + gf_afr_mt_crawl_data_t, + gf_afr_mt_brick_pos_t, + gf_afr_mt_shd_bool_t, + gf_afr_mt_shd_timer_t, + gf_afr_mt_shd_event_t, + gf_afr_mt_time_t, + gf_afr_mt_pos_data_t, + gf_afr_mt_reply_t, + gf_afr_mt_stats_t, + gf_afr_mt_shd_crawl_event_t, + gf_afr_mt_uint64_t, gf_afr_mt_end }; #endif diff --git a/xlators/cluster/afr/src/afr-open.c b/xlators/cluster/afr/src/afr-open.c index 4b328e184..643a5d692 100644 --- a/xlators/cluster/afr/src/afr-open.c +++ b/xlators/cluster/afr/src/afr-open.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ #include <libgen.h> @@ -92,14 +83,12 @@ afr_perform_data_self_heal (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; inode_t *inode = NULL; int st_child = -1; char reason[64] = {0}; local = frame->local; sh = &local->self_heal; - priv = this->private; inode = local->fd->inode; if (!IA_ISREG (inode->ia_type)) @@ -125,7 +114,7 @@ out: int afr_open_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) + struct iatt *postbuf, dict_t *xdata) { afr_local_t * local = frame->local; afr_private_t *priv = NULL; @@ -134,7 +123,7 @@ afr_open_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (afr_open_only_data_self_heal (priv->data_self_heal)) afr_perform_data_self_heal (frame, this); AFR_STACK_UNWIND (open, frame, local->op_ret, local->op_errno, - local->fd); + local->fd, xdata); return 0; } @@ -142,11 +131,9 @@ afr_open_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int afr_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - fd_t *fd) + fd_t *fd, dict_t *xdata) { afr_local_t * local = NULL; - uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; int ret = 0; int call_count = -1; int child_index = (long) cookie; @@ -165,32 +152,13 @@ afr_open_cbk (call_frame_t *frame, void *cookie, local->op_ret = op_ret; local->success_count++; - ret = afr_fd_ctx_set (this, fd); - - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "could not set fd ctx for fd=%p", fd); - - local->op_ret = -1; - local->op_errno = -ret; - goto unlock; - } - - ret = fd_ctx_get (fd, this, &ctx); - - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "could not get fd ctx for fd=%p", fd); - local->op_ret = -1; + ret = afr_child_fd_ctx_set (this, fd, child_index, + local->cont.open.flags); + if (ret) { + local->op_ret = -1; local->op_errno = -ret; goto unlock; } - - 
fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - fd_ctx->opened_on[child_index] = AFR_FD_OPENED; - fd_ctx->flags = local->cont.open.flags; - fd_ctx->wbflags = local->cont.open.wbflags; } } unlock: @@ -203,12 +171,12 @@ unlock: && (local->op_ret >= 0)) { STACK_WIND (frame, afr_open_ftruncate_cbk, this, this->fops->ftruncate, - fd, 0); + fd, 0, NULL); } else { if (afr_open_only_data_self_heal (priv->data_self_heal)) afr_perform_data_self_heal (frame, this); AFR_STACK_UNWIND (open, frame, local->op_ret, - local->op_errno, local->fd); + local->op_errno, local->fd, xdata); } } @@ -217,14 +185,13 @@ unlock: int afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - fd_t *fd, int32_t wbflags) + fd_t *fd, dict_t *xdata) { afr_private_t * priv = NULL; afr_local_t * local = NULL; int i = 0; int ret = -1; int32_t call_count = 0; - int32_t op_ret = -1; int32_t op_errno = 0; int32_t wind_flags = flags & (~O_TRUNC); //We can't let truncation to happen outside transaction. @@ -236,6 +203,10 @@ afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, priv = this->private; + if (flags & (O_CREAT|O_TRUNC)) { + QUORUM_CHECK(open,out); + } + if (afr_is_split_brain (this, loc->inode)) { /* self-heal failed */ gf_log (this->name, GF_LOG_WARNING, @@ -244,20 +215,17 @@ afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, goto out; } - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } - frame->local = local; call_count = local->call_count; loc_copy (&local->loc, loc); local->cont.open.flags = flags; - local->cont.open.wbflags = wbflags; local->fd = fd_ref (fd); @@ -266,86 +234,45 @@ afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, STACK_WIND_COOKIE (frame, afr_open_cbk, (void *) (long) i, 
priv->children[i], priv->children[i]->fops->open, - loc, wind_flags, fd, wbflags); + loc, wind_flags, fd, xdata); if (!--call_count) break; } } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (open, frame, op_ret, op_errno, fd); - } + if (ret < 0) + AFR_STACK_UNWIND (open, frame, -1, op_errno, fd, xdata); return 0; } -//NOTE: this function should be called with holding the lock on -//fd to which fd_ctx belongs -void -afr_get_resumable_calls (xlator_t *this, afr_fd_ctx_t *fd_ctx, - struct list_head *list) -{ - afr_fd_paused_call_t *paused_call = NULL; - afr_fd_paused_call_t *tmp = NULL; - afr_local_t *call_local = NULL; - afr_private_t *priv = NULL; - int i = 0; - gf_boolean_t call = _gf_false; - - priv = this->private; - list_for_each_entry_safe (paused_call, tmp, &fd_ctx->paused_calls, - call_list) { - call = _gf_true; - call_local = paused_call->frame->local; - for (i = 0; i < priv->child_count; i++) { - if (call_local->child_up[i] && - (fd_ctx->opened_on[i] == AFR_FD_OPENING)) - call = _gf_false; - } - - if (call) { - list_del_init (&paused_call->call_list); - list_add (&paused_call->call_list, list); - } - } -} - -void -afr_resume_calls (xlator_t *this, struct list_head *list) -{ - afr_fd_paused_call_t *paused_call = NULL; - afr_fd_paused_call_t *tmp = NULL; - afr_local_t *call_local = NULL; - - list_for_each_entry_safe (paused_call, tmp, list, call_list) { - list_del_init (&paused_call->call_list); - call_local = paused_call->frame->local; - call_local->fop_call_continue (paused_call->frame, this); - GF_FREE (paused_call); - } -} - int afr_openfd_fix_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd) + int32_t op_ret, int32_t op_errno, fd_t *fd, + dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_fd_ctx_t *fd_ctx = NULL; - int call_count = 0; - int child_index = (long) cookie; - struct list_head paused_calls = {0}; - gf_boolean_t fop_paused = 
_gf_false; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + int call_count = 0; + int child_index = (long) cookie; priv = this->private; local = frame->local; - call_count = afr_frame_return (frame); + if (op_ret >= 0) { + gf_log (this->name, GF_LOG_DEBUG, "fd for %s opened " + "successfully on subvolume %s", local->loc.path, + priv->children[child_index]->name); + } else { + gf_log (this->name, GF_LOG_ERROR, "Failed to open %s " + "on subvolume %s", local->loc.path, + priv->children[child_index]->name); + } - //Note: No frame locking needed for this block of code fd_ctx = afr_fd_ctx_get (local->fd, this); if (!fd_ctx) { gf_log (this->name, GF_LOG_WARNING, @@ -353,102 +280,103 @@ afr_openfd_fix_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto out; } - fop_paused = local->fop_paused; LOCK (&local->fd->lock); { if (op_ret >= 0) { fd_ctx->opened_on[child_index] = AFR_FD_OPENED; - gf_log (this->name, GF_LOG_INFO, "fd for %s opened " - "successfully on subvolume %s", local->loc.path, - priv->children[child_index]->name); } else { - //Change open status from OPENING to NOT OPENED. 
fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED; } - if (call_count == 0) { - INIT_LIST_HEAD (&paused_calls); - afr_get_resumable_calls (this, fd_ctx, &paused_calls); - } } UNLOCK (&local->fd->lock); out: - if (call_count == 0) { - afr_resume_calls (this, &paused_calls); - //If the fop is paused then resume_calls will continue the fop - if (fop_paused) - goto done; - - if (local->fop_call_continue) - local->fop_call_continue (frame, this); - else - AFR_STACK_DESTROY (frame); - } + call_count = afr_frame_return (frame); + if (call_count == 0) + AFR_STACK_DESTROY (frame); -done: return 0; } -int -afr_fix_open (call_frame_t *frame, xlator_t *this, afr_fd_ctx_t *fd_ctx, - int need_open_count, int *need_open) +void +afr_fix_open (xlator_t *this, fd_t *fd, size_t need_open_count, int *need_open) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int i = 0; - call_frame_t *open_frame = NULL; - afr_local_t *open_local = NULL; - int ret = -1; - int32_t op_errno = 0; - - GF_ASSERT (fd_ctx); - GF_ASSERT (need_open_count > 0); - GF_ASSERT (need_open); + afr_private_t *priv = NULL; + int i = 0; + call_frame_t *frame = NULL; + afr_local_t *local = NULL; + int ret = -1; + int32_t op_errno = 0; + afr_fd_ctx_t *fd_ctx = NULL; - local = frame->local; priv = this->private; - if (!local->fop_call_continue) { - open_frame = copy_frame (frame); - if (!open_frame) { - ret = -ENOMEM; - goto out; - } - ALLOC_OR_GOTO (open_local, afr_local_t, out); - open_frame->local = open_local; - ret = AFR_LOCAL_INIT (open_local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } - loc_copy (&open_local->loc, &local->loc); - open_local->fd = fd_ref (local->fd); - } else { - ret = 0; - open_frame = frame; - open_local = local; + + if (!afr_is_fd_fixable (fd) || !need_open || !need_open_count) + goto out; + + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) { + ret = -1; + goto out; } - open_local->call_count = need_open_count; + frame = create_frame (this, this->ctx->pool); + if 
(!frame) { + ret = -1; + goto out; + } + + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) + goto out; + + local->loc.inode = inode_ref (fd->inode); + ret = loc_path (&local->loc, NULL); + if (ret < 0) + goto out; + + local->fd = fd_ref (fd); + local->call_count = need_open_count; - gf_log (this->name, GF_LOG_DEBUG, "need open count: %d", + gf_log (this->name, GF_LOG_DEBUG, "need open count: %zd", need_open_count); for (i = 0; i < priv->child_count; i++) { - if (need_open[i]) { + if (!need_open[i]) + continue; + + if (IA_IFDIR == fd->inode->ia_type) { gf_log (this->name, GF_LOG_DEBUG, - "opening fd for %s on subvolume %s", + "opening fd for dir %s on subvolume %s", local->loc.path, priv->children[i]->name); - STACK_WIND_COOKIE (open_frame, afr_openfd_fix_open_cbk, + STACK_WIND_COOKIE (frame, afr_openfd_fix_open_cbk, + (void*) (long) i, + priv->children[i], + priv->children[i]->fops->opendir, + &local->loc, local->fd, + NULL); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "opening fd for file %s on subvolume %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_openfd_fix_open_cbk, (void *)(long) i, priv->children[i], priv->children[i]->fops->open, - &open_local->loc, fd_ctx->flags, - open_local->fd, fd_ctx->wbflags); - + &local->loc, + fd_ctx->flags & (~O_TRUNC), + local->fd, NULL); } + } + op_errno = 0; + ret = 0; out: - if (ret && open_frame) - AFR_STACK_DESTROY (open_frame); - return ret; + if (op_errno) + ret = -1; //For handling ALLOC_OR_GOTO + if (ret && frame) + AFR_STACK_DESTROY (frame); } diff --git a/xlators/cluster/afr/src/afr-self-heal-algorithm.c b/xlators/cluster/afr/src/afr-self-heal-algorithm.c index 48399b5e9..83846f152 100644 --- a/xlators/cluster/afr/src/afr-self-heal-algorithm.c +++ b/xlators/cluster/afr/src/afr-self-heal-algorithm.c @@ -1,23 +1,15 @@ /* - Copyright (c) 2010-2011 Gluster, Inc. 
<http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ +#include <openssl/md5.h> #include "glusterfs.h" #include "afr.h" #include "xlator.h" @@ -33,7 +25,6 @@ #include "compat-errno.h" #include "compat.h" #include "byte-order.h" -#include "md5.h" #include "afr-transaction.h" #include "afr-self-heal.h" @@ -72,8 +63,7 @@ sh_private_cleanup (call_frame_t *frame, xlator_t *this) sh = &local->self_heal; sh_priv = sh->private; - if (sh_priv) - GF_FREE (sh_priv); + GF_FREE (sh_priv); } static int @@ -104,14 +94,16 @@ sh_loop_driver_done (call_frame_t *sh_frame, xlator_t *this, local = sh_frame->local; sh = &local->self_heal; sh_priv = sh->private; - total_blocks = sh_priv->total_blocks; - diff_blocks = sh_priv->diff_blocks; + if (sh_priv) { + total_blocks = sh_priv->total_blocks; + diff_blocks = sh_priv->diff_blocks; + } sh_private_cleanup (sh_frame, this); - if (sh->op_failed) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { GF_ASSERT (!last_loop_frame); //loop_finish should have happened and the old_loop should be NULL - gf_log (this->name, GF_LOG_INFO, + gf_log (this->name, GF_LOG_DEBUG, "self-heal aborting on %s", local->loc.path); @@ -119,20 +111,17 @@ sh_loop_driver_done (call_frame_t *sh_frame, xlator_t *this, } else { GF_ASSERT (last_loop_frame); if (diff_blocks == total_blocks) { - gf_log (this->name, GF_LOG_INFO, "full self-heal " + gf_log (this->name, GF_LOG_DEBUG, "full self-heal " "completed on %s",local->loc.path); } else { - gf_log (this->name, GF_LOG_INFO, + gf_log (this->name, GF_LOG_DEBUG, "diff self-heal on %s: completed. 
" "(%d blocks of %d were different (%.2f%%))", local->loc.path, diff_blocks, total_blocks, ((diff_blocks * 1.0)/total_blocks) * 100); } - if (sh_frame == last_loop_frame) - sh->old_loop_frame = NULL; - else - sh->old_loop_frame = last_loop_frame; + sh->old_loop_frame = last_loop_frame; local->self_heal.algo_completion_cbk (sh_frame, this); } @@ -153,17 +142,10 @@ sh_loop_finish (call_frame_t *loop_frame, xlator_t *this) loop_sh = &loop_local->self_heal; } - if (loop_sh && loop_sh->loop_completion_cbk) { - if (loop_sh->data_lock_held) { - afr_sh_data_unlock (loop_frame, this, - loop_sh->loop_completion_cbk); - } else { - loop_sh->loop_completion_cbk (loop_frame, this); - } + if (loop_sh && loop_sh->data_lock_held) { + afr_sh_data_unlock (loop_frame, this, this->name, + sh_destroy_frame); } else { - //default loop_completion_cbk destroys the loop_frame - if (loop_sh && !loop_sh->loop_completion_cbk) - GF_ASSERT (!loop_sh->data_lock_held); sh_destroy_frame (loop_frame, this); } out: @@ -182,7 +164,7 @@ sh_loop_lock_success (call_frame_t *loop_frame, xlator_t *this) sh_loop_finish (loop_sh->old_loop_frame, this); loop_sh->old_loop_frame = NULL; - gf_log (this->name, GF_LOG_DEBUG, "Aquired lock for range %"PRIu64 + gf_log (this->name, GF_LOG_DEBUG, "Acquired lock for range %"PRIu64 " %"PRIu64, loop_sh->offset, loop_sh->block_size); loop_sh->data_lock_held = _gf_true; loop_sh->sh_data_algo_start (loop_frame, this); @@ -209,8 +191,8 @@ sh_loop_lock_failure (call_frame_t *loop_frame, xlator_t *this) } static int -sh_loop_start (call_frame_t *sh_frame, xlator_t *this, off_t offset, - call_frame_t *old_loop_frame) +sh_loop_frame_create (call_frame_t *sh_frame, xlator_t *this, + call_frame_t *old_loop_frame, call_frame_t **loop_frame) { call_frame_t *new_loop_frame = NULL; afr_local_t *local = NULL; @@ -220,7 +202,9 @@ sh_loop_start (call_frame_t *sh_frame, xlator_t *this, off_t offset, afr_private_t *priv = NULL; GF_ASSERT (sh_frame); + GF_ASSERT (loop_frame); + *loop_frame 
= NULL; local = sh_frame->local; sh = &local->self_heal; priv = this->private; @@ -228,8 +212,9 @@ sh_loop_start (call_frame_t *sh_frame, xlator_t *this, off_t offset, new_loop_frame = copy_frame (sh_frame); if (!new_loop_frame) goto out; - //We want the frame to have same lk_oner as sh_frame - new_loop_local = afr_local_copy (local, this); + //We want the frame to have same lk_owner as sh_frame + //so that locks translator allows conflicting locks + new_loop_local = afr_self_heal_local_init (local, this); if (!new_loop_local) goto out; new_loop_frame->local = new_loop_local; @@ -244,29 +229,54 @@ sh_loop_start (call_frame_t *sh_frame, xlator_t *this, off_t offset, gf_afr_mt_char); if (!new_loop_sh->write_needed) goto out; - new_loop_sh->checksum = GF_CALLOC (priv->child_count, MD5_DIGEST_LEN, + new_loop_sh->checksum = GF_CALLOC (priv->child_count, MD5_DIGEST_LENGTH, gf_afr_mt_uint8_t); if (!new_loop_sh->checksum) goto out; - new_loop_sh->offset = offset; - new_loop_sh->block_size = sh->block_size; - new_loop_sh->old_loop_frame = old_loop_frame; - new_loop_sh->sh_frame = sh_frame; new_loop_sh->inode = inode_ref (sh->inode); new_loop_sh->sh_data_algo_start = sh->sh_data_algo_start; new_loop_sh->source = sh->source; new_loop_sh->active_sinks = sh->active_sinks; new_loop_sh->healing_fd = fd_ref (sh->healing_fd); new_loop_sh->file_has_holes = sh->file_has_holes; - new_loop_sh->loop_completion_cbk = sh_destroy_frame; + new_loop_sh->old_loop_frame = old_loop_frame; + new_loop_sh->sh_frame = sh_frame; + *loop_frame = new_loop_frame; + return 0; +out: + sh_destroy_frame (new_loop_frame, this); + return -ENOMEM; +} + +static int +sh_loop_start (call_frame_t *sh_frame, xlator_t *this, off_t offset, + call_frame_t *old_loop_frame) +{ + call_frame_t *new_loop_frame = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_local_t *new_loop_local = NULL; + afr_self_heal_t *new_loop_sh = NULL; + int ret = 0; + + GF_ASSERT (sh_frame); + + local = sh_frame->local; + 
sh = &local->self_heal; + + ret = sh_loop_frame_create (sh_frame, this, old_loop_frame, + &new_loop_frame); + if (ret) + goto out; + new_loop_local = new_loop_frame->local; + new_loop_sh = &new_loop_local->self_heal; + new_loop_sh->offset = offset; + new_loop_sh->block_size = sh->block_size; afr_sh_data_lock (new_loop_frame, this, offset, new_loop_sh->block_size, - sh_loop_lock_success, sh_loop_lock_failure); + _gf_true, this->name, sh_loop_lock_success, sh_loop_lock_failure); return 0; out: - sh->op_failed = 1; - if (new_loop_frame) { - new_loop_frame->local = new_loop_local; - } + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); if (old_loop_frame) sh_loop_finish (old_loop_frame, this); sh_loop_return (sh_frame, this, new_loop_frame, -1, ENOMEM); @@ -277,7 +287,6 @@ static int sh_loop_driver (call_frame_t *sh_frame, xlator_t *this, gf_boolean_t is_first_call, call_frame_t *old_loop_frame) { - afr_private_t * priv = NULL; afr_local_t * local = NULL; afr_self_heal_t * sh = NULL; afr_sh_algo_private_t *sh_priv = NULL; @@ -285,6 +294,7 @@ sh_loop_driver (call_frame_t *sh_frame, xlator_t *this, blksize_t block_size = 0; int loop = 0; off_t offset = 0; + afr_private_t *priv = NULL; priv = this->private; local = sh_frame->local; @@ -293,19 +303,20 @@ sh_loop_driver (call_frame_t *sh_frame, xlator_t *this, LOCK (&sh_priv->lock); { - if (_gf_false == is_first_call) + if (!is_first_call) sh_priv->loops_running--; offset = sh_priv->offset; block_size = sh->block_size; - while ((!sh->eof_reached) && (0 == sh->op_failed) && - (sh_priv->loops_running < priv->data_self_heal_window_size) + while ((!sh->eof_reached) && + (!is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) && + (sh_priv->loops_running < priv->data_self_heal_window_size) && (sh_priv->offset < sh->file_size)) { loop++; sh_priv->offset += block_size; sh_priv->loops_running++; - if (_gf_false == is_first_call) + if (!is_first_call) break; } if (0 == sh_priv->loops_running) { @@ -317,7 +328,8 @@ sh_loop_driver 
(call_frame_t *sh_frame, xlator_t *this, if (0 == loop) { //loop finish does unlock, but the erasing of the pending //xattrs needs to happen before that so do not finish the loop - if (is_driver_done && !sh->op_failed) + if (is_driver_done && + !is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) goto driver_done; if (old_loop_frame) { sh_loop_finish (old_loop_frame, this); @@ -328,7 +340,7 @@ sh_loop_driver (call_frame_t *sh_frame, xlator_t *this, //If we have more loops to form we should finish previous loop after //the next loop lock while (loop--) { - if (sh->op_failed) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { // op failed in other loop, stop spawning more loops if (old_loop_frame) { sh_loop_finish (old_loop_frame, this); @@ -374,7 +386,7 @@ sh_loop_return (call_frame_t *sh_frame, xlator_t *this, call_frame_t *loop_frame } if (op_ret == -1) { - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_set_error (sh, op_errno); if (loop_frame) { sh_loop_finish (loop_frame, this); @@ -390,7 +402,7 @@ sh_loop_return (call_frame_t *sh_frame, xlator_t *this, call_frame_t *loop_frame static int sh_loop_write_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *buf, - struct iatt *postbuf) + struct iatt *postbuf, dict_t *xdata) { afr_private_t * priv = NULL; afr_local_t * loop_local = NULL; @@ -422,13 +434,22 @@ sh_loop_write_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, priv->children[child_index]->name, strerror (op_errno)); - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_set_error (loop_sh, op_errno); + } else if (op_ret < loop_local->cont.writev.vector->iov_len) { + gf_log (this->name, GF_LOG_ERROR, + "incomplete write to %s on subvolume %s " + "(expected %lu, returned %d)", sh_local->loc.path, + priv->children[child_index]->name, + loop_local->cont.writev.vector->iov_len, op_ret); + afr_set_self_heal_status (sh, 
AFR_SELF_HEAL_FAILED); } call_count = afr_frame_return (loop_frame); if (call_count == 0) { + iobref_unref(loop_local->cont.writev.iobref); + sh_loop_return (sh_frame, this, loop_frame, loop_sh->op_ret, loop_sh->op_errno); } @@ -436,12 +457,41 @@ sh_loop_write_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, return 0; } +static void +sh_prune_writes_needed (call_frame_t *sh_frame, call_frame_t *loop_frame, + afr_private_t *priv) +{ + afr_local_t *sh_local = NULL; + afr_self_heal_t *sh = NULL; + afr_local_t *loop_local = NULL; + afr_self_heal_t *loop_sh = NULL; + int i = 0; + + sh_local = sh_frame->local; + sh = &sh_local->self_heal; + + if (!strcmp (sh->algo->name, "diff")) + return; + + loop_local = loop_frame->local; + loop_sh = &loop_local->self_heal; + + /* full self-heal guarantees there exists atleast 1 file with size 0 + * That means for other files we can preserve holes that come after + * its size before 'trim' + */ + for (i = 0; i < priv->child_count; i++) { + if (loop_sh->write_needed[i] && + ((loop_sh->offset + 1) > sh->buf[i].ia_size)) + loop_sh->write_needed[i] = 0; + } +} static int sh_loop_read_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iovec *vector, int32_t count, struct iatt *buf, - struct iobref *iobref) + struct iobref *iobref, dict_t *xdata) { afr_private_t * priv = NULL; afr_local_t * loop_local = NULL; @@ -466,7 +516,7 @@ sh_loop_read_cbk (call_frame_t *loop_frame, void *cookie, if (op_ret <= 0) { if (op_ret < 0) { - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); gf_log (this->name, GF_LOG_ERROR, "read failed on %d " "for %s reason :%s", sh->source, sh_local->loc.path, strerror (errno)); @@ -479,18 +529,26 @@ sh_loop_read_cbk (call_frame_t *loop_frame, void *cookie, goto out; } - if (loop_sh->file_has_holes && iov_0filled (vector, count) == 0) { - gf_log (this->name, GF_LOG_DEBUG, "0 filled block"); - sh_loop_return (sh_frame, this, loop_frame, 
- op_ret, op_errno); - goto out; - } + if (loop_sh->file_has_holes && iov_0filled (vector, count) == 0) + sh_prune_writes_needed (sh_frame, loop_frame, priv); call_count = sh_number_of_writes_needed (loop_sh->write_needed, priv->child_count); - GF_ASSERT (call_count > 0); + if (call_count == 0) { + sh_loop_return (sh_frame, this, loop_frame, 0, 0); + goto out; + } + loop_local->call_count = call_count; + /* + * We only really need the request size at the moment, but the buffer + * is required if we want to issue a retry in the event of a short write. + * Therefore, we duplicate the vector and ref the iobref here... + */ + loop_local->cont.writev.vector = iov_dup(vector, count); + loop_local->cont.writev.iobref = iobref_ref(iobref); + for (i = 0; i < priv->child_count; i++) { if (!loop_sh->write_needed[i]) continue; @@ -499,7 +557,7 @@ sh_loop_read_cbk (call_frame_t *loop_frame, void *cookie, priv->children[i], priv->children[i]->fops->writev, loop_sh->healing_fd, vector, count, - loop_sh->offset, iobref); + loop_sh->offset, 0, iobref, NULL); if (!--call_count) break; @@ -526,7 +584,7 @@ sh_loop_read (call_frame_t *loop_frame, xlator_t *this) priv->children[loop_sh->source], priv->children[loop_sh->source]->fops->readv, loop_sh->healing_fd, loop_sh->block_size, - loop_sh->offset); + loop_sh->offset, 0, NULL); return 0; } @@ -535,7 +593,8 @@ sh_loop_read (call_frame_t *loop_frame, xlator_t *this) static int sh_diff_checksum_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - uint32_t weak_checksum, uint8_t *strong_checksum) + uint32_t weak_checksum, uint8_t *strong_checksum, + dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *loop_local = NULL; @@ -567,10 +626,10 @@ sh_diff_checksum_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, "checksum on %s failed on subvolume %s (%s)", sh_local->loc.path, priv->children[child_index]->name, strerror (op_errno)); - sh->op_failed = 1; + afr_set_self_heal_status (sh, 
AFR_SELF_HEAL_FAILED); } else { - memcpy (loop_sh->checksum + child_index * MD5_DIGEST_LEN, - strong_checksum, MD5_DIGEST_LEN); + memcpy (loop_sh->checksum + child_index * MD5_DIGEST_LENGTH, + strong_checksum, MD5_DIGEST_LENGTH); } call_count = afr_frame_return (loop_frame); @@ -580,9 +639,9 @@ sh_diff_checksum_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, if (sh->sources[i] || !sh_local->child_up[i]) continue; - if (memcmp (loop_sh->checksum + (i * MD5_DIGEST_LEN), - loop_sh->checksum + (sh->source * MD5_DIGEST_LEN), - MD5_DIGEST_LEN)) { + if (memcmp (loop_sh->checksum + (i * MD5_DIGEST_LENGTH), + loop_sh->checksum + (sh->source * MD5_DIGEST_LENGTH), + MD5_DIGEST_LENGTH)) { /* Checksums differ, so this block must be written to this sink @@ -605,7 +664,8 @@ sh_diff_checksum_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, } UNLOCK (&sh_priv->lock); - if (write_needed && !sh->op_failed) { + if (write_needed && + !is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { sh_loop_read (loop_frame, this); } else { sh_loop_return (sh_frame, this, loop_frame, @@ -638,7 +698,7 @@ sh_diff_checksum (call_frame_t *loop_frame, xlator_t *this) priv->children[loop_sh->source], priv->children[loop_sh->source]->fops->rchecksum, loop_sh->healing_fd, - loop_sh->offset, loop_sh->block_size); + loop_sh->offset, loop_sh->block_size, NULL); for (i = 0; i < priv->child_count; i++) { if (loop_sh->sources[i] || !loop_local->child_up[i]) @@ -649,7 +709,7 @@ sh_diff_checksum (call_frame_t *loop_frame, xlator_t *this) priv->children[i], priv->children[i]->fops->rchecksum, loop_sh->healing_fd, - loop_sh->offset, loop_sh->block_size); + loop_sh->offset, loop_sh->block_size, NULL); if (!--call_count) break; @@ -679,38 +739,80 @@ sh_full_read_write_to_sinks (call_frame_t *loop_frame, xlator_t *this) return 0; } -static int -sh_do_nothing (call_frame_t *frame, xlator_t *this) +afr_sh_algo_private_t* +afr_sh_priv_init () { - return 0; -} - -int -afr_sh_start_loops (call_frame_t 
*sh_frame, xlator_t *this, - afr_sh_algo_fn sh_data_algo_start) -{ - afr_local_t *sh_local = NULL; - afr_self_heal_t *sh = NULL; afr_sh_algo_private_t *sh_priv = NULL; - sh_local = sh_frame->local; - sh = &sh_local->self_heal; - sh_priv = GF_CALLOC (1, sizeof (*sh_priv), gf_afr_mt_afr_private_t); if (!sh_priv) goto out; LOCK_INIT (&sh_priv->lock); +out: + return sh_priv; +} - sh->private = sh_priv; - sh->sh_data_algo_start = sh_data_algo_start; +int +afr_sh_transfer_lock (call_frame_t *dst, call_frame_t *src, char *dom, + unsigned int child_count) +{ + afr_local_t *dst_local = NULL; + afr_self_heal_t *dst_sh = NULL; + afr_local_t *src_local = NULL; + afr_self_heal_t *src_sh = NULL; + int ret = -1; + + dst_local = dst->local; + dst_sh = &dst_local->self_heal; + src_local = src->local; + src_sh = &src_local->self_heal; + GF_ASSERT (src_sh->data_lock_held); + GF_ASSERT (!dst_sh->data_lock_held); + ret = afr_lk_transfer_datalock (dst, src, dom, child_count); + if (ret) + return ret; + src_sh->data_lock_held = _gf_false; + dst_sh->data_lock_held = _gf_true; + return 0; +} - sh_local->call_count = 0; +int +afr_sh_start_loops (call_frame_t *sh_frame, xlator_t *this, + afr_sh_algo_fn sh_data_algo_start) +{ + call_frame_t *first_loop_frame = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int ret = 0; + afr_private_t *priv = NULL; + + local = sh_frame->local; + sh = &local->self_heal; + priv = this->private; - sh->loop_completion_cbk = sh_do_nothing; - sh_loop_driver (sh_frame, this, _gf_true, sh_frame); + sh->sh_data_algo_start = sh_data_algo_start; + local->call_count = 0; + ret = sh_loop_frame_create (sh_frame, this, NULL, &first_loop_frame); + if (ret) + goto out; + ret = afr_sh_transfer_lock (first_loop_frame, sh_frame, this->name, + priv->child_count); + if (ret) + goto out; + sh->private = afr_sh_priv_init (); + if (!sh->private) { + ret = -1; + goto out; + } + sh_loop_driver (sh_frame, this, _gf_true, first_loop_frame); + ret = 0; out: + if (ret) 
{ + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + sh_loop_driver_done (sh_frame, this, NULL); + } return 0; } diff --git a/xlators/cluster/afr/src/afr-self-heal-algorithm.h b/xlators/cluster/afr/src/afr-self-heal-algorithm.h index 04d8e8a6c..6b20789b1 100644 --- a/xlators/cluster/afr/src/afr-self-heal-algorithm.h +++ b/xlators/cluster/afr/src/afr-self-heal-algorithm.h @@ -1,26 +1,16 @@ /* - Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef __AFR_SELF_HEAL_ALGORITHM_H__ #define __AFR_SELF_HEAL_ALGORITHM_H__ - typedef int (*afr_sh_algo_fn) (call_frame_t *frame, xlator_t *this); diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index 8f50c6251..ef92b4205 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. 
<http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ #include "glusterfs.h" @@ -27,6 +18,52 @@ #include "afr-self-heal.h" #include "pump.h" +#define ADD_FMT_STRING(msg, off, sh_str, status, print_log) \ + do { \ + if (AFR_SELF_HEAL_NOT_ATTEMPTED != status) { \ + off += snprintf (msg + off, sizeof (msg) - off, \ + " "sh_str" self heal %s,", \ + get_sh_completion_status (status));\ + print_log = 1; \ + } \ + } while (0) + +#define ADD_FMT_STRING_SYNC(msg, off, sh_str, status, print_log) \ + do { \ + if (AFR_SELF_HEAL_SYNC_BEGIN == status || \ + AFR_SELF_HEAL_FAILED == status) { \ + off += snprintf (msg + off, sizeof (msg) - off, \ + " "sh_str" self heal %s,", \ + get_sh_completion_status (status));\ + print_log = 1; \ + } \ + } while (0) + + +void +afr_sh_reset (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + memset (sh->child_errno, 0, + sizeof (*sh->child_errno) * priv->child_count); + memset (sh->buf, 0, sizeof (*sh->buf) * priv->child_count); + memset (sh->parentbufs, 0, + sizeof (*sh->parentbufs) * priv->child_count); + memset (sh->success, 0, sizeof (*sh->success) * priv->child_count); + memset (sh->locked_nodes, 0, + sizeof (*sh->locked_nodes) * priv->child_count); + sh->active_sinks = 0; + + afr_reset_xattr (sh->xattr, priv->child_count); +} + //Intersection[child]=1 if child is part of intersection void afr_children_intersection_get (int32_t *set1, int32_t *set2, @@ -81,21 +118,6 @@ afr_sh_mark_source_sinks (call_frame_t *frame, xlator_t *this) sh->active_sinks = active_sinks; } -/** - * sink_count - return number of sinks in sources array - */ - -int -afr_sh_sink_count (int sources[], int child_count) -{ - int i = 0; - int sinks = 0; - for (i = 0; i < child_count; i++) - if (!sources[i]) - sinks++; - return sinks; -} - int afr_sh_source_count (int sources[], int child_count) { @@ -112,8 +134,8 @@ void afr_sh_set_error (afr_self_heal_t *sh, int32_t 
op_errno) { sh->op_ret = -1; - if (afr_error_more_important (sh->op_errno, op_errno)) - sh->op_errno = op_errno; + sh->op_errno = afr_most_important_error(sh->op_errno, op_errno, + _gf_false); } void @@ -135,13 +157,85 @@ afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this) ptr += sprintf (ptr, "%d ", pending_matrix[i][j]); } sprintf (ptr, "]"); - gf_log (this->name, GF_LOG_DEBUG, - "pending_matrix: %s", buf); + gf_log (this->name, GF_LOG_DEBUG, "pending_matrix: %s", buf); } GF_FREE (buf); } +char* +afr_get_pending_matrix_str (int32_t *pending_matrix[], xlator_t *this) +{ + afr_private_t * priv = this->private; + char *buf = NULL; + char *ptr = NULL; + int i = 0; + int j = 0; + int child_count = priv->child_count; + char *matrix_begin = "[ [ "; + char *matrix_end = "] ]"; + char *seperator = "] [ "; + int pending_entry_strlen = 12; //Including space after entry + int matrix_begin_strlen = 0; + int matrix_end_strlen = 0; + int seperator_strlen = 0; + int string_length = 0; + char *msg = "- Pending matrix: "; + + /* + * for a list of lists of [ [ a b ] [ c d ] ] + * */ + + matrix_begin_strlen = strlen (matrix_begin); + matrix_end_strlen = strlen (matrix_end); + seperator_strlen = strlen (seperator); + string_length = matrix_begin_strlen + matrix_end_strlen + + (child_count -1) * seperator_strlen + + (child_count * child_count * pending_entry_strlen); + + buf = GF_CALLOC (1, 1 + strlen (msg) + string_length , gf_afr_mt_char); + if (!buf) + goto out; + + ptr = buf; + ptr += sprintf (ptr, "%s", msg); + ptr += sprintf (ptr, "%s", matrix_begin); + for (i = 0; i < priv->child_count; i++) { + for (j = 0; j < priv->child_count; j++) { + ptr += sprintf (ptr, "%d ", pending_matrix[i][j]); + } + if (i < priv->child_count -1) + ptr += sprintf (ptr, "%s", seperator); + } + + ptr += sprintf (ptr, "%s", matrix_end); + +out: + return buf; +} + +void +afr_sh_print_split_brain_log (int32_t *pending_matrix[], xlator_t *this, + const char *loc) +{ + char *buf = NULL; 
+ char *free_ptr = NULL; + + buf = afr_get_pending_matrix_str (pending_matrix, this); + if (buf) + free_ptr = buf; + else + buf = ""; + + + gf_log (this->name, GF_LOG_ERROR, "Unable to self-heal contents of '%s'" + " (possible split-brain). Please delete the file from all but " + "the preferred subvolume.%s", loc, buf); + GF_FREE (free_ptr); + return; +} + + void afr_init_pending_matrix (int32_t **pending_matrix, size_t child_count) { @@ -180,6 +274,7 @@ afr_mark_ignorant_subvols_as_pending (int32_t **pending_matrix, int afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix, + unsigned char *ignorant_subvols, dict_t *xattr[], afr_transaction_type type, size_t child_count) { @@ -190,12 +285,6 @@ afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix, int i = 0; int j = 0; int k = 0; - unsigned char *ignorant_subvols = NULL; - - ignorant_subvols = GF_CALLOC (sizeof (*ignorant_subvols), child_count, - gf_afr_mt_char); - if (NULL == ignorant_subvols) - goto out; afr_init_pending_matrix (pending_matrix, child_count); @@ -213,7 +302,8 @@ afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix, * subvolume. */ - ignorant_subvols[i] = 1; + if (ignorant_subvols) + ignorant_subvols[i] = 1; continue; } @@ -224,19 +314,14 @@ afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix, } } - afr_mark_ignorant_subvols_as_pending (pending_matrix, - ignorant_subvols, - child_count); - GF_FREE (ignorant_subvols); -out: return ret; } typedef enum { + AFR_NODE_INVALID, AFR_NODE_INNOCENT, AFR_NODE_FOOL, AFR_NODE_WISE, - AFR_NODE_INVALID = -1, } afr_node_type; typedef struct { @@ -316,7 +401,7 @@ afr_sh_wise_nodes_exist (afr_node_character *characters, int child_count) * It is 1 if no other wise node accuses it. * Only wise nodes with wisdom 1 are sources. * - * If no nodes with wisdom 1 exist, a split-brain has occured. + * If no nodes with wisdom 1 exist, a split-brain has occurred. 
*/ static void @@ -416,6 +501,8 @@ afr_find_biggest_witness_among_fools (int32_t *witnesses, { int i = 0; int biggest_witness = -1; + int biggest_witness_idx = -1; + int biggest_witness_cnt = -1; GF_ASSERT (witnesses); GF_ASSERT (characters); @@ -425,10 +512,21 @@ afr_find_biggest_witness_among_fools (int32_t *witnesses, if (characters[i].type != AFR_NODE_FOOL) continue; - if (biggest_witness < witnesses[i]) + if (biggest_witness < witnesses[i]) { biggest_witness = witnesses[i]; + biggest_witness_idx = i; + biggest_witness_cnt = 1; + continue; + } + + if (biggest_witness == witnesses[i]) + biggest_witness_cnt++; } - return biggest_witness; + + if (biggest_witness_cnt != 1) + return -1; + + return biggest_witness_idx; } int @@ -456,10 +554,84 @@ afr_mark_fool_as_source_by_witness (int32_t *sources, int32_t *witnesses, return nsources; } + +int +afr_mark_fool_as_source_by_idx (int32_t *sources, int child_count, int idx) +{ + if (idx >= 0 && idx < child_count) { + sources[idx] = 1; + return 1; + } + return 0; +} + + +static int +afr_find_largest_file_size (struct iatt *bufs, int32_t *success_children, + int child_count) +{ + int idx = -1; + int i = -1; + int child = -1; + uint64_t max_size = 0; + uint64_t min_size = 0; + int num_children = 0; + + for (i = 0; i < child_count; i++) { + if (success_children[i] == -1) + break; + + child = success_children[i]; + if (bufs[child].ia_size > max_size) { + max_size = bufs[child].ia_size; + idx = child; + } + + if ((num_children == 0) || (bufs[child].ia_size < min_size)) { + min_size = bufs[child].ia_size; + } + + num_children++; + } + + /* If sizes are same for all of them, finding sources will have to + * happen with pending changelog. 
So return -1 + */ + if ((num_children > 1) && (min_size == max_size)) + return -1; + return idx; +} + + +static int +afr_find_newest_file (struct iatt *bufs, int32_t *success_children, + int child_count) +{ + int idx = -1; + int i = -1; + int child = -1; + uint64_t max_ctime = 0; + + for (i = 0; i < child_count; i++) { + if (success_children[i] == -1) + break; + + child = success_children[i]; + if (bufs[child].ia_ctime > max_ctime) { + max_ctime = bufs[child].ia_ctime; + idx = child; + } + } + + return idx; +} + + static int afr_mark_biggest_of_fools_as_source (int32_t *sources, int32_t **pending_matrix, afr_node_character *characters, - int child_count) + int32_t *success_children, + int child_count, struct iatt *bufs) { int32_t biggest_witness = 0; int nsources = 0; @@ -467,6 +639,11 @@ afr_mark_biggest_of_fools_as_source (int32_t *sources, int32_t **pending_matrix, GF_ASSERT (child_count > 0); + biggest_witness = afr_find_largest_file_size (bufs, success_children, + child_count); + if (biggest_witness != -1) + goto found; + witnesses = GF_CALLOC (child_count, sizeof (*witnesses), gf_afr_mt_int32_t); if (NULL == witnesses) { @@ -479,34 +656,34 @@ afr_mark_biggest_of_fools_as_source (int32_t *sources, int32_t **pending_matrix, biggest_witness = afr_find_biggest_witness_among_fools (witnesses, characters, child_count); - nsources = afr_mark_fool_as_source_by_witness (sources, witnesses, - characters, child_count, - biggest_witness); + if (biggest_witness != -1) + goto found; + + biggest_witness = afr_find_newest_file (bufs, success_children, + child_count); + +found: + nsources = afr_mark_fool_as_source_by_idx (sources, child_count, + biggest_witness); out: - if (witnesses) - GF_FREE (witnesses); + GF_FREE (witnesses); return nsources; } int afr_mark_child_as_source_by_uid (int32_t *sources, struct iatt *bufs, - int32_t *valid_children, int child_count, - uint32_t uid) + int32_t *success_children, + unsigned int child_count, uint32_t uid) { int i = 0; int nsources 
= 0; int child = 0; - GF_ASSERT (bufs); - GF_ASSERT (valid_children); - GF_ASSERT (sources); - GF_ASSERT (child_count > 0); - for (i = 0; i < child_count; i++) { - if (-1 == valid_children[i]) - continue; + if (-1 == success_children[i]) + break; - child = valid_children[i]; + child = success_children[i]; if (uid == bufs[child].ia_uid) { sources[child] = 1; nsources++; @@ -516,21 +693,17 @@ afr_mark_child_as_source_by_uid (int32_t *sources, struct iatt *bufs, } int -afr_get_child_with_lowest_uid (struct iatt *bufs, int32_t *valid_children, - int child_count) +afr_get_child_with_lowest_uid (struct iatt *bufs, int32_t *success_children, + unsigned int child_count) { int i = 0; int smallest = -1; int child = 0; - GF_ASSERT (bufs); - GF_ASSERT (valid_children); - GF_ASSERT (child_count > 0); - for (i = 0; i < child_count; i++) { - if (-1 == valid_children[i]) - continue; - child = valid_children[i]; + if (-1 == success_children[i]) + break; + child = success_children[i]; if ((smallest == -1) || (bufs[child].ia_uid < bufs[smallest].ia_uid)) { smallest = child; @@ -540,25 +713,97 @@ afr_get_child_with_lowest_uid (struct iatt *bufs, int32_t *valid_children, } static int -afr_sh_mark_lowest_uid_as_source (struct iatt *bufs, int32_t *valid_children, +afr_sh_mark_lowest_uid_as_source (struct iatt *bufs, int32_t *success_children, int child_count, int32_t *sources) { int nsources = 0; int smallest = 0; - smallest = afr_get_child_with_lowest_uid (bufs, valid_children, + smallest = afr_get_child_with_lowest_uid (bufs, success_children, child_count); if (smallest < 0) { nsources = -1; goto out; } nsources = afr_mark_child_as_source_by_uid (sources, bufs, - valid_children, child_count, + success_children, child_count, bufs[smallest].ia_uid); out: return nsources; } +int +afr_get_no_xattr_dir_read_child (xlator_t *this, int32_t *success_children, + struct iatt *bufs) +{ + afr_private_t *priv = NULL; + int i = 0; + int child = -1; + int read_child = -1; + + priv = this->private; + 
for (i = 0; i < priv->child_count; i++) { + child = success_children[i]; + if (child < 0) + break; + if (read_child < 0) + read_child = child; + else if (bufs[read_child].ia_size < bufs[child].ia_size) + read_child = child; + } + return read_child; +} + +int +afr_sh_mark_zero_size_file_as_sink (struct iatt *bufs, int32_t *success_children, + int child_count, int32_t *sources) +{ + int nsources = 0; + int i = 0; + int child = 0; + gf_boolean_t sink_exists = _gf_false; + gf_boolean_t source_exists = _gf_false; + int source = -1; + + for (i = 0; i < child_count; i++) { + child = success_children[i]; + if (child < 0) + break; + if (!bufs[child].ia_size) { + sink_exists = _gf_true; + continue; + } + if (!source_exists) { + source_exists = _gf_true; + source = child; + continue; + } + if (bufs[source].ia_size != bufs[child].ia_size) { + nsources = -1; + goto out; + } + } + if (!source_exists && !sink_exists) { + nsources = -1; + goto out; + } + + if (!source_exists || !sink_exists) + goto out; + + for (i = 0; i < child_count; i++) { + child = success_children[i]; + if (child < 0) + break; + if (bufs[child].ia_size) { + sources[child] = 1; + nsources++; + } + } +out: + return nsources; +} + char * afr_get_character_str (afr_node_type type) { @@ -583,12 +828,10 @@ afr_get_character_str (afr_node_type type) afr_node_type afr_find_child_character_type (int32_t *pending_row, int32_t child, - int32_t child_count, const char *xlator_name) + unsigned int child_count) { afr_node_type type = AFR_NODE_INVALID; - GF_ASSERT (pending_row); - GF_ASSERT (child_count > 0); GF_ASSERT ((child >= 0) && (child < child_count)); if (afr_sh_is_innocent (pending_row, child_count)) @@ -597,44 +840,85 @@ afr_find_child_character_type (int32_t *pending_row, int32_t child, type = AFR_NODE_FOOL; else if (afr_sh_is_wise (pending_row, child, child_count)) type = AFR_NODE_WISE; - else - GF_ASSERT (0); - - gf_log (xlator_name, GF_LOG_DEBUG, "child %d character %s", - child, afr_get_character_str (type)); 
return type; } int afr_build_sources (xlator_t *this, dict_t **xattr, struct iatt *bufs, int32_t **pending_matrix, int32_t *sources, - int32_t *success_children, afr_transaction_type type) + int32_t *success_children, afr_transaction_type type, + int32_t *subvol_status, gf_boolean_t ignore_ignorant) { afr_private_t *priv = NULL; afr_self_heal_type sh_type = AFR_SELF_HEAL_INVALID; int nsources = -1; + unsigned char *ignorant_subvols = NULL; + unsigned int child_count = 0; priv = this->private; + child_count = priv->child_count; if (afr_get_children_count (success_children, priv->child_count) == 0) goto out; + if (!ignore_ignorant) { + ignorant_subvols = GF_CALLOC (sizeof (*ignorant_subvols), + child_count, gf_afr_mt_char); + if (NULL == ignorant_subvols) + goto out; + } + afr_build_pending_matrix (priv->pending_key, pending_matrix, - xattr, type, priv->child_count); + ignorant_subvols, xattr, type, + priv->child_count); + if (!ignore_ignorant) + afr_mark_ignorant_subvols_as_pending (pending_matrix, + ignorant_subvols, + priv->child_count); sh_type = afr_self_heal_type_for_transaction (type); if (AFR_SELF_HEAL_INVALID == sh_type) goto out; afr_sh_print_pending_matrix (pending_matrix, this); - nsources = afr_mark_sources (sources, pending_matrix, bufs, - priv->child_count, sh_type, - success_children, this->name); + nsources = afr_mark_sources (this, sources, pending_matrix, bufs, + sh_type, success_children, subvol_status); out: + GF_FREE (ignorant_subvols); return nsources; } +void +afr_find_character_types (afr_node_character *characters, + int32_t **pending_matrix, int32_t *success_children, + unsigned int child_count) +{ + afr_node_type type = AFR_NODE_INVALID; + int child = 0; + int i = 0; + + for (i = 0; i < child_count; i++) { + child = success_children[i]; + if (child == -1) + break; + type = afr_find_child_character_type (pending_matrix[child], + child, child_count); + characters[child].type = type; + } +} + +void +afr_mark_success_children_sources (int32_t 
*sources, int32_t *success_children, + unsigned int child_count) +{ + int i = 0; + for (i = 0; i < child_count; i++) { + if (success_children[i] == -1) + break; + sources[success_children[i]] = 1; + } +} /** * mark_sources: Mark all 'source' nodes and return number of source * nodes found @@ -660,17 +944,18 @@ out: */ int -afr_mark_sources (int32_t *sources, int32_t **pending_matrix, struct iatt *bufs, - int32_t child_count, afr_self_heal_type type, - int32_t *valid_children, const char *xlator_name) +afr_mark_sources (xlator_t *this, int32_t *sources, int32_t **pending_matrix, + struct iatt *bufs, afr_self_heal_type type, + int32_t *success_children, int32_t *subvol_status) { /* stores the 'characters' (innocent, fool, wise) of the nodes */ - afr_node_character *characters = NULL; - int i = 0; - int nsources = -1; - xlator_t *this = NULL; + int nsources = -1; + unsigned int child_count = 0; + afr_private_t *priv = NULL; + priv = this->private; + child_count = priv->child_count; characters = GF_CALLOC (sizeof (afr_node_character), child_count, gf_afr_mt_afr_node_character); if (!characters) @@ -679,28 +964,29 @@ afr_mark_sources (int32_t *sources, int32_t **pending_matrix, struct iatt *bufs, this = THIS; /* start clean */ - for (i = 0; i < child_count; i++) { - sources[i] = 0; - } - + memset (sources, 0, sizeof (*sources) * child_count); nsources = 0; - for (i = 0; i < child_count; i++) { - characters[i].type = - afr_find_child_character_type (pending_matrix[i], i, - child_count, - xlator_name); - if (AFR_NODE_INVALID == characters[i].type) - gf_log (xlator_name, GF_LOG_WARNING, - "child %d had invalid xattrs", i); - } - - if ((type == AFR_SELF_HEAL_METADATA) - && afr_sh_all_nodes_innocent (characters, child_count)) { - - nsources = afr_sh_mark_lowest_uid_as_source (bufs, - valid_children, + afr_find_character_types (characters, pending_matrix, success_children, + child_count); + if (afr_sh_all_nodes_innocent (characters, child_count)) { + switch (type) { + case 
AFR_SELF_HEAL_METADATA: + nsources = afr_sh_mark_lowest_uid_as_source (bufs, + success_children, + child_count, + sources); + break; + case AFR_SELF_HEAL_DATA: + nsources = afr_sh_mark_zero_size_file_as_sink (bufs, + success_children, child_count, sources); + if ((nsources < 0) && subvol_status) + *subvol_status |= SPLIT_BRAIN; + break; + default: + break; + } goto out; } @@ -708,32 +994,29 @@ afr_mark_sources (int32_t *sources, int32_t **pending_matrix, struct iatt *bufs, afr_sh_compute_wisdom (pending_matrix, characters, child_count); if (afr_sh_wise_nodes_conflict (characters, child_count)) { - /* split-brain */ - gf_log (this->name, GF_LOG_INFO, - "split-brain possible, no source detected"); + if (subvol_status) + *subvol_status |= SPLIT_BRAIN; nsources = -1; - } else { nsources = afr_sh_mark_wisest_as_sources (sources, characters, child_count); } } else { + if (subvol_status) + *subvol_status |= ALL_FOOLS; nsources = afr_mark_biggest_of_fools_as_source (sources, pending_matrix, characters, - child_count); + success_children, + child_count, bufs); } out: - if (nsources == 0) { - for (i = 0; i < child_count; i++) { - if (valid_children[i] != -1) - sources[valid_children[i]] = 1; - } - } - if (characters) - GF_FREE (characters); + if (nsources == 0) + afr_mark_success_children_sources (sources, success_children, + child_count); + GF_FREE (characters); gf_log (this->name, GF_LOG_DEBUG, "Number of sources: %d", nsources); return nsources; @@ -744,81 +1027,108 @@ afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr, int32_t *delta_matrix[], unsigned char success[], int child_count, afr_transaction_type type) { - /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. 
*/ - int32_t pending[3] = {0,}; - void *pending_raw = NULL; - int ret = 0; - int i = 0; - int j = 0; - int k = 0; - - /* start clean */ - for (i = 0; i < child_count; i++) { - for (j = 0; j < child_count; j++) { - delta_matrix[i][j] = 0; - } - } - - for (i = 0; i < child_count; i++) { - if (pending_raw) - pending_raw = NULL; - - for (j = 0; j < child_count; j++) { - ret = dict_get_ptr (xattr[i], priv->pending_key[j], - &pending_raw); - if (ret < 0) - gf_log (THIS->name, GF_LOG_DEBUG, - "Unable to get dict value."); - if (!success[j]) - continue; + int tgt = 0; + int src = 0; + int value = 0; - k = afr_index_for_transaction_type (type); + afr_build_pending_matrix (priv->pending_key, delta_matrix, NULL, + xattr, type, priv->child_count); - if (pending_raw != NULL) { - memcpy (pending, pending_raw, sizeof(pending)); - delta_matrix[i][j] = -(ntoh32 (pending[k])); - } else { - delta_matrix[i][j] = 0; + /* + * The algorithm here has two parts. First, for each subvol indexed + * as tgt, we try to figure out what count everyone should have for it. + * If the self-heal succeeded, that's easy; the value is zero. + * Otherwise, the value is the maximum of the succeeding nodes' counts. + * Once we know the value, we loop through (possibly for a second time) + * setting each count to the difference so that when we're done all + * succeeding nodes will have the same count for tgt. + */ + for (tgt = 0; tgt < priv->child_count; ++tgt) { + value = 0; + if (!success[tgt]) { + /* Find the maximum. */ + for (src = 0; src < priv->child_count; ++src) { + if (!success[src]) { + continue; + } + if (delta_matrix[src][tgt] > value) { + value = delta_matrix[src][tgt]; + } + } + } + /* Force everyone who succeeded to the chosen value. 
*/ + for (src = 0; src < priv->child_count; ++src) { + if (success[src]) { + delta_matrix[src][tgt] = value + - delta_matrix[src][tgt]; + } + else { + delta_matrix[src][tgt] = 0; } - } } } int -afr_sh_delta_to_xattr (afr_private_t *priv, +afr_sh_delta_to_xattr (xlator_t *this, int32_t *delta_matrix[], dict_t *xattr[], int child_count, afr_transaction_type type) { - int i = 0; - int j = 0; - int k = 0; - int ret = 0; - int32_t *pending = NULL; + int i = 0; + int j = 0; + int k = 0; + int ret = 0; + int32_t *pending = NULL; + int32_t *local_pending = NULL; + afr_private_t *priv = NULL; + priv = this->private; for (i = 0; i < child_count; i++) { if (!xattr[i]) continue; + local_pending = NULL; for (j = 0; j < child_count; j++) { pending = GF_CALLOC (sizeof (int32_t), 3, gf_afr_mt_int32_t); - if (!pending) + if (!pending) { + gf_log (this->name, GF_LOG_ERROR, + "failed to allocate pending entry " + "for %s[%d] on %s", + priv->pending_key[j], type, + priv->children[i]->name); continue; + } /* 3 = data+metadata+entry */ k = afr_index_for_transaction_type (type); pending[k] = hton32 (delta_matrix[i][j]); + if (j == i) { + local_pending = pending; + continue; + } ret = dict_set_bin (xattr[i], priv->pending_key[j], pending, - 3 * sizeof (int32_t)); - if (ret < 0) - gf_log (THIS->name, GF_LOG_WARNING, + AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, "Unable to set dict value."); + GF_FREE (pending); + } + } + if (local_pending) { + ret = dict_set_bin (xattr[i], priv->pending_key[i], + local_pending, + AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "Unable to set dict value."); + GF_FREE (local_pending); + } } } return 0; @@ -826,146 +1136,23 @@ afr_sh_delta_to_xattr (afr_private_t *priv, int -afr_sh_has_metadata_pending (dict_t *xattr, xlator_t *this) -{ - /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. 
*/ - int32_t pending[3] = {0,}; - void *pending_raw = NULL; - afr_private_t *priv = NULL; - int ret = -1; - int i = 0; - int j = 0; - - priv = this->private; - - for (i = 0; i < priv->child_count; i++) { - ret = dict_get_ptr (xattr, priv->pending_key[i], - &pending_raw); - - if (ret != 0) - return 0; - - memcpy (pending, pending_raw, sizeof(pending)); - j = afr_index_for_transaction_type (AFR_METADATA_TRANSACTION); - - if (pending[j]) - return 1; - } - - return 0; -} - - -int -afr_sh_has_data_pending (dict_t *xattr, xlator_t *this) -{ - /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */ - int32_t pending[3] = {0,}; - void *pending_raw = NULL; - afr_private_t *priv = NULL; - int ret = -1; - int i = 0; - int j = 0; - - priv = this->private; - - for (i = 0; i < priv->child_count; i++) { - ret = dict_get_ptr (xattr, priv->pending_key[i], - &pending_raw); - - if (ret != 0) - return 0; - - memcpy (pending, pending_raw, sizeof(pending)); - j = afr_index_for_transaction_type (AFR_DATA_TRANSACTION); - - if (pending[j]) - return 1; - } - - return 0; -} - - -int -afr_sh_has_entry_pending (dict_t *xattr, xlator_t *this) -{ - /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. 
*/ - int32_t pending[3] = {0,}; - void *pending_raw = NULL; - afr_private_t *priv = NULL; - int ret = -1; - int i = 0; - int j = 0; - - priv = this->private; - - for (i = 0; i < priv->child_count; i++) { - ret = dict_get_ptr (xattr, priv->pending_key[i], - &pending_raw); - - if (ret != 0) - return 0; - - memcpy (pending, pending_raw, sizeof(pending)); - j = afr_index_for_transaction_type (AFR_ENTRY_TRANSACTION); - - if (pending[j]) - return 1; - } - - return 0; -} - - -/** - * is_matrix_zero - return true if pending matrix is all zeroes - */ - -int -afr_sh_is_matrix_zero (int32_t *pending_matrix[], int child_count) -{ - int i = 0; - int j = 0; - - for (i = 0; i < child_count; i++) - for (j = 0; j < child_count; j++) - if (pending_matrix[i][j]) - return 0; - return 1; -} - - -int afr_sh_missing_entries_done (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int i = 0; local = frame->local; sh = &local->self_heal; - priv = this->private; -// memset (sh->child_errno, 0, sizeof (int) * priv->child_count); - memset (sh->buf, 0, sizeof (struct iatt) * priv->child_count); - - for (i = 0; i < priv->child_count; i++) { - sh->locked_nodes[i] = 0; - } + afr_sh_reset (frame, this); - for (i = 0; i < priv->child_count; i++) { - if (sh->xattr[i]) - dict_unref (sh->xattr[i]); - sh->xattr[i] = NULL; - } - - if (local->govinda_gOvinda || sh->op_failed) { - gf_log (this->name, GF_LOG_INFO, + if (local->unhealable) { + gf_log (this->name, GF_LOG_DEBUG, "split brain found, aborting selfheal of %s", local->loc.path); - sh->op_failed = 1; + } + + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { sh->completion_cbk (frame, this); } else { gf_log (this->name, GF_LOG_TRACE, @@ -993,6 +1180,37 @@ afr_sh_missing_entries_finish (call_frame_t *frame, xlator_t *this) return 0; } +int +afr_sh_common_create (afr_self_heal_t *sh, unsigned int child_count) +{ + int ret = -ENOMEM; + sh->buf = GF_CALLOC (child_count, sizeof 
(*sh->buf), + gf_afr_mt_iatt); + if (!sh->buf) + goto out; + sh->parentbufs = GF_CALLOC (child_count, sizeof (*sh->parentbufs), + gf_afr_mt_iatt); + if (!sh->parentbufs) + goto out; + sh->child_errno = GF_CALLOC (child_count, sizeof (*sh->child_errno), + gf_afr_mt_int); + if (!sh->child_errno) + goto out; + sh->success_children = afr_children_create (child_count); + if (!sh->success_children) + goto out; + sh->fresh_children = afr_children_create (child_count); + if (!sh->fresh_children) + goto out; + sh->xattr = GF_CALLOC (child_count, sizeof (*sh->xattr), + gf_afr_mt_dict_t); + if (!sh->xattr) + goto out; + ret = 0; +out: + return ret; +} + void afr_sh_common_lookup_resp_handler (call_frame_t *frame, void *cookie, xlator_t *this, @@ -1020,7 +1238,7 @@ afr_sh_common_lookup_resp_handler (call_frame_t *frame, void *cookie, sh->success_count++; sh->xattr[child_index] = dict_ref (xattr); } else { - gf_log (this->name, GF_LOG_ERROR, "path %s on subvolume" + gf_log (this->name, GF_LOG_DEBUG, "path %s on subvolume" " %s => -1 (%s)", loc->path, priv->children[child_index]->name, strerror (op_errno)); @@ -1049,64 +1267,140 @@ afr_valid_ia_type (ia_type_t ia_type) return _gf_false; } +int +afr_impunge_frame_create (call_frame_t *frame, xlator_t *this, + int active_source, call_frame_t **impunge_frame) +{ + afr_local_t *local = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + int32_t op_errno = 0; + afr_private_t *priv = NULL; + int ret = 0; + call_frame_t *new_frame = NULL; + + op_errno = ENOMEM; + priv = this->private; + new_frame = copy_frame (frame); + if (!new_frame) { + goto out; + } + + AFR_LOCAL_ALLOC_OR_GOTO (impunge_local, out); + + local = frame->local; + new_frame->local = impunge_local; + impunge_sh = &impunge_local->self_heal; + impunge_sh->sh_frame = frame; + impunge_sh->active_source = active_source; + impunge_local->child_up = memdup (local->child_up, + sizeof (*local->child_up) * + priv->child_count); + if 
(!impunge_local->child_up) + goto out; + + impunge_local->pending = afr_matrix_create (priv->child_count, + AFR_NUM_CHANGE_LOGS); + if (!impunge_local->pending) + goto out; + + ret = afr_sh_common_create (impunge_sh, priv->child_count); + if (ret) { + op_errno = -ret; + goto out; + } + op_errno = 0; + *impunge_frame = new_frame; +out: + if (op_errno && new_frame) + AFR_STACK_DESTROY (new_frame); + return -op_errno; +} + void -afr_sh_call_entry_impunge_recreate (call_frame_t *frame, xlator_t *this, - int child_index, struct iatt *buf, - struct iatt *postparent, - afr_impunge_done_cbk_t impunge_done) +afr_sh_missing_entry_call_impunge_recreate (call_frame_t *frame, xlator_t *this, + struct iatt *buf, + struct iatt *postparent, + afr_impunge_done_cbk_t impunge_done) { call_frame_t *impunge_frame = NULL; afr_local_t *local = NULL; afr_local_t *impunge_local = NULL; afr_self_heal_t *sh = NULL; afr_self_heal_t *impunge_sh = NULL; + int ret = 0; + unsigned int enoent_count = 0; + afr_private_t *priv = NULL; + int i = 0; int32_t op_errno = 0; - impunge_frame = copy_frame (frame); - if (!impunge_frame) { - op_errno = ENOMEM; + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + enoent_count = afr_errno_count (NULL, sh->child_errno, + priv->child_count, ENOENT); + if (!enoent_count) { + gf_log (this->name, GF_LOG_INFO, + "no missing files - %s. 
proceeding to metadata check", + local->loc.path); goto out; } - - ALLOC_OR_GOTO (impunge_local, afr_local_t, out); - - local = frame->local; - sh = &local->self_heal; - impunge_frame->local = impunge_local; - impunge_sh = &impunge_local->self_heal; - impunge_sh->sh_frame = frame; - impunge_sh->active_source = sh->source; - impunge_sh->impunging_entry_mode = st_mode_from_ia (buf->ia_prot, - buf->ia_type); - impunge_sh->impunge_ret_child = child_index; - loc_copy (&impunge_local->loc, &local->loc); sh->impunge_done = impunge_done; - impunge_local->call_count = 1; - afr_sh_entry_impunge_create (impunge_frame, this, child_index, buf, - postparent); + ret = afr_impunge_frame_create (frame, this, sh->source, &impunge_frame); + if (ret) + goto out; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + loc_copy (&impunge_local->loc, &local->loc); + ret = afr_build_parent_loc (&impunge_sh->parent_loc, + &impunge_local->loc, &op_errno); + if (ret) { + ret = -op_errno; + goto out; + } + impunge_local->call_count = enoent_count; + impunge_sh->entrybuf = sh->buf[sh->source]; + impunge_sh->parentbuf = sh->parentbufs[sh->source]; + for (i = 0; i < priv->child_count; i++) { + if (!impunge_local->child_up[i]) { + impunge_sh->child_errno[i] = ENOTCONN; + continue; + } + if (sh->child_errno[i] != ENOENT) { + impunge_sh->child_errno[i] = EEXIST; + continue; + } + } + for (i = 0; i < priv->child_count; i++) { + if (sh->child_errno[i] != ENOENT) + continue; + afr_sh_entry_impunge_create (impunge_frame, this, i); + enoent_count--; + } + GF_ASSERT (!enoent_count); return; out: - gf_log (this->name, GF_LOG_ERROR, "impunge of %s failed, reason: %s", - local->loc.path, strerror (op_errno)); - impunge_done (frame, this, child_index, -1, op_errno); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "impunge of %s failed, " + "reason: %s", local->loc.path, strerror (-ret)); + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + } + afr_sh_missing_entries_finish 
(frame, this); } int -afr_sh_create_entry_cbk (call_frame_t *frame, xlator_t *this, int child, +afr_sh_create_entry_cbk (call_frame_t *frame, xlator_t *this, int32_t op_ret, int32_t op_errno) { - int call_count = 0; afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; local = frame->local; - - if (op_ret == -1) - gf_log (this->name, GF_LOG_ERROR, - "create entry %s failed, on child %d reason, %s", - local->loc.path, child, strerror (op_errno)); - call_count = afr_frame_return (frame); - if (call_count == 0) - afr_sh_missing_entries_finish (frame, this); + sh = &local->self_heal; + if (op_ret < 0) + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + afr_sh_missing_entries_finish (frame, this); return 0; } @@ -1116,26 +1410,11 @@ sh_missing_entries_create (call_frame_t *frame, xlator_t *this) afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; int type = 0; - afr_private_t *priv = NULL; - int enoent_count = 0; - int i = 0; struct iatt *buf = NULL; struct iatt *postparent = NULL; local = frame->local; sh = &local->self_heal; - priv = this->private; - - enoent_count = afr_errno_count (NULL, sh->child_errno, - priv->child_count, ENOENT); - if (enoent_count == 0) { - gf_log (this->name, GF_LOG_INFO, - "no missing files - %s. 
proceeding to metadata check", - local->loc.path); - /* proceed to next step - metadata self-heal */ - afr_sh_missing_entries_finish (frame, this); - return 0; - } buf = &sh->buf[sh->source]; postparent = &sh->parentbufs[sh->source]; @@ -1144,72 +1423,80 @@ sh_missing_entries_create (call_frame_t *frame, xlator_t *this) if (!afr_valid_ia_type (type)) { gf_log (this->name, GF_LOG_ERROR, "%s: unknown file type: 0%o", local->loc.path, type); - local->govinda_gOvinda = 1; + afr_set_local_for_unhealable (local); afr_sh_missing_entries_finish (frame, this); goto out; } - local->call_count = enoent_count; - for (i = 0; i < priv->child_count; i++) { - //If !child_up errno will be zero - if (sh->child_errno[i] != ENOENT) - continue; - afr_sh_call_entry_impunge_recreate (frame, this, i, + afr_sh_missing_entry_call_impunge_recreate (frame, this, buf, postparent, afr_sh_create_entry_cbk); - enoent_count--; - } - GF_ASSERT (enoent_count == 0); out: return 0; } void -afr_sh_missing_entries_lookup_done (call_frame_t *frame, xlator_t *this) +afr_sh_missing_entries_lookup_done (call_frame_t *frame, xlator_t *this, + int32_t op_ret, int32_t op_errno) { afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; afr_private_t *priv = NULL; - int32_t op_errno = 0; ia_type_t ia_type = IA_INVAL; int32_t nsources = 0; + loc_t *loc = NULL; + int32_t subvol_status = 0; + afr_transaction_type txn_type = AFR_DATA_TRANSACTION; + gf_boolean_t split_brain = _gf_false; + int read_child = -1; local = frame->local; sh = &local->self_heal; priv = this->private; + loc = &local->loc; - if (afr_get_children_count (sh->success_children, - priv->child_count) == 0) { - op_errno = afr_resultant_errno_get (NULL, sh->child_errno, - priv->child_count); - goto out; - } - - if (afr_gfid_missing_count (this->name, sh->success_children, - sh->buf, priv->child_count, - local->loc.path) || - afr_conflicting_iattrs (sh->buf, sh->success_children, - priv->child_count, local->loc.path, - this->name)) { - //this can happen 
if finding the fresh parent dir failed - local->govinda_gOvinda = 1; - sh->op_failed = 1; - op_errno = EIO; + if (op_ret < 0) { + if (op_errno == EIO) { + afr_set_local_for_unhealable (local); + } + // EIO can happen if finding the fresh parent dir failed goto out; } //now No chance for the ia_type to conflict ia_type = sh->buf[sh->success_children[0]].ia_type; + txn_type = afr_transaction_type_get (ia_type); nsources = afr_build_sources (this, sh->xattr, sh->buf, sh->pending_matrix, sh->sources, - sh->success_children, - afr_transaction_type_get (ia_type)); + sh->success_children, txn_type, + &subvol_status, _gf_false); if (nsources < 0) { gf_log (this->name, GF_LOG_INFO, "No sources for dir of %s," " in missing entry self-heal, continuing with the rest" " of the self-heals", local->loc.path); - op_errno = EIO; - goto out; + if (subvol_status & SPLIT_BRAIN) { + split_brain = _gf_true; + switch (txn_type) { + case AFR_DATA_TRANSACTION: + nsources = 1; + sh->sources[sh->success_children[0]] = 1; + break; + case AFR_ENTRY_TRANSACTION: + read_child = afr_get_no_xattr_dir_read_child + (this, + sh->success_children, + sh->buf); + sh->sources[read_child] = 1; + nsources = 1; + break; + default: + op_errno = EIO; + goto out; + } + } else { + op_errno = EIO; + goto out; + } } afr_get_fresh_children (sh->success_children, sh->sources, @@ -1224,34 +1511,77 @@ afr_sh_missing_entries_lookup_done (call_frame_t *frame, xlator_t *this) if (sh->gfid_sh_success_cbk) sh->gfid_sh_success_cbk (frame, this); sh->type = sh->buf[sh->source].ia_type; - sh_missing_entries_create (frame, this); + if (uuid_is_null (loc->inode->gfid)) + uuid_copy (loc->gfid, sh->buf[sh->source].ia_gfid); + if (split_brain) { + afr_sh_missing_entries_finish (frame, this); + } else { + sh_missing_entries_create (frame, this); + } return; out: + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_set_error (sh, op_errno); afr_sh_missing_entries_finish (frame, this); return; } static int 
-afr_sh_missing_entries_lookup_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, - int32_t op_errno, inode_t *inode, - struct iatt *buf, dict_t *xattr, - struct iatt *postparent) +afr_sh_common_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xattr, + struct iatt *postparent) { int call_count = 0; afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; local = frame->local; + sh = &local->self_heal; + priv = this->private; afr_sh_common_lookup_resp_handler (frame, cookie, this, op_ret, op_errno, inode, buf, xattr, - postparent, &local->loc); + postparent, &sh->lookup_loc); call_count = afr_frame_return (frame); - if (call_count == 0) - afr_sh_missing_entries_lookup_done (frame, this); + if (call_count) + goto out; + op_ret = -1; + if (!sh->success_count) { + op_errno = afr_resultant_errno_get (NULL, sh->child_errno, + priv->child_count); + gf_log (this->name, GF_LOG_ERROR, "Failed to lookup %s, " + "reason %s", sh->lookup_loc.path, + strerror (op_errno)); + goto done; + } + if ((sh->lookup_flags & AFR_LOOKUP_FAIL_CONFLICTS) && + (afr_conflicting_iattrs (sh->buf, sh->success_children, + priv->child_count, + sh->lookup_loc.path, this->name))) { + op_errno = EIO; + gf_log (this->name, GF_LOG_ERROR, "Conflicting entries " + "for %s", sh->lookup_loc.path); + goto done; + } + + if ((sh->lookup_flags & AFR_LOOKUP_FAIL_MISSING_GFIDS) && + (afr_gfid_missing_count (this->name, sh->success_children, + sh->buf, priv->child_count, + sh->lookup_loc.path))) { + op_errno = ENODATA; + gf_log (this->name, GF_LOG_ERROR, "Missing Gfids " + "for %s", sh->lookup_loc.path); + goto done; + } + op_ret = 0; + +done: + sh->lookup_done (frame, this, op_ret, op_errno); +out: return 0; } @@ -1274,7 +1604,7 @@ afr_sh_remove_entry_cbk (call_frame_t *frame, xlator_t *this, int child, LOCK (&frame->lock); { afr_sh_set_error (sh, EIO); - sh->op_failed = 
1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); } UNLOCK (&frame->lock); } @@ -1287,6 +1617,7 @@ afr_sh_remove_entry_cbk (call_frame_t *frame, xlator_t *this, int child, void afr_sh_call_entry_expunge_remove (call_frame_t *frame, xlator_t *this, int child_index, struct iatt *buf, + struct iatt *parentbuf, afr_expunge_done_cbk_t expunge_done) { call_frame_t *expunge_frame = NULL; @@ -1295,13 +1626,14 @@ afr_sh_call_entry_expunge_remove (call_frame_t *frame, xlator_t *this, afr_self_heal_t *sh = NULL; afr_self_heal_t *expunge_sh = NULL; int32_t op_errno = 0; + int ret = 0; expunge_frame = copy_frame (frame); if (!expunge_frame) { goto out; } - ALLOC_OR_GOTO (expunge_local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (expunge_local, out); local = frame->local; sh = &local->self_heal; @@ -1309,8 +1641,15 @@ afr_sh_call_entry_expunge_remove (call_frame_t *frame, xlator_t *this, expunge_sh = &expunge_local->self_heal; expunge_sh->sh_frame = frame; loc_copy (&expunge_local->loc, &local->loc); + ret = afr_build_parent_loc (&expunge_sh->parent_loc, + &expunge_local->loc, &op_errno); + if (ret) { + ret = -op_errno; + goto out; + } sh->expunge_done = expunge_done; - afr_sh_entry_expunge_remove (expunge_frame, this, child_index, buf); + afr_sh_entry_expunge_remove (expunge_frame, this, child_index, buf, + parentbuf); return; out: gf_log (this->name, GF_LOG_ERROR, "Expunge of %s failed, reason: %s", @@ -1347,15 +1686,18 @@ afr_sh_purge_stale_entries_done (call_frame_t *frame, xlator_t *this) sh = &local->self_heal; priv = this->private; - if (sh->op_failed) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { afr_sh_missing_entries_finish (frame, this); } else { if (afr_gfid_missing_count (this->name, sh->fresh_children, sh->buf, priv->child_count, local->loc.path)) { afr_sh_common_lookup (frame, this, &local->loc, - afr_sh_missing_entries_lookup_cbk, - _gf_true); + afr_sh_missing_entries_lookup_done, + sh->sh_gfid_req, + AFR_LOOKUP_FAIL_CONFLICTS| + 
AFR_LOOKUP_FAIL_MISSING_GFIDS, + NULL); } else { //No need to set gfid so goto missing entries lookup done //Behave as if you have done the lookup @@ -1366,7 +1708,7 @@ afr_sh_purge_stale_entries_done (call_frame_t *frame, xlator_t *this) afr_children_copy (sh->success_children, sh->fresh_children, priv->child_count); - afr_sh_missing_entries_lookup_done (frame, this); + afr_sh_missing_entries_lookup_done (frame, this, 0, 0); } } return 0; @@ -1437,9 +1779,10 @@ afr_sh_purge_entry_common (call_frame_t *frame, xlator_t *this, if (!purge_condition (local, priv, i)) continue; gf_log (this->name, GF_LOG_INFO, "purging the stale entry %s " - "on %d", local->loc.path, i); + "on %s", local->loc.path, priv->children[i]->name); afr_sh_call_entry_expunge_remove (frame, this, (long) i, &sh->buf[i], + &sh->parentbufs[i], afr_sh_remove_entry_cbk); } out: @@ -1521,35 +1864,34 @@ afr_sh_save_child_iatts_from_policy (int32_t *children, struct iatt *bufs, } void -afr_sh_children_lookup_done (call_frame_t *frame, xlator_t *this) +afr_get_children_of_fresh_parent_dirs (afr_self_heal_t *sh, + unsigned int child_count) +{ + afr_children_intersection_get (sh->success_children, + sh->fresh_parent_dirs, + sh->sources, child_count); + afr_get_fresh_children (sh->success_children, sh->sources, + sh->fresh_children, child_count); + memset (sh->sources, 0, sizeof (*sh->sources) * child_count); +} + +void +afr_sh_children_lookup_done (call_frame_t *frame, xlator_t *this, + int32_t op_ret, int32_t op_errno) { afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; afr_private_t *priv = NULL; int32_t fresh_child_enoents = 0; int32_t fresh_parent_count = 0; - int32_t op_errno = 0; local = frame->local; sh = &local->self_heal; priv = this->private; - if (afr_get_children_count (sh->success_children, - priv->child_count) == 0) { - op_errno = afr_resultant_errno_get (NULL, sh->child_errno, - priv->child_count); + if (op_ret < 0) goto fail; - } - - //make intersection of (success_children & 
fresh_parent_dirs) fresh_children - //the other success_children will be added to it if they are not stale - afr_children_intersection_get (sh->success_children, - sh->fresh_parent_dirs, - sh->sources, priv->child_count); - afr_get_fresh_children (sh->success_children, sh->sources, - sh->fresh_children, priv->child_count); - memset (sh->sources, 0, sizeof (*sh->sources) * priv->child_count); - + afr_get_children_of_fresh_parent_dirs (sh, priv->child_count); fresh_parent_count = afr_get_children_count (sh->fresh_parent_dirs, priv->child_count); //we need the enoent count of the subvols present in fresh_parent_dirs @@ -1557,10 +1899,8 @@ afr_sh_children_lookup_done (call_frame_t *frame, xlator_t *this) sh->child_errno, priv->child_count, ENOENT); if (fresh_child_enoents == fresh_parent_count) { - gf_log (this->name, GF_LOG_INFO, "Deleting stale file %s", - local->loc.path); afr_sh_set_error (sh, ENOENT); - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_purge_entry (frame, this); } else if (!afr_conflicting_iattrs (sh->buf, sh->fresh_children, priv->child_count, local->loc.path, @@ -1574,42 +1914,22 @@ afr_sh_children_lookup_done (call_frame_t *frame, xlator_t *this) afr_sh_purge_stale_entry (frame, this); } else { op_errno = EIO; - local->govinda_gOvinda = 1; + afr_set_local_for_unhealable (local); goto fail; } return; fail: + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_set_error (sh, op_errno); afr_sh_missing_entries_finish (frame, this); return; } -static int -afr_sh_children_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, dict_t *xattr, - struct iatt *postparent) -{ - int call_count = 0; - afr_local_t *local = NULL; - - local = frame->local; - - afr_sh_common_lookup_resp_handler (frame, cookie, this, op_ret, - op_errno, inode, buf, xattr, - postparent, &local->loc); - call_count = afr_frame_return (frame); - - if (call_count == 
0) - afr_sh_children_lookup_done (frame, this); - - return 0; -} - -static int -afr_sh_find_fresh_parents (call_frame_t *frame, xlator_t *this) +static void +afr_sh_find_fresh_parents (call_frame_t *frame, xlator_t *this, + int32_t op_ret, int32_t op_errno) { afr_self_heal_t *sh = NULL; afr_private_t *priv = NULL; @@ -1617,54 +1937,42 @@ afr_sh_find_fresh_parents (call_frame_t *frame, xlator_t *this) int enoent_count = 0; int nsources = 0; int source = -1; + int32_t subvol_status = 0; local = frame->local; sh = &local->self_heal; priv = this->private; - /* If We can't find a fresh parent directory here, - * we wont know which subvol is correct without finding a parent dir - * upwards which has correct xattrs, for that we may have to - * do lookups till root, we dont wanna do that, - * instead make sure that if there are conflicting gfid - * parent dirs, self-heal thus lookup is failed with EIO. - * if there are missing entries we dont know whether to delete or - * create so fail with EIO, - * If there are conflicting xattr fail with EIO. 
- */ - if (afr_get_children_count (sh->success_children, - priv->child_count) == 0) { - gf_log (this->name, GF_LOG_ERROR, "Parent dir lookup failed " - "for %s, in missing entry self-heal, continuing with " - "the rest of the self-heals", local->loc.path); + if (op_ret < 0) goto out; - } - enoent_count = afr_errno_count (NULL, sh->child_errno, priv->child_count, ENOENT); if (enoent_count > 0) { gf_log (this->name, GF_LOG_INFO, "Parent dir missing for %s," - " in missing entry self-heal, continuing with the rest" - " of the self-heals", local->loc.path); - goto out; - } - - if (afr_conflicting_iattrs (sh->buf, sh->success_children, - priv->child_count, sh->parent_loc.path, - this->name)) { - gf_log (this->name, GF_LOG_INFO, "conflicting stat info for " - "parent dirs of %s", local->loc.path); - goto out; + " in missing entry self-heal, aborting missing-entry " + "self-heal", + local->loc.path); + afr_sh_missing_entries_finish (frame, this); + return; } nsources = afr_build_sources (this, sh->xattr, sh->buf, sh->pending_matrix, sh->sources, sh->success_children, - AFR_ENTRY_TRANSACTION); - if (nsources < 0) { - gf_log (this->name, GF_LOG_INFO, "No sources for dir of %s," - " in missing entry self-heal, continuing with the rest" - " of the self-heals", local->loc.path); + AFR_ENTRY_TRANSACTION, &subvol_status, + _gf_true); + if ((subvol_status & ALL_FOOLS) || + (subvol_status & SPLIT_BRAIN)) { + gf_log (this->name, GF_LOG_INFO, "%s: Performing conservative " + "merge", sh->parent_loc.path); + afr_mark_success_children_sources (sh->sources, + sh->success_children, + priv->child_count); + } else if (nsources < 0) { + gf_log (this->name, GF_LOG_ERROR, "No sources for dir " + "of %s, in missing entry self-heal, aborting " + "self-heal", local->loc.path); + op_errno = EIO; goto out; } @@ -1672,44 +1980,21 @@ afr_sh_find_fresh_parents (call_frame_t *frame, xlator_t *this) if (source == -1) { GF_ASSERT (0); gf_log (this->name, GF_LOG_DEBUG, "No active sources found."); + 
op_errno = EIO; goto out; } afr_get_fresh_children (sh->success_children, sh->sources, sh->fresh_parent_dirs, priv->child_count); afr_sh_common_lookup (frame, this, &local->loc, - afr_sh_children_lookup_cbk, _gf_false); - return 0; + afr_sh_children_lookup_done, NULL, 0, + NULL); + return; out: - afr_sh_set_error (sh, EIO); - sh->op_failed = 1; - afr_sh_missing_entries_finish (frame, this); - return 0; -} - -int -afr_sh_conflicting_entry_lookup_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, - dict_t *xattr, struct iatt *postparent) -{ - int call_count = 0; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - - afr_sh_common_lookup_resp_handler (frame, cookie, this, op_ret, - op_errno, inode, buf, xattr, - postparent, &sh->parent_loc); - call_count = afr_frame_return (frame); - - if (call_count == 0) - afr_sh_find_fresh_parents (frame, this); - - return 0; + afr_sh_set_error (sh, op_errno); + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + afr_sh_missing_entries_finish (frame, this); + return; } void @@ -1727,6 +2012,7 @@ afr_sh_common_reset (afr_self_heal_t *sh, unsigned int child_count) afr_reset_children (sh->success_children, child_count); afr_reset_children (sh->fresh_children, child_count); afr_reset_xattr (sh->xattr, child_count); + loc_wipe (&sh->lookup_loc); } /* afr self-heal state will be lost if this call is made @@ -1734,7 +2020,8 @@ afr_sh_common_reset (afr_self_heal_t *sh, unsigned int child_count) */ int afr_sh_common_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, - afr_lookup_cbk_t lookup_cbk, gf_boolean_t set_gfid) + afr_lookup_done_cbk_t lookup_done , uuid_t gfid, + int32_t flags, dict_t *xdata) { afr_local_t *local = NULL; int i = 0; @@ -1755,16 +2042,19 @@ afr_sh_common_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, if (xattr_req) { afr_xattr_req_prepare (this, xattr_req, 
loc->path); - if (set_gfid) { + if (gfid) { gf_log (this->name, GF_LOG_DEBUG, "looking up %s with gfid: %s", - loc->path, uuid_utoa (sh->sh_gfid_req)); - GF_ASSERT (!uuid_is_null (sh->sh_gfid_req)); - afr_set_dict_gfid (xattr_req, sh->sh_gfid_req); + loc->path, uuid_utoa (gfid)); + GF_ASSERT (!uuid_is_null (gfid)); + afr_set_dict_gfid (xattr_req, gfid); } } afr_sh_common_reset (sh, priv->child_count); + sh->lookup_done = lookup_done; + loc_copy (&sh->lookup_loc, loc); + sh->lookup_flags = flags; for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { gf_log (this->name, GF_LOG_DEBUG, @@ -1772,7 +2062,7 @@ afr_sh_common_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, loc->path, priv->children[i]->name); STACK_WIND_COOKIE (frame, - lookup_cbk, + afr_sh_common_lookup_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->lookup, @@ -1792,7 +2082,8 @@ afr_sh_common_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, int -afr_sh_post_nb_entrylk_conflicting_sh_cbk (call_frame_t *frame, xlator_t *this) +afr_sh_post_nb_entrylk_missing_entry_sh_cbk (call_frame_t *frame, + xlator_t *this) { afr_internal_lock_t *int_lock = NULL; afr_local_t *local = NULL; @@ -1805,38 +2096,16 @@ afr_sh_post_nb_entrylk_conflicting_sh_cbk (call_frame_t *frame, xlator_t *this) if (int_lock->lock_op_ret < 0) { gf_log (this->name, GF_LOG_INFO, "Non blocking entrylks failed."); + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_missing_entries_done (frame, this); } else { gf_log (this->name, GF_LOG_DEBUG, "Non blocking entrylks done. 
Proceeding to FOP"); afr_sh_common_lookup (frame, this, &sh->parent_loc, - afr_sh_conflicting_entry_lookup_cbk, - _gf_false); - } - - return 0; -} - -int -afr_sh_post_nb_entrylk_gfid_sh_cbk (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - - if (int_lock->lock_op_ret < 0) { - gf_log (this->name, GF_LOG_INFO, - "Non blocking entrylks failed."); - afr_sh_missing_entries_done (frame, this); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "Non blocking entrylks done. Proceeding to FOP"); - afr_sh_common_lookup (frame, this, &local->loc, - afr_sh_missing_entries_lookup_cbk, - _gf_true); + afr_sh_find_fresh_parents, + NULL, AFR_LOOKUP_FAIL_CONFLICTS, + NULL); } return 0; @@ -1848,7 +2117,9 @@ afr_sh_entrylk (call_frame_t *frame, xlator_t *this, loc_t *loc, { afr_internal_lock_t *int_lock = NULL; afr_local_t *local = NULL; + afr_private_t *priv = NULL; + priv = this->private; local = frame->local; int_lock = &local->internal_lock; @@ -1860,7 +2131,12 @@ afr_sh_entrylk (call_frame_t *frame, xlator_t *this, loc_t *loc, int_lock->lk_basename = base_name; int_lock->lk_loc = loc; int_lock->lock_cbk = lock_cbk; + int_lock->domain = this->name; + int_lock->lockee_count = 0; + afr_init_entry_lockee (&int_lock->lockee[0], local, loc, + base_name, priv->child_count); + int_lock->lockee_count++; afr_nonblocking_entrylk (frame, this); return 0; @@ -1872,6 +2148,9 @@ afr_self_heal_parent_entrylk (call_frame_t *frame, xlator_t *this, { afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; + afr_internal_lock_t *int_lock = NULL; + int ret = -1; + int32_t op_errno = 0; local = frame->local; sh = &local->self_heal; @@ -1880,43 +2159,52 @@ afr_self_heal_parent_entrylk (call_frame_t *frame, xlator_t *this, "attempting to recreate missing entries for path=%s", local->loc.path); - GF_ASSERT (local->loc.parent); - afr_build_parent_loc (&sh->parent_loc, &local->loc); + 
ret = afr_build_parent_loc (&sh->parent_loc, &local->loc, &op_errno); + if (ret) + goto out; afr_sh_entrylk (frame, this, &sh->parent_loc, NULL, lock_cbk); return 0; -} - -static int -afr_self_heal_conflicting_entries (call_frame_t *frame, xlator_t *this) -{ - afr_self_heal_parent_entrylk (frame, this, - afr_sh_post_nb_entrylk_conflicting_sh_cbk); +out: + int_lock = &local->internal_lock; + int_lock->lock_op_ret = -1; + lock_cbk (frame, this); return 0; } static int -afr_self_heal_gfids (call_frame_t *frame, xlator_t *this) +afr_self_heal_missing_entries (call_frame_t *frame, xlator_t *this) { + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + + local = frame->local; + sh = &local->self_heal; + + sh->sh_type_in_action = AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY; + + afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED); + afr_self_heal_parent_entrylk (frame, this, - afr_sh_post_nb_entrylk_gfid_sh_cbk); + afr_sh_post_nb_entrylk_missing_entry_sh_cbk); return 0; } -afr_local_t *afr_local_copy (afr_local_t *l, xlator_t *this) +afr_local_t* +afr_self_heal_local_init (afr_local_t *l, xlator_t *this) { - afr_private_t *priv = NULL; - afr_local_t *lc = NULL; - afr_self_heal_t *sh = NULL; - afr_self_heal_t *shc = NULL; + afr_private_t *priv = NULL; + afr_local_t *lc = NULL; + afr_self_heal_t *sh = NULL; + afr_self_heal_t *shc = NULL; + int ret = 0; priv = this->private; sh = &l->self_heal; - lc = GF_CALLOC (1, sizeof (afr_local_t), - gf_afr_mt_afr_local_t); + lc = mem_get0 (this->local_pool); if (!lc) goto out; @@ -1929,17 +2217,27 @@ afr_local_t *afr_local_copy (afr_local_t *l, xlator_t *this) shc->do_data_self_heal = sh->do_data_self_heal; shc->do_metadata_self_heal = sh->do_metadata_self_heal; shc->do_entry_self_heal = sh->do_entry_self_heal; + shc->force_confirm_spb = sh->force_confirm_spb; shc->forced_merge = sh->forced_merge; - shc->data_lock_held = sh->data_lock_held; shc->background = sh->background; shc->type = sh->type; + shc->data_sh_info = ""; + 
shc->metadata_sh_info = ""; uuid_copy (shc->sh_gfid_req, sh->sh_gfid_req); - if (l->loc.path) - loc_copy (&lc->loc, &l->loc); + if (l->loc.path) { + ret = loc_copy (&lc->loc, &l->loc); + if (ret < 0) + goto out; + } lc->child_up = memdup (l->child_up, sizeof (*lc->child_up) * priv->child_count); + if (!lc->child_up) { + ret = -1; + goto out; + } + if (l->xattr_req) lc->xattr_req = dict_ref (l->xattr_req); @@ -1947,40 +2245,25 @@ afr_local_t *afr_local_copy (afr_local_t *l, xlator_t *this) lc->cont.lookup.inode = inode_ref (l->cont.lookup.inode); if (l->cont.lookup.xattr) lc->cont.lookup.xattr = dict_ref (l->cont.lookup.xattr); - if (l->internal_lock.inode_locked_nodes) - lc->internal_lock.inode_locked_nodes = - memdup (l->internal_lock.inode_locked_nodes, - sizeof (*lc->internal_lock.inode_locked_nodes) * priv->child_count); - else - lc->internal_lock.inode_locked_nodes = - GF_CALLOC (sizeof (*l->internal_lock.inode_locked_nodes), - priv->child_count, - gf_afr_mt_char); - if (l->internal_lock.entry_locked_nodes) - lc->internal_lock.entry_locked_nodes = - memdup (l->internal_lock.entry_locked_nodes, - sizeof (*lc->internal_lock.entry_locked_nodes) * priv->child_count); - else - lc->internal_lock.entry_locked_nodes = - GF_CALLOC (sizeof (*l->internal_lock.entry_locked_nodes), - priv->child_count, - gf_afr_mt_char); - if (l->internal_lock.locked_nodes) - lc->internal_lock.locked_nodes = - memdup (l->internal_lock.locked_nodes, - sizeof (*lc->internal_lock.locked_nodes) * priv->child_count); - else - lc->internal_lock.locked_nodes = - GF_CALLOC (sizeof (*l->internal_lock.locked_nodes), - priv->child_count, - gf_afr_mt_char); - lc->internal_lock.inodelk_lock_count = - l->internal_lock.inodelk_lock_count; - lc->internal_lock.entrylk_lock_count = - l->internal_lock.entrylk_lock_count; + lc->internal_lock.locked_nodes = + GF_CALLOC (sizeof (*l->internal_lock.locked_nodes), + priv->child_count, gf_afr_mt_char); + if (!lc->internal_lock.locked_nodes) { + ret = -1; + goto 
out; + } + + ret = afr_inodelk_init (&lc->internal_lock.inodelk[0], + this->name, priv->child_count); + if (ret) + goto out; out: + if (ret) { + afr_local_cleanup (lc, this); + lc = NULL; + } return lc; } @@ -1990,32 +2273,39 @@ afr_self_heal_completion_cbk (call_frame_t *bgsh_frame, xlator_t *this) afr_private_t * priv = NULL; afr_local_t * local = NULL; afr_self_heal_t * sh = NULL; + afr_local_t * orig_frame_local = NULL; + afr_self_heal_t * orig_frame_sh = NULL; char sh_type_str[256] = {0,}; - gf_boolean_t split_brain = _gf_false; + gf_loglevel_t loglevel = 0; priv = this->private; local = bgsh_frame->local; sh = &local->self_heal; - if (local->govinda_gOvinda) - split_brain = _gf_true; - - afr_set_split_brain (this, sh->inode, split_brain); + if (local->unhealable) { + afr_set_split_brain (this, sh->inode, SPB, SPB); + } afr_self_heal_type_str_get (sh, sh_type_str, sizeof(sh_type_str)); - if (sh->op_failed) { - gf_log (this->name, GF_LOG_ERROR, "background %s self-heal " - "failed on %s", sh_type_str, local->loc.path); + if (is_self_heal_failed (sh, AFR_CHECK_ALL) && !priv->shd.iamshd) { + loglevel = GF_LOG_ERROR; + } else if (!is_self_heal_failed (sh, AFR_CHECK_ALL)) { + loglevel = GF_LOG_INFO; } else { - gf_log (this->name, GF_LOG_INFO, "background %s self-heal " - "completed on %s", sh_type_str, local->loc.path); + loglevel = GF_LOG_DEBUG; } + afr_log_self_heal_completion_status (local, loglevel); + FRAME_SU_UNDO (bgsh_frame, afr_local_t); if (!sh->unwound && sh->unwind) { - sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno); + orig_frame_local = sh->orig_frame->local; + orig_frame_sh = &orig_frame_local->self_heal; + orig_frame_sh->actual_sh_started = _gf_true; + sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno, + is_self_heal_failed (sh, AFR_CHECK_ALL)); } if (sh->background) { @@ -2037,31 +2327,19 @@ afr_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode) afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; afr_private_t 
*priv = NULL; - int i = 0; - - call_frame_t *sh_frame = NULL; - afr_local_t *sh_local = NULL; + int32_t op_errno = 0; + int ret = 0; + afr_self_heal_t *orig_sh = NULL; + call_frame_t *sh_frame = NULL; + afr_local_t *sh_local = NULL; + loc_t *loc = NULL; local = frame->local; + orig_sh = &local->self_heal; priv = this->private; GF_ASSERT (local->loc.path); - if (local->self_heal.background) { - LOCK (&priv->lock); - { - if (priv->background_self_heals_started - < priv->background_self_heal_count) { - priv->background_self_heals_started++; - - - } else { - local->self_heal.background = _gf_false; - } - } - UNLOCK (&priv->lock); - } - gf_log (this->name, GF_LOG_TRACE, "performing self heal on %s (metadata=%d data=%d entry=%d)", local->loc.path, @@ -2069,71 +2347,105 @@ afr_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode) local->self_heal.do_data_self_heal, local->self_heal.do_entry_self_heal); + op_errno = ENOMEM; sh_frame = copy_frame (frame); - afr_set_lk_owner (sh_frame, this); + if (!sh_frame) + goto out; + afr_set_lk_owner (sh_frame, this, sh_frame->root); afr_set_low_priority (sh_frame); - sh_local = afr_local_copy (local, this); + sh_local = afr_self_heal_local_init (local, this); + if (!sh_local) + goto out; sh_frame->local = sh_local; sh = &sh_local->self_heal; sh->inode = inode_ref (inode); - sh->orig_frame = frame; sh->completion_cbk = afr_self_heal_completion_cbk; - sh->buf = GF_CALLOC (priv->child_count, sizeof (struct iatt), - gf_afr_mt_iatt); - sh->parentbufs = GF_CALLOC (priv->child_count, sizeof (struct iatt), - gf_afr_mt_iatt); - sh->child_errno = GF_CALLOC (priv->child_count, sizeof (int), - gf_afr_mt_int); sh->success = GF_CALLOC (priv->child_count, sizeof (*sh->success), gf_afr_mt_char); - sh->xattr = GF_CALLOC (priv->child_count, sizeof (dict_t *), - gf_afr_mt_dict_t); + if (!sh->success) + goto out; sh->sources = GF_CALLOC (sizeof (*sh->sources), priv->child_count, gf_afr_mt_int); + if (!sh->sources) + goto out; sh->locked_nodes = 
GF_CALLOC (sizeof (*sh->locked_nodes), priv->child_count, gf_afr_mt_int); + if (!sh->locked_nodes) + goto out; - sh->pending_matrix = GF_CALLOC (sizeof (int32_t *), priv->child_count, - gf_afr_mt_int32_t); + sh->pending_matrix = afr_matrix_create (priv->child_count, + priv->child_count); + if (!sh->pending_matrix) + goto out; - for (i = 0; i < priv->child_count; i++) { - sh->pending_matrix[i] = GF_CALLOC (sizeof (int32_t), - priv->child_count, - gf_afr_mt_int32_t); + sh->delta_matrix = afr_matrix_create (priv->child_count, + priv->child_count); + if (!sh->delta_matrix) + goto out; + + sh->fresh_parent_dirs = afr_children_create (priv->child_count); + if (!sh->fresh_parent_dirs) + goto out; + ret = afr_sh_common_create (sh, priv->child_count); + if (ret) { + op_errno = -ret; + goto out; } - sh->delta_matrix = GF_CALLOC (sizeof (int32_t *), priv->child_count, - gf_afr_mt_int32_t); - for (i = 0; i < priv->child_count; i++) { - sh->delta_matrix[i] = GF_CALLOC (sizeof (int32_t), - priv->child_count, - gf_afr_mt_int32_t); + if (local->self_heal.background) { + LOCK (&priv->lock); + { + if (priv->background_self_heals_started + < priv->background_self_heal_count) { + priv->background_self_heals_started++; + + + } else { + local->self_heal.background = _gf_false; + sh->background = _gf_false; + } + } + UNLOCK (&priv->lock); + } + + if (!local->loc.parent) { + sh->do_missing_entry_self_heal = _gf_false; + sh->do_gfid_self_heal = _gf_false; } - sh->success_children = afr_children_create (priv->child_count); - sh->fresh_children = afr_children_create (priv->child_count); - sh->fresh_parent_dirs = afr_children_create (priv->child_count); + sh->sh_type_in_action = AFR_SELF_HEAL_INVALID; FRAME_SU_DO (sh_frame, afr_local_t); - if (sh->do_missing_entry_self_heal) { - afr_self_heal_conflicting_entries (sh_frame, this); - } else if (sh->do_gfid_self_heal) { - GF_ASSERT (!uuid_is_null (sh->sh_gfid_req)); - afr_self_heal_gfids (sh_frame, this); + if (sh->do_missing_entry_self_heal || 
sh->do_gfid_self_heal) { + afr_self_heal_missing_entries (sh_frame, this); } else { + loc = &sh_local->loc; + if (uuid_is_null (loc->inode->gfid) && uuid_is_null (loc->gfid)) { + if (!uuid_is_null (inode->gfid)) + GF_ASSERT (!uuid_compare (inode->gfid, + sh->sh_gfid_req)); + uuid_copy (loc->gfid, sh->sh_gfid_req); + } gf_log (this->name, GF_LOG_TRACE, "proceeding to metadata check on %s", local->loc.path); afr_sh_missing_entries_done (sh_frame, this); } + op_errno = 0; +out: + if (op_errno) { + orig_sh->unwind (frame, this, -1, op_errno, 1); + if (sh_frame) + AFR_STACK_DESTROY (sh_frame); + } return 0; } @@ -2187,3 +2499,314 @@ afr_self_heal_type_for_transaction (afr_transaction_type type) } return sh_type; } + +int +afr_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name) +{ + int ret = -1; + uuid_t pargfid = {0}; + + if (!child) + goto out; + + if (!uuid_is_null (parent->inode->gfid)) + uuid_copy (pargfid, parent->inode->gfid); + else if (!uuid_is_null (parent->gfid)) + uuid_copy (pargfid, parent->gfid); + + if (uuid_is_null (pargfid)) + goto out; + + if (strcmp (parent->path, "/") == 0) + ret = gf_asprintf ((char **)&child->path, "/%s", name); + else + ret = gf_asprintf ((char **)&child->path, "%s/%s", parent->path, + name); + + if (-1 == ret) { + gf_log (this->name, GF_LOG_ERROR, + "asprintf failed while setting child path"); + } + + child->name = strrchr (child->path, '/'); + if (child->name) + child->name++; + + child->parent = inode_ref (parent->inode); + child->inode = inode_new (parent->inode->table); + uuid_copy (child->pargfid, pargfid); + + if (!child->inode) { + ret = -1; + goto out; + } + + ret = 0; +out: + if ((ret == -1) && child) + loc_wipe (child); + + return ret; +} + +int +afr_sh_erase_pending (call_frame_t *frame, xlator_t *this, + afr_transaction_type type, afr_fxattrop_cbk_t cbk, + int (*finish)(call_frame_t *frame, xlator_t *this)) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = 
NULL; + int call_count = 0; + int i = 0; + dict_t **erase_xattr = NULL; + int ret = -1; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + afr_sh_pending_to_delta (priv, sh->xattr, sh->delta_matrix, + sh->success, priv->child_count, type); + + erase_xattr = GF_CALLOC (sizeof (*erase_xattr), priv->child_count, + gf_afr_mt_dict_t); + if (!erase_xattr) + goto out; + + for (i = 0; i < priv->child_count; i++) { + if (sh->xattr[i]) { + call_count++; + erase_xattr[i] = dict_new (); + if (!erase_xattr[i]) + goto out; + } + } + + afr_sh_delta_to_xattr (this, sh->delta_matrix, erase_xattr, + priv->child_count, type); + + gf_log (this->name, GF_LOG_DEBUG, "Delta matrix for: %s", + lkowner_utoa (&frame->root->lk_owner)); + afr_sh_print_pending_matrix (sh->delta_matrix, this); + local->call_count = call_count; + if (call_count == 0) { + ret = 0; + finish (frame, this); + goto out; + } + + for (i = 0; i < priv->child_count; i++) { + if (!erase_xattr[i]) + continue; + + if (sh->healing_fd) {//true for ENTRY, reg file DATA transaction + STACK_WIND_COOKIE (frame, cbk, (void *) (long) i, + priv->children[i], + priv->children[i]->fops->fxattrop, + sh->healing_fd, + GF_XATTROP_ADD_ARRAY, erase_xattr[i], + NULL); + } else { + STACK_WIND_COOKIE (frame, cbk, (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->loc, + GF_XATTROP_ADD_ARRAY, erase_xattr[i], + NULL); + } + } + + ret = 0; +out: + if (erase_xattr) { + for (i = 0; i < priv->child_count; i++) { + if (erase_xattr[i]) { + dict_unref (erase_xattr[i]); + } + } + } + + GF_FREE (erase_xattr); + + if (ret < 0) { + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + finish (frame, this); + } + + return 0; +} + +void +afr_set_self_heal_status(afr_self_heal_t *sh, afr_self_heal_status status) +{ + xlator_t *this = NULL; + afr_sh_status_for_all_type *sh_status = &(sh->afr_all_sh_status); + afr_self_heal_type sh_type_in_action = sh->sh_type_in_action; + this = THIS; + + if 
(!sh) { + gf_log_callingfn (this->name, GF_LOG_ERROR, "Null self heal" + "Structure"); + goto out; + } + + switch (sh_type_in_action) { + case AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY: + sh_status->gfid_or_missing_entry_self_heal = status; + break; + case AFR_SELF_HEAL_METADATA: + sh_status->metadata_self_heal = status; + break; + case AFR_SELF_HEAL_DATA: + sh_status->data_self_heal = status; + break; + case AFR_SELF_HEAL_ENTRY: + sh_status->entry_self_heal = status; + break; + case AFR_SELF_HEAL_INVALID: + gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid" + "self heal type in action"); + break; + } +out: + return; +} + +void +afr_set_local_for_unhealable (afr_local_t *local) +{ + afr_self_heal_t *sh = NULL; + + sh = &local->self_heal; + + local->unhealable = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); +} + +int +is_self_heal_failed (afr_self_heal_t *sh, afr_sh_fail_check_type type) +{ + afr_sh_status_for_all_type sh_status = sh->afr_all_sh_status; + afr_self_heal_type sh_type_in_action = AFR_SELF_HEAL_INVALID; + afr_self_heal_status status = AFR_SELF_HEAL_FAILED; + xlator_t *this = NULL; + int sh_failed = 0; + + this = THIS; + + if (!sh) { + gf_log_callingfn (this->name, GF_LOG_ERROR, "Null self heal " + "structure"); + sh_failed = 1; + goto out; + } + + if (type == AFR_CHECK_ALL) { + if ((sh_status.gfid_or_missing_entry_self_heal == AFR_SELF_HEAL_FAILED) + || (sh_status.metadata_self_heal == AFR_SELF_HEAL_FAILED) + || (sh_status.data_self_heal == AFR_SELF_HEAL_FAILED) + || (sh_status.entry_self_heal == AFR_SELF_HEAL_FAILED)) + sh_failed = 1; + } else if (type == AFR_CHECK_SPECIFIC) { + sh_type_in_action = sh->sh_type_in_action; + switch (sh_type_in_action) { + case AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY: + status = sh_status.gfid_or_missing_entry_self_heal; + break; + case AFR_SELF_HEAL_METADATA: + status = sh_status.metadata_self_heal; + break; + case AFR_SELF_HEAL_ENTRY: + status = sh_status.entry_self_heal; + break; + case AFR_SELF_HEAL_DATA: + 
status = sh_status.data_self_heal; + break; + case AFR_SELF_HEAL_INVALID: + status = AFR_SELF_HEAL_NOT_ATTEMPTED; + break; + } + if (status == AFR_SELF_HEAL_FAILED) + sh_failed = 1; + + } + +out: + return sh_failed; +} + +char * +get_sh_completion_status (afr_self_heal_status status) +{ + + char *not_attempted = " is not attempted"; + char *failed = " failed"; + char *started = " is started"; + char *sync_begin = " is successfully completed"; + char *result = " has unknown status"; + + switch (status) + { + case AFR_SELF_HEAL_NOT_ATTEMPTED: + result = not_attempted; + break; + case AFR_SELF_HEAL_FAILED: + result = failed; + break; + case AFR_SELF_HEAL_STARTED: + result = started; + break; + case AFR_SELF_HEAL_SYNC_BEGIN: + result = sync_begin; + break; + } + + return result; + +} + +void +afr_log_self_heal_completion_status (afr_local_t *local, gf_loglevel_t loglvl) +{ + + char sh_log[4096] = {0}; + afr_self_heal_t *sh = &local->self_heal; + afr_sh_status_for_all_type all_status = sh->afr_all_sh_status; + xlator_t *this = NULL; + size_t off = 0; + int data_sh = 0; + int metadata_sh = 0; + int print_log = 0; + + this = THIS; + + ADD_FMT_STRING (sh_log, off, "gfid or missing entry", + all_status.gfid_or_missing_entry_self_heal, print_log); + ADD_FMT_STRING_SYNC (sh_log, off, "metadata", + all_status.metadata_self_heal, print_log); + if (sh->background) { + ADD_FMT_STRING_SYNC (sh_log, off, "backgroung data", + all_status.data_self_heal, print_log); + } else { + ADD_FMT_STRING_SYNC (sh_log, off, "foreground data", + all_status.data_self_heal, print_log); + } + ADD_FMT_STRING_SYNC (sh_log, off, "entry", all_status.entry_self_heal, + print_log); + + if (AFR_SELF_HEAL_SYNC_BEGIN == all_status.data_self_heal && + strcmp (sh->data_sh_info, "") && sh->data_sh_info ) + data_sh = 1; + if (AFR_SELF_HEAL_SYNC_BEGIN == all_status.metadata_self_heal && + strcmp (sh->metadata_sh_info, "") && sh->metadata_sh_info) + metadata_sh = 1; + + if (!print_log) + return; + + gf_log 
(this->name, loglvl, "%s %s %s on %s", sh_log, + ((data_sh == 1) ? sh->data_sh_info : ""), + ((metadata_sh == 1) ? sh->metadata_sh_info : ""), + local->loc.path); +} diff --git a/xlators/cluster/afr/src/afr-self-heal-common.h b/xlators/cluster/afr/src/afr-self-heal-common.h index 3df5f0a0a..473264776 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.h +++ b/xlators/cluster/afr/src/afr-self-heal-common.h @@ -1,53 +1,40 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ #ifndef __AFR_SELF_HEAL_COMMON_H__ #define __AFR_SELF_HEAL_COMMON_H__ #define FILE_HAS_HOLES(buf) (((buf)->ia_size) > ((buf)->ia_blocks * 512)) +#define AFR_SH_MIN_PARTICIPANTS 2 typedef enum { - AFR_SELF_HEAL_ENTRY, - AFR_SELF_HEAL_METADATA, - AFR_SELF_HEAL_DATA, - AFR_SELF_HEAL_INVALID = -1, -} afr_self_heal_type; + AFR_LOOKUP_FAIL_CONFLICTS = 1, + AFR_LOOKUP_FAIL_MISSING_GFIDS = 2, +} afr_lookup_flags_t; -typedef int -(*afr_lookup_cbk_t) (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, dict_t *xattr, - struct iatt *postparent); int afr_sh_select_source (int sources[], int child_count); int -afr_sh_sink_count (int sources[], int child_count); - -int afr_sh_source_count (int sources[], int child_count); void afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this); +void +afr_sh_print_split_brain_log (int32_t *pending_matrix[], xlator_t *this, + const char *loc); + int afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix, + unsigned char *ignorant_subvols, dict_t *xattr[], afr_transaction_type type, size_t child_count); @@ -57,18 +44,15 @@ afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr, int child_count, afr_transaction_type type); int -afr_mark_sources (int32_t *sources, int32_t **pending_matrix, struct iatt *bufs, - int32_t child_count, afr_self_heal_type type, - int32_t *valid_children, const char *xlator_name); +afr_mark_sources (xlator_t *this, int32_t *sources, int32_t **pending_matrix, + struct iatt *bufs, afr_self_heal_type type, + int32_t *success_children, int32_t *subvol_status); int -afr_sh_delta_to_xattr (afr_private_t *priv, +afr_sh_delta_to_xattr (xlator_t *this, int32_t *delta_matrix[], dict_t *xattr[], int child_count, afr_transaction_type type); -int -afr_sh_is_matrix_zero (int32_t *pending_matrix[], int child_count); - void afr_self_heal_type_str_get (afr_self_heal_t *self_heal_p, char *str, size_t size); @@ -77,9 +61,10 @@ 
afr_self_heal_type afr_self_heal_type_for_transaction (afr_transaction_type type); int -afr_build_sources (xlator_t *xlator, dict_t **xattr, struct iatt *bufs, +afr_build_sources (xlator_t *this, dict_t **xattr, struct iatt *bufs, int32_t **pending_matrix, int32_t *sources, - int32_t *success_children, afr_transaction_type type); + int32_t *success_children, afr_transaction_type type, + int32_t *subvol_status, gf_boolean_t ignore_ignorant); void afr_sh_common_reset (afr_self_heal_t *sh, unsigned int child_count); @@ -93,25 +78,26 @@ afr_sh_common_lookup_resp_handler (call_frame_t *frame, void *cookie, int afr_sh_common_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, - afr_lookup_cbk_t lookup_cbk, gf_boolean_t set_gfid); + afr_lookup_done_cbk_t lookup_cbk, uuid_t uuid, + int32_t flags, dict_t *xdata); int afr_sh_entry_expunge_remove (call_frame_t *expunge_frame, xlator_t *this, - int active_src, struct iatt *buf); + int active_src, struct iatt *buf, + struct iatt *parentbuf); int afr_sh_entrylk (call_frame_t *frame, xlator_t *this, loc_t *loc, char *base_name, afr_lock_cbk_t lock_cbk); int afr_sh_entry_impunge_create (call_frame_t *impunge_frame, xlator_t *this, - int child_index, struct iatt *buf, - struct iatt *postparent); + int child_index); int -afr_sh_data_unlock (call_frame_t *frame, xlator_t *this, +afr_sh_data_unlock (call_frame_t *frame, xlator_t *this, char *dom, afr_lock_cbk_t lock_cbk); afr_local_t * -afr_local_copy (afr_local_t *l, xlator_t *this); +afr_self_heal_local_init (afr_local_t *l, xlator_t *this); int afr_sh_data_lock (call_frame_t *frame, xlator_t *this, - off_t start, off_t len, + off_t start, off_t len, gf_boolean_t block, char *dom, afr_lock_cbk_t success_handler, afr_lock_cbk_t failure_handler); void @@ -121,5 +107,38 @@ afr_sh_mark_source_sinks (call_frame_t *frame, xlator_t *this); typedef int (*afr_fxattrop_cbk_t) (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *xattr); + dict_t 
*xattr, dict_t *xdata); +int +afr_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name); +int +afr_impunge_frame_create (call_frame_t *frame, xlator_t *this, + int active_source, call_frame_t **impunge_frame); +void +afr_sh_reset (call_frame_t *frame, xlator_t *this); + +void +afr_children_intersection_get (int32_t *set1, int32_t *set2, + int *intersection, unsigned int child_count); +int +afr_get_no_xattr_dir_read_child (xlator_t *this, int32_t *success_children, + struct iatt *bufs); +int +afr_sh_erase_pending (call_frame_t *frame, xlator_t *this, + afr_transaction_type type, afr_fxattrop_cbk_t cbk, + int (*finish)(call_frame_t *frame, xlator_t *this)); + +void +afr_set_local_for_unhealable (afr_local_t *local); + +int +is_self_heal_failed (afr_self_heal_t *sh, afr_sh_fail_check_type type); + +void +afr_set_self_heal_status (afr_self_heal_t *sh, afr_self_heal_status status); + +void +afr_log_self_heal_completion_status (afr_local_t *local, gf_loglevel_t logl); + +char* +afr_get_pending_matrix_str (int32_t *pending_matrix[], xlator_t *this); #endif /* __AFR_SELF_HEAL_COMMON_H__ */ diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index 216017cbb..9de26ee56 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #include <libgen.h> @@ -49,6 +40,14 @@ #include "afr-self-heal-common.h" #include "afr-self-heal-algorithm.h" +int +afr_sh_data_fail (call_frame_t *frame, xlator_t *this); + +static inline gf_boolean_t +afr_sh_data_proceed (unsigned int success_count) +{ + return (success_count >= AFR_SH_MIN_PARTICIPANTS); +} extern int sh_loop_finish (call_frame_t *loop_frame, xlator_t *this); @@ -63,15 +62,6 @@ int afr_sh_data_finish (call_frame_t *frame, xlator_t *this); int -afr_sh_data_fxattrop (call_frame_t *frame, xlator_t *this, - afr_fxattrop_cbk_t fxattrop_cbk); - -int -afr_post_sh_data_fxattrop_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *xattr); - -int afr_sh_data_done (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; @@ -88,7 +78,7 @@ afr_sh_data_done (call_frame_t *frame, xlator_t *this) int afr_sh_data_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t *local = NULL; afr_private_t *priv = NULL; @@ -101,7 +91,7 @@ afr_sh_data_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, LOCK (&frame->lock); { if (op_ret == -1) { - gf_log (this->name, GF_LOG_INFO, + gf_log (this->name, GF_LOG_ERROR, "flush failed on %s on subvolume %s: %s", local->loc.path, priv->children[child_index]->name, strerror (op_errno)); @@ -131,6 +121,11 @@ afr_sh_data_close (call_frame_t *frame, xlator_t *this) sh = &local->self_heal; priv = this->private; 
+ if (!sh->healing_fd) { + //This happens when file is non-reg + afr_sh_data_done (frame, this); + return 0; + } call_count = afr_set_elem_count_get (sh->success, priv->child_count); local->call_count = call_count; @@ -151,7 +146,7 @@ afr_sh_data_close (call_frame_t *frame, xlator_t *this) (void *) (long) i, priv->children[i], priv->children[i]->fops->flush, - sh->healing_fd); + sh->healing_fd, NULL); if (!--call_count) break; @@ -161,9 +156,28 @@ afr_sh_data_close (call_frame_t *frame, xlator_t *this) } int +afr_sh_dom_unlock (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + if (sh->sh_dom_lock_held) + afr_sh_data_unlock (frame, this, priv->sh_domain, + afr_sh_data_close); + else + afr_sh_data_close (frame, this); + return 0; +} + +int afr_sh_data_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *statpre, - struct iatt *statpost) + struct iatt *statpost, dict_t *xdata) { afr_local_t *local = NULL; @@ -195,29 +209,20 @@ afr_sh_data_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } int -afr_sh_data_setattr (call_frame_t *frame, xlator_t *this) +afr_sh_data_setattr (call_frame_t *frame, xlator_t *this, struct iatt* stbuf) { afr_local_t *local = NULL; afr_private_t *priv = NULL; afr_self_heal_t *sh = NULL; int i = 0; int call_count = 0; - int source = 0; int32_t valid = 0; - struct iatt stbuf = {0,}; local = frame->local; sh = &local->self_heal; priv = this->private; - source = sh->source; - - valid |= (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME); - - stbuf.ia_atime = sh->buf[source].ia_atime; - stbuf.ia_atime_nsec = sh->buf[source].ia_atime_nsec; - stbuf.ia_mtime = sh->buf[source].ia_mtime; - stbuf.ia_mtime_nsec = sh->buf[source].ia_mtime_nsec; + valid = (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME); call_count = afr_set_elem_count_get 
(sh->success, priv->child_count); @@ -237,7 +242,7 @@ afr_sh_data_setattr (call_frame_t *frame, xlator_t *this) (void *) (long) i, priv->children[i], priv->children[i]->fops->setattr, - &local->loc, &stbuf, valid); + &local->loc, stbuf, valid, NULL); if (!--call_count) break; @@ -249,7 +254,7 @@ afr_sh_data_setattr (call_frame_t *frame, xlator_t *this) int afr_sh_data_setattr_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iatt *buf) + struct iatt *buf, dict_t *xdata) { afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; @@ -259,9 +264,14 @@ afr_sh_data_setattr_fstat_cbk (call_frame_t *frame, void *cookie, sh = &local->self_heal; GF_ASSERT (sh->source == child_index); - if (op_ret != -1) + if (op_ret != -1) { sh->buf[child_index] = *buf; - afr_sh_data_setattr (frame, this); + afr_sh_data_setattr (frame, this, buf); + } else { + gf_log (this->name, GF_LOG_ERROR, "%s: Failed to set " + "time-stamps after self-heal", local->loc.path); + afr_sh_data_fail (frame, this); + } return 0; } @@ -286,37 +296,51 @@ afr_sh_set_timestamps (call_frame_t *frame, xlator_t *this) (void *) (long) sh->source, priv->children[sh->source], priv->children[sh->source]->fops->fstat, - sh->healing_fd); + sh->healing_fd, NULL); return 0; } //Fun fact, lock_cbk is being used for both lock & unlock int -afr_sh_data_unlock (call_frame_t *frame, xlator_t *this, +afr_sh_data_unlock (call_frame_t *frame, xlator_t *this, char *dom, afr_lock_cbk_t lock_cbk) { afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int ret = 0; local = frame->local; int_lock = &local->internal_lock; sh = &local->self_heal; + priv = this->private; - GF_ASSERT (sh->data_lock_held); - - sh->data_lock_held = _gf_false; + if (strcmp (dom, this->name) == 0) { + sh->data_lock_held = _gf_false; + } else if (strcmp (dom, priv->sh_domain) == 0) { + sh->sh_dom_lock_held = _gf_false; + } else { + ret = 
-1; + goto out; + } int_lock->lock_cbk = lock_cbk; + int_lock->domain = dom; afr_unlock (frame, this); +out: + if (ret) { + int_lock->lock_op_ret = -1; + int_lock->lock_cbk (frame, this); + } return 0; } int afr_sh_data_finish (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; local = frame->local; sh = &local->self_heal; @@ -325,9 +349,9 @@ afr_sh_data_finish (call_frame_t *frame, xlator_t *this) "finishing data selfheal of %s", local->loc.path); if (sh->data_lock_held) - afr_sh_data_unlock (frame, this, afr_sh_data_close); + afr_sh_data_unlock (frame, this, this->name, afr_sh_dom_unlock); else - afr_sh_data_close (frame, this); + afr_sh_dom_unlock (frame, this); return 0; } @@ -344,36 +368,49 @@ afr_sh_data_fail (call_frame_t *frame, xlator_t *this) gf_log (this->name, GF_LOG_DEBUG, "finishing failed data selfheal of %s", local->loc.path); - sh->op_failed = 1; - if (sh->data_lock_held) - afr_sh_data_unlock (frame, this, afr_sh_data_close); - else - afr_sh_data_close (frame, this); + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + afr_sh_data_finish (frame, this); return 0; } int afr_sh_data_erase_pending_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno, dict_t *xattr) + int32_t op_errno, dict_t *xattr, dict_t *xdata) { int call_count = 0; afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int32_t child_index = (long) cookie; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + if (op_ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "Erasing of pending change " + "log failed on %s for subvol %s, reason: %s", + local->loc.path, priv->children[child_index]->name, + strerror (op_errno)); + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + } call_count = afr_frame_return (frame); if (call_count == 0) { - local = frame->local; - sh = &local->self_heal; 
- if (NULL == sh->old_loop_frame) { - GF_ASSERT (sh->data_lock_held); - afr_sh_data_fxattrop (frame, this, - afr_post_sh_data_fxattrop_cbk); + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { + if (sh->old_loop_frame) + sh_loop_finish (sh->old_loop_frame, this); + sh->old_loop_frame = NULL; + afr_sh_data_fail (frame, this); goto out; } - - afr_sh_data_lock (frame, this, 0, 0, + if (!IA_ISREG (sh->type)) { + afr_sh_data_finish (frame, this); + goto out; + } + GF_ASSERT (sh->old_loop_frame); + afr_sh_data_lock (frame, this, 0, 0, _gf_true, this->name, afr_post_sh_big_lock_success, afr_post_sh_big_lock_failure); } @@ -384,74 +421,95 @@ out: int afr_sh_data_erase_pending (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int i = 0; - dict_t **erase_xattr = NULL; + afr_sh_erase_pending (frame, this, AFR_DATA_TRANSACTION, + afr_sh_data_erase_pending_cbk, + afr_sh_data_finish); + return 0; +} + +int +afr_sh_data_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_self_heal_t *sh = NULL; + int call_count = 0; + int child_index = (long) cookie; local = frame->local; - sh = &local->self_heal; priv = this->private; + sh = &local->self_heal; + + if (op_ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "%s: Failed to fsync on " + "%s - %s", local->loc.path, + priv->children[child_index]->name, strerror (op_errno)); + LOCK (&frame->lock); + { + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + } + UNLOCK (&frame->lock); + if (sh->old_loop_frame) + sh_loop_finish (sh->old_loop_frame, this); + sh->old_loop_frame = NULL; + } - afr_sh_pending_to_delta (priv, sh->xattr, sh->delta_matrix, sh->success, - priv->child_count, AFR_DATA_TRANSACTION); - gf_log (this->name, GF_LOG_DEBUG, "Delta matrix for: %"PRIu64, - 
frame->root->lk_owner); - afr_sh_print_pending_matrix (sh->delta_matrix, this); + call_count = afr_frame_return (frame); + if (call_count == 0) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) + afr_sh_data_fail (frame, this); + else + afr_sh_data_erase_pending (frame, this); + } + return 0; +} - erase_xattr = GF_CALLOC (sizeof (*erase_xattr), priv->child_count, - gf_afr_mt_dict_t); +/* + * Before erasing xattrs, make sure the data is written to disk + */ +int +afr_sh_data_fsync (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_self_heal_t *sh = NULL; + int i = 0; + int call_count = 0; - for (i = 0; i < priv->child_count; i++) { - if (sh->xattr[i]) { - call_count++; + local = frame->local; + priv = this->private; + sh = &local->self_heal; - erase_xattr[i] = get_new_dict(); - dict_ref (erase_xattr[i]); - } + call_count = sh->active_sinks; + if (call_count == 0) { + afr_sh_data_erase_pending (frame, this); + return 0; } - afr_sh_delta_to_xattr (priv, sh->delta_matrix, erase_xattr, - priv->child_count, AFR_DATA_TRANSACTION); - - GF_ASSERT (call_count); local->call_count = call_count; for (i = 0; i < priv->child_count; i++) { - if (!erase_xattr[i]) + if (!sh->success[i] || sh->sources[i]) continue; - gf_log (this->name, GF_LOG_DEBUG, - "erasing pending flags from %s on %s", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_sh_data_erase_pending_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fxattrop, - sh->healing_fd, - GF_XATTROP_ADD_ARRAY, erase_xattr[i]); - if (!--call_count) - break; - } - - for (i = 0; i < priv->child_count; i++) { - if (erase_xattr[i]) { - dict_unref (erase_xattr[i]); - } + STACK_WIND_COOKIE (frame, afr_sh_data_fsync_cbk, + (void *) (long) i, priv->children[i], + priv->children[i]->fops->fsync, + sh->healing_fd, 1, NULL); } - GF_FREE (erase_xattr); return 0; } - static struct afr_sh_algorithm * sh_algo_from_name (xlator_t *this, 
char *name) { int i = 0; + if (name == NULL) + goto out; + while (afr_self_heal_algorithms[i].name) { if (!strcmp (name, afr_self_heal_algorithms[i].name)) { return &afr_self_heal_algorithms[i]; @@ -460,17 +518,22 @@ sh_algo_from_name (xlator_t *this, char *name) i++; } +out: return NULL; } static int -sh_zero_byte_files_exist (afr_self_heal_t *sh, int child_count) +sh_zero_byte_files_exist (afr_local_t *local, int child_count) { - int i; - int ret = 0; + int i = 0; + int ret = 0; + afr_self_heal_t *sh = NULL; + sh = &local->self_heal; for (i = 0; i < child_count; i++) { + if (!local->child_up[i] || sh->child_errno[i]) + continue; if (sh->buf[i].ia_size == 0) { ret = 1; break; @@ -497,8 +560,7 @@ afr_sh_data_pick_algo (call_frame_t *frame, xlator_t *this) if (algo == NULL) { /* option not set, so fall back on heuristics */ - if ((local->enoent_count != 0) - || sh_zero_byte_files_exist (sh, priv->child_count) + if (sh_zero_byte_files_exist (local, priv->child_count) || (sh->file_size <= (priv->data_self_heal_window_size * this->ctx->page_size))) { @@ -536,11 +598,12 @@ afr_sh_data_sync_prepare (call_frame_t *frame, xlator_t *this) local = frame->local; sh = &local->self_heal; - sh->algo_completion_cbk = afr_sh_data_erase_pending; + sh->algo_completion_cbk = afr_sh_data_fsync; sh->algo_abort_cbk = afr_sh_data_fail; sh_algo = afr_sh_data_pick_algo (frame, this); + sh->algo = sh_algo; sh_algo->fn (frame, this); return 0; @@ -549,38 +612,46 @@ afr_sh_data_sync_prepare (call_frame_t *frame, xlator_t *this) int afr_sh_data_trim_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) + struct iatt *postbuf, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; int call_count = 0; int child_index = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; - priv = this->private; + priv = this->private; local = frame->local; + sh = 
&local->self_heal; child_index = (long) cookie; LOCK (&frame->lock); { - if (op_ret == -1) - gf_log (this->name, GF_LOG_INFO, + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, "ftruncate of %s on subvolume %s failed (%s)", local->loc.path, priv->children[child_index]->name, strerror (op_errno)); - else + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + } else { gf_log (this->name, GF_LOG_DEBUG, "ftruncate of %s on subvolume %s completed", local->loc.path, priv->children[child_index]->name); + } } UNLOCK (&frame->lock); call_count = afr_frame_return (frame); - if (call_count == 0) - afr_sh_data_sync_prepare (frame, this); + if (call_count == 0) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) + afr_sh_data_fail (frame, this); + else + afr_sh_data_sync_prepare (frame, this); + } return 0; } @@ -614,7 +685,8 @@ afr_sh_data_trim_sinks (call_frame_t *frame, xlator_t *this) (void *) (long) i, priv->children[i], priv->children[i]->fops->ftruncate, - sh->healing_fd, sh->file_size); + sh->healing_fd, sh->file_size, + NULL); if (!--call_count) break; @@ -628,6 +700,7 @@ afr_sh_inode_set_read_ctx (afr_self_heal_t *sh, xlator_t *this) { afr_private_t *priv = NULL; int ret = 0; + int i = 0; priv = this->private; sh->source = afr_sh_select_source (sh->sources, priv->child_count); @@ -636,6 +709,15 @@ afr_sh_inode_set_read_ctx (afr_self_heal_t *sh, xlator_t *this) goto out; } + /* detect changes not visible through pending flags -- JIC */ + for (i = 0; i < priv->child_count; i++) { + if (i == sh->source || sh->child_errno[i]) + continue; + + if (SIZE_DIFFERS (&sh->buf[i], &sh->buf[sh->source])) + sh->sources[i] = 0; + } + afr_reset_children (sh->fresh_children, priv->child_count); afr_get_fresh_children (sh->success_children, sh->sources, sh->fresh_children, priv->child_count); @@ -645,72 +727,211 @@ out: return ret; } -int -afr_sh_data_fix (call_frame_t *frame, xlator_t *this) +char* +afr_get_sizes_str (afr_local_t *local, struct iatt *bufs, xlator_t *this) { 
- afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int nsources = 0; - int source = 0; - int i = 0; - int ret = 0; + afr_private_t *priv = NULL; + int i = 0; + char num[1024] = {0}; + size_t len = 0; + char *sizes_str = NULL; + size_t off = 0; + char *fmt_str = "%llu bytes on %s, "; + char *child_down = " %s,"; + char *child_unknown = " %s,"; + int down_child_present = 0; + int down_count = 0; + int unknown_count = 0; + int unknown_child_present = 0; + char *down_subvol_1 = " down subvolume is "; + char *unknown_subvol_1 = " unknown subvolume is "; + char *down_subvol_2 = " down subvolumes are "; + char *unknown_subvol_2 = " unknown subvolumes are "; - local = frame->local; - sh = &local->self_heal; priv = this->private; - gf_log (this->name, GF_LOG_DEBUG, "Pending matrix for: %"PRIu64, - frame->root->lk_owner); - nsources = afr_build_sources (this, sh->xattr, sh->buf, sh->pending_matrix, - sh->sources, sh->success_children, - AFR_DATA_TRANSACTION); - if (nsources == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "No self-heal needed for %s", - local->loc.path); + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i] == 1) { + len += snprintf (num, sizeof (num), fmt_str, + (unsigned long long) bufs[i].ia_size, + priv->children[i]->name); + } else if (local->child_up[i] == 0) { + len += snprintf (num, sizeof (num), child_down, + priv->children[i]->name); + if (!down_child_present) + down_child_present = 1; + down_count ++; + } else if (local->child_up[i] == -1) { + len += snprintf (num, sizeof (num), child_unknown, + priv->children[i]->name); + if (!unknown_child_present) + unknown_child_present = 1; + unknown_count++; + } - afr_sh_data_finish (frame, this); - return 0; } - if ((nsources == -1) - && (priv->favorite_child != -1) - && (sh->child_errno[priv->favorite_child] == 0)) { + if (down_child_present) { + if (down_count > 1) + len += snprintf (num, sizeof (num), "%s", + down_subvol_2); + else + len += snprintf 
(num, sizeof (num), "%s", + down_subvol_1); + } + if (unknown_child_present) { + if (unknown_count > 1) + len += snprintf (num, sizeof (num), "%s", + unknown_subvol_2); + else + len += snprintf (num, sizeof (num), "%s", + unknown_subvol_1); + } - gf_log (this->name, GF_LOG_DEBUG, - "Picking favorite child %s as authentic source to " - "resolve conflicting data of %s", - priv->children[priv->favorite_child]->name, - local->loc.path); + len++;//for '\0' - sh->sources[priv->favorite_child] = 1; + sizes_str = GF_CALLOC (len, sizeof (char), gf_common_mt_char); - nsources = afr_sh_source_count (sh->sources, - priv->child_count); + if (!sizes_str) + return NULL; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i] == 1) { + off += snprintf (sizes_str + off, len - off, fmt_str, + (unsigned long long) bufs[i].ia_size, + priv->children[i]->name); + } } - if (nsources == -1) { - gf_log (this->name, GF_LOG_ERROR, - "Unable to self-heal contents of '%s' (possible " - "split-brain). 
Please delete the file from all but " - "the preferred subvolume.", local->loc.path); + if (down_child_present) { + if (down_count > 1) { + off += snprintf (sizes_str + off, len - off, "%s", + down_subvol_2); + } else { + off += snprintf (sizes_str + off, len - off, "%s", + down_subvol_1); + } + } - local->govinda_gOvinda = 1; + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i] == 0) { + off += snprintf (sizes_str + off, len - off, child_down, + priv->children[i]->name); + } + } - afr_sh_data_fail (frame, this); - return 0; + if (unknown_child_present) { + if (unknown_count > 1) { + off += snprintf (sizes_str + off, len - off, "%s", + unknown_subvol_2); + } else { + off += snprintf (sizes_str + off, len - off, "%s", + unknown_subvol_1); + } } - ret = afr_sh_inode_set_read_ctx (sh, this); - if (ret) { - gf_log (this->name, GF_LOG_DEBUG, - "No active sources found."); + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i] == -1) { + off += snprintf (sizes_str + off, len - off, + child_unknown, + priv->children[i]->name); - afr_sh_data_fail (frame, this); - return 0; + } + } + + return sizes_str; +} + +char* +afr_get_sinks_str (xlator_t *this, afr_local_t *local, afr_self_heal_t *sh) +{ + afr_private_t *priv = NULL; + int i = 0; + char num[1024] = {0}; + size_t len = 0; + char *sinks_str = NULL; + char *temp_str = " to sinks "; + char *str_format = " %s,"; + char off = 0; + + priv = this->private; + + len += snprintf (num, sizeof (num), "%s", temp_str); + for (i = 0; i < priv->child_count; i++) { + if ((sh->sources[i] == 0) && (local->child_up[i] == 1)) { + len += snprintf (num, sizeof (num), str_format, + priv->children[i]->name); + } + } + + len ++; + + sinks_str = GF_CALLOC (len, sizeof (char), gf_common_mt_char); + + if (!sinks_str) + return NULL; + + off += snprintf (sinks_str + off, len - off, "%s", temp_str); + + for (i = 0; i < priv->child_count; i++) { + if ((sh->sources[i] == 0) && (local->child_up[i] == 1)) { + off += snprintf 
(sinks_str + off, len - off, + str_format, + priv->children[i]->name); + } } + return sinks_str; + +} + + +void +afr_set_data_sh_info_str (afr_local_t *local, afr_self_heal_t *sh, xlator_t *this) +{ + char *pending_matrix_str = NULL; + char *sizes_str = NULL; + char *sinks_str = NULL; + afr_private_t *priv = NULL; + + priv = this->private; + + pending_matrix_str = afr_get_pending_matrix_str (sh->pending_matrix, + this); + if (!pending_matrix_str) + pending_matrix_str = ""; + + sizes_str = afr_get_sizes_str (local, sh->buf, this); + if (!sizes_str) + sizes_str = ""; + + sinks_str = afr_get_sinks_str (this, local, sh); + if (!sinks_str) + sinks_str = ""; + + gf_asprintf (&sh->data_sh_info, " data self heal from %s %s with " + "%s data %s", priv->children[sh->source]->name, sinks_str, + sizes_str, pending_matrix_str); + + if (pending_matrix_str && strcmp (pending_matrix_str, "")) + GF_FREE (pending_matrix_str); + + if (sizes_str && strcmp (sizes_str, "")) + GF_FREE (sizes_str); +} + +void +afr_sh_data_fix (call_frame_t *frame, xlator_t *this) +{ + int source = 0; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + source = sh->source; sh->block_size = this->ctx->page_size; sh->file_size = sh->buf[source].ia_size; @@ -718,17 +939,9 @@ afr_sh_data_fix (call_frame_t *frame, xlator_t *this) if (FILE_HAS_HOLES (&sh->buf[source])) sh->file_has_holes = 1; - /* detect changes not visible through pending flags -- JIC */ - for (i = 0; i < priv->child_count; i++) { - if (i == source || sh->child_errno[i]) - continue; - - if (SIZE_DIFFERS (&sh->buf[i], &sh->buf[source])) - sh->sources[i] = 0; - } - - if (sh->background && sh->unwind) { - sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno); + if (sh->background && sh->unwind && !sh->unwound) { + sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno, + is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)); 
sh->unwound = _gf_true; } @@ -738,70 +951,123 @@ afr_sh_data_fix (call_frame_t *frame, xlator_t *this) "no active sinks for performing self-heal on file %s", local->loc.path); afr_sh_data_finish (frame, this); - return 0; + return; } gf_log (this->name, GF_LOG_DEBUG, "self-healing file %s from subvolume %s to %d other", local->loc.path, priv->children[sh->source]->name, sh->active_sinks); - afr_sh_data_trim_sinks (frame, this); - return 0; + sh->actual_sh_started = _gf_true; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_SYNC_BEGIN); + afr_sh_data_trim_sinks (frame, this); } -static void -afr_destroy_pending_matrix (int32_t **pending_matrix, int32_t child_count) +int +afr_sh_data_fxattrop_fstat_done (call_frame_t *frame, xlator_t *this) { + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int nsources = 0; + int ret = 0; + int *old_sources = NULL; + int tstamp_source = 0; int i = 0; - GF_ASSERT (child_count > 0); - if (pending_matrix) { - for (i = 0; i < child_count; i++) { - if (pending_matrix[i]) - GF_FREE (pending_matrix[i]); - } - GF_FREE (pending_matrix); + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + gf_log (this->name, GF_LOG_DEBUG, "Pending matrix for: %s", + lkowner_utoa (&frame->root->lk_owner)); + if (sh->sync_done) { + //store sources before sync so that mtime can be set using the + //iatt buf from one of them. 
+ old_sources = alloca (priv->child_count*sizeof (*old_sources)); + memcpy (old_sources, sh->sources, + priv->child_count * sizeof (*old_sources)); } -} -static int32_t** -afr_create_pending_matrix (int32_t child_count) -{ - gf_boolean_t cleanup = _gf_false; - int32_t **pending_matrix = NULL; - int i = 0; + nsources = afr_build_sources (this, sh->xattr, sh->buf, sh->pending_matrix, + sh->sources, sh->success_children, + AFR_DATA_TRANSACTION, NULL, _gf_true); + if ((nsources == -1) + && (priv->favorite_child != -1) + && (sh->child_errno[priv->favorite_child] == 0)) { - GF_ASSERT (child_count > 0); + gf_log (this->name, GF_LOG_DEBUG, + "Picking favorite child %s as authentic source to " + "resolve conflicting data of %s", + priv->children[priv->favorite_child]->name, + local->loc.path); - pending_matrix = GF_CALLOC (sizeof (*pending_matrix), child_count, - gf_afr_mt_int32_t); - if (NULL == pending_matrix) - goto out; - for (i = 0; i < child_count; i++) { - pending_matrix[i] = GF_CALLOC (sizeof (**pending_matrix), - child_count, - gf_afr_mt_int32_t); - if (NULL == pending_matrix[i]) { - cleanup = _gf_true; - goto out; - } + sh->sources[priv->favorite_child] = 1; + + nsources = afr_sh_source_count (sh->sources, + priv->child_count); } -out: - if (_gf_true == cleanup) { - afr_destroy_pending_matrix (pending_matrix, child_count); - pending_matrix = NULL; + + if (nsources == -1) { + afr_sh_print_split_brain_log (sh->pending_matrix, this, + local->loc.path); + afr_set_split_brain (this, sh->inode, DONT_KNOW, SPB); + + afr_sh_data_fail (frame, this); + return 0; } - return pending_matrix; + + afr_set_split_brain (this, sh->inode, DONT_KNOW, NO_SPB); + + ret = afr_sh_inode_set_read_ctx (sh, this); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, + "No active sources found."); + + afr_sh_data_fail (frame, this); + return 0; + } + + if (sh->sync_done) { + /* Perform setattr from one of the old_sources if possible + * Because only they have the correct mtime, the new sources + 
* (i.e. old sinks) have mtime from last writev in sync. + */ + tstamp_source = sh->source; + for (i = 0; i < priv->child_count; i++) { + if (old_sources[i] && sh->sources[i]) + tstamp_source = i; + } + afr_sh_data_setattr (frame, this, &sh->buf[tstamp_source]); + } else { + afr_set_data_sh_info_str (local, sh, this); + if (nsources == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "No self-heal needed for %s", + local->loc.path); + + afr_sh_data_finish (frame, this); + return 0; + } + + if (sh->do_data_self_heal && + afr_data_self_heal_enabled (priv->data_self_heal)) + afr_sh_data_fix (frame, this); + else + afr_sh_data_finish (frame, this); + } + return 0; } int afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local, dict_t **xattr, - afr_transaction_type txn_type) + afr_transaction_type txn_type, + uuid_t gfid) { afr_private_t *priv = NULL; int read_child = -1; - int ret = -1; int32_t **pending_matrix = NULL; int32_t *sources = NULL; int32_t *success_children = NULL; @@ -809,26 +1075,41 @@ afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local, int32_t nsources = 0; int32_t prev_read_child = -1; int32_t config_read_child = -1; + int32_t subvol_status = 0; priv = this->private; bufs = local->cont.lookup.bufs; success_children = local->cont.lookup.success_children; - pending_matrix = afr_create_pending_matrix (priv->child_count); - if (NULL == pending_matrix) - goto out; - - sources = GF_CALLOC (sizeof (*sources), priv->child_count, - gf_afr_mt_int32_t); - if (NULL == sources) - goto out; + pending_matrix = local->cont.lookup.pending_matrix; + sources = local->cont.lookup.sources; + memset (sources, 0, sizeof (*sources) * priv->child_count); nsources = afr_build_sources (this, xattr, bufs, pending_matrix, - sources, success_children, txn_type); - if (nsources < 0) { - ret = -1; - goto out; + sources, success_children, txn_type, + &subvol_status, _gf_false); + if (subvol_status & SPLIT_BRAIN) { + gf_log (this->name, 
GF_LOG_DEBUG, "%s: Possible split-brain", + local->loc.path); + switch (txn_type) { + case AFR_DATA_TRANSACTION: + local->cont.lookup.possible_spb = _gf_true; + nsources = 1; + sources[success_children[0]] = 1; + break; + case AFR_ENTRY_TRANSACTION: + read_child = afr_get_no_xattr_dir_read_child (this, + success_children, + bufs); + sources[read_child] = 1; + nsources = 1; + break; + default: + break; + } } + if (nsources < 0) + goto out; prev_read_child = local->read_child_index; config_read_child = priv->read_child; @@ -836,23 +1117,18 @@ afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local, priv->child_count, prev_read_child, config_read_child, - sources); - ret = 0; - local->cont.lookup.sources = sources; + sources, + priv->hash_mode, gfid); out: - afr_destroy_pending_matrix (pending_matrix, priv->child_count); - if (-1 == ret) { - if (sources) - GF_FREE (sources); - } - gf_log (this->name, GF_LOG_DEBUG, "returning read_child: %d", read_child); + gf_log (this->name, GF_LOG_DEBUG, "returning read_child: %d", + read_child); return read_child; } int afr_sh_data_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iatt *buf) + struct iatt *buf, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; @@ -875,6 +1151,12 @@ afr_sh_data_fstat_cbk (call_frame_t *frame, void *cookie, sh->buf[child_index] = *buf; sh->success_children[sh->success_count] = child_index; sh->success_count++; + } else { + gf_log (this->name, GF_LOG_ERROR, "%s: fstat failed " + "on %s, reason %s", local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + sh->child_errno[child_index] = op_errno; } } UNLOCK (&frame->lock); @@ -882,9 +1164,20 @@ afr_sh_data_fstat_cbk (call_frame_t *frame, void *cookie, call_count = afr_frame_return (frame); if (call_count == 0) { - afr_sh_data_fix (frame, this); + /* Previous versions of glusterfs might have set + * the pending data xattrs which need 
to be erased + */ + if (!afr_sh_data_proceed (sh->success_count)) { + gf_log (this->name, GF_LOG_ERROR, "inspecting metadata " + "succeeded on < %d children, aborting " + "self-heal for %s", AFR_SH_MIN_PARTICIPANTS, + local->loc.path); + afr_sh_data_fail (frame, this); + goto out; + } + afr_sh_data_fxattrop_fstat_done (frame, this); } - +out: return 0; } @@ -895,33 +1188,41 @@ afr_sh_data_fstat (call_frame_t *frame, xlator_t *this) afr_self_heal_t *sh = NULL; afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = 0; - int i = 0; + int call_count = 0; + int i = 0; + int child = 0; + int32_t *fstat_children = NULL; priv = this->private; local = frame->local; sh = &local->self_heal; - call_count = afr_up_children_count (local->child_up, - priv->child_count); - + fstat_children = memdup (sh->success_children, + sizeof (*fstat_children) * priv->child_count); + if (!fstat_children) { + afr_sh_data_fail (frame, this); + goto out; + } + call_count = sh->success_count; local->call_count = call_count; + memset (sh->buf, 0, sizeof (*sh->buf) * priv->child_count); afr_reset_children (sh->success_children, priv->child_count); sh->success_count = 0; for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_sh_data_fstat_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fstat, - sh->healing_fd); - - if (!--call_count) - break; - } + child = fstat_children[i]; + if (child == -1) + break; + STACK_WIND_COOKIE (frame, afr_sh_data_fstat_cbk, + (void *) (long) child, + priv->children[child], + priv->children[child]->fops->fstat, + sh->healing_fd, NULL); + --call_count; } - + GF_ASSERT (!call_count); +out: + GF_FREE (fstat_children); return 0; } @@ -950,73 +1251,60 @@ afr_sh_common_fxattrop_resp_handler (call_frame_t *frame, void *cookie, sh->xattr[child_index] = dict_ref (xattr); sh->success_children[sh->success_count] = child_index; sh->success_count++; + } else { + gf_log (this->name, GF_LOG_ERROR, 
"fxattrop of %s " + "failed on %s, reason %s", local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + sh->child_errno[child_index] = op_errno; } } UNLOCK (&frame->lock); } int -afr_post_sh_data_fxattrop_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *xattr) +afr_sh_data_fxattrop_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *xattr, dict_t *xdata) { int call_count = -1; - int ret = 0; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - afr_sh_common_fxattrop_resp_handler (frame, cookie, this, op_ret, - op_errno, xattr); + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; local = frame->local; - sh = &local->self_heal; - call_count = afr_frame_return (frame); - if (call_count == 0) { - (void) afr_build_sources (this, sh->xattr, NULL, - sh->pending_matrix, - sh->sources, sh->success_children, - AFR_DATA_TRANSACTION); - ret = afr_sh_inode_set_read_ctx (sh, this); - if (ret) - afr_sh_data_fail (frame, this); - else - afr_sh_set_timestamps (frame, this); - } - - return 0; -} - -int -afr_sh_data_fxattrop_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *xattr) -{ - int call_count = -1; + sh = &local->self_heal; afr_sh_common_fxattrop_resp_handler (frame, cookie, this, op_ret, op_errno, xattr); call_count = afr_frame_return (frame); if (call_count == 0) { + if (!afr_sh_data_proceed (sh->success_count)) { + gf_log (this->name, GF_LOG_ERROR, "%s, inspecting " + "change log succeeded on < %d children", + local->loc.path, AFR_SH_MIN_PARTICIPANTS); + afr_sh_data_fail (frame, this); + goto out; + } afr_sh_data_fstat (frame, this); } - +out: return 0; } int -afr_sh_data_fxattrop (call_frame_t *frame, xlator_t *this, - afr_fxattrop_cbk_t fxattrop_cbk) +afr_sh_data_fxattrop (call_frame_t *frame, xlator_t *this) { afr_self_heal_t *sh = NULL; afr_local_t *local = NULL; afr_private_t 
*priv = NULL; - dict_t *xattr_req = NULL; + dict_t **xattr_req; int32_t *zero_pending = NULL; int call_count = 0; int i = 0; int ret = 0; + int j; priv = this->private; local = frame->local; @@ -1027,42 +1315,53 @@ afr_sh_data_fxattrop (call_frame_t *frame, xlator_t *this, local->call_count = call_count; - xattr_req = dict_new(); - if (!xattr_req) { - ret = -1; - goto out; - } - - for (i = 0; i < priv->child_count; i++) { - zero_pending = GF_CALLOC (3, sizeof (*zero_pending), - gf_afr_mt_int32_t); - if (!zero_pending) { - ret = -1; - goto out; - } - ret = dict_set_dynptr (xattr_req, priv->pending_key[i], - zero_pending, - 3 * sizeof (*zero_pending)); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Unable to set dict value"); - goto out; - } else { - zero_pending = NULL; - } - } + xattr_req = GF_CALLOC(priv->child_count, sizeof(struct dict_t *), + gf_afr_mt_dict_t); + if (!xattr_req) + goto out; + + for (i = 0; i < priv->child_count; i++) { + xattr_req[i] = dict_new(); + if (!xattr_req[i]) { + ret = -1; + goto out; + } + } + + for (i = 0; i < priv->child_count; i++) { + for (j = 0; j < priv->child_count; j++) { + zero_pending = GF_CALLOC (3, sizeof (*zero_pending), + gf_afr_mt_int32_t); + if (!zero_pending) { + ret = -1; + goto out; + } + ret = dict_set_dynptr (xattr_req[i], priv->pending_key[j], + zero_pending, + 3 * sizeof (*zero_pending)); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "Unable to set dict value"); + goto out; + } else { + zero_pending = NULL; + } + } + } afr_reset_xattr (sh->xattr, priv->child_count); afr_reset_children (sh->success_children, priv->child_count); + memset (sh->child_errno, 0, + sizeof (*sh->child_errno) * priv->child_count); sh->success_count = 0; for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, fxattrop_cbk, + STACK_WIND_COOKIE (frame, afr_sh_data_fxattrop_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->fxattrop, sh->healing_fd, 
GF_XATTROP_ADD_ARRAY, - xattr_req); + xattr_req[i], NULL); if (!--call_count) break; @@ -1070,14 +1369,16 @@ afr_sh_data_fxattrop (call_frame_t *frame, xlator_t *this, } out: - if (xattr_req) - dict_unref (xattr_req); + if (xattr_req) { + for (i = 0; i < priv->child_count; i++) + if (xattr_req[i]) + dict_unref(xattr_req[i]); + GF_FREE(xattr_req); + } if (ret) { - if (zero_pending) - GF_FREE (zero_pending); - sh->op_failed = 1; - afr_sh_data_done (frame, this); + GF_FREE (zero_pending); + afr_sh_data_fail (frame, this); } return 0; @@ -1093,7 +1394,23 @@ afr_sh_data_big_lock_success (call_frame_t *frame, xlator_t *this) sh = &local->self_heal; sh->data_lock_held = _gf_true; - afr_sh_data_fxattrop (frame, this, afr_sh_data_fxattrop_cbk); + afr_sh_data_fxattrop (frame, this); + return 0; +} + +int +afr_sh_dom_lock_success (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + + local = frame->local; + sh = &local->self_heal; + + sh->sh_dom_lock_held = _gf_true; + afr_sh_data_lock (frame, this, 0, 0, _gf_true, this->name, + afr_sh_data_big_lock_success, + afr_sh_data_fail); return 0; } @@ -1110,14 +1427,16 @@ afr_sh_data_post_blocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) if (int_lock->lock_op_ret < 0) { gf_log (this->name, GF_LOG_ERROR, "Blocking data inodelks " - "failed for %s. by %"PRIu64, - local->loc.path, frame->root->lk_owner); + "failed for %s. by %s", + local->loc.path, lkowner_utoa (&frame->root->lk_owner)); + sh->data_lock_failure_handler (frame, this); } else { gf_log (this->name, GF_LOG_DEBUG, "Blocking data inodelks " - "done for %s by %"PRIu64". Proceding to self-heal", - local->loc.path, frame->root->lk_owner); + "done for %s by %s. 
Proceding to self-heal", + local->loc.path, lkowner_utoa (&frame->root->lk_owner)); + sh->data_lock_success_handler (frame, this); } @@ -1137,15 +1456,21 @@ afr_sh_data_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) if (int_lock->lock_op_ret < 0) { gf_log (this->name, GF_LOG_DEBUG, "Non Blocking data inodelks " - "failed for %s. by %"PRIu64, - local->loc.path, frame->root->lk_owner); - int_lock->lock_cbk = afr_sh_data_post_blocking_inodelk_cbk; - afr_blocking_lock (frame, this); + "failed for %s. by %s", + local->loc.path, lkowner_utoa (&frame->root->lk_owner)); + + if (!sh->data_lock_block) { + sh->data_lock_failure_handler(frame, this); + } else { + int_lock->lock_cbk = + afr_sh_data_post_blocking_inodelk_cbk; + afr_blocking_lock (frame, this); + } } else { gf_log (this->name, GF_LOG_DEBUG, "Non Blocking data inodelks " - "done for %s by %"PRIu64". Proceeding to self-heal", - local->loc.path, frame->root->lk_owner); + "done for %s by %s. Proceeding to self-heal", + local->loc.path, lkowner_utoa (&frame->root->lk_owner)); sh->data_lock_success_handler (frame, this); } @@ -1153,9 +1478,11 @@ afr_sh_data_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) } int -afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this, off_t start, off_t len) +afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this, char *dom, + off_t start, off_t len) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; local = frame->local; @@ -1166,11 +1493,14 @@ afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this, off_t start, off_t le afr_set_lock_number (frame, this); - int_lock->lk_flock.l_start = start; - int_lock->lk_flock.l_len = len; - int_lock->lk_flock.l_type = F_WRLCK; int_lock->lock_cbk = afr_sh_data_post_nonblocking_inodelk_cbk; + int_lock->domain = dom; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + inodelk->flock.l_start = start; + inodelk->flock.l_len = len; + 
inodelk->flock.l_type = F_WRLCK; + afr_nonblocking_inodelk (frame, this); return 0; @@ -1189,7 +1519,8 @@ afr_post_sh_big_lock_success (call_frame_t *frame, xlator_t *this) sh_loop_finish (sh->old_loop_frame, this); sh->old_loop_frame = NULL; sh->data_lock_held = _gf_true; - afr_sh_data_fxattrop (frame, this, afr_post_sh_data_fxattrop_cbk); + sh->sync_done = _gf_true; + afr_sh_data_fxattrop (frame, this); return 0; } @@ -1212,8 +1543,8 @@ afr_post_sh_big_lock_failure (call_frame_t *frame, xlator_t *this) int afr_sh_data_lock (call_frame_t *frame, xlator_t *this, - off_t start, off_t len, - afr_lock_cbk_t success_handler, + off_t start, off_t len, gf_boolean_t block, + char *dom, afr_lock_cbk_t success_handler, afr_lock_cbk_t failure_handler) { afr_local_t * local = NULL; @@ -1224,12 +1555,13 @@ afr_sh_data_lock (call_frame_t *frame, xlator_t *this, sh->data_lock_success_handler = success_handler; sh->data_lock_failure_handler = failure_handler; - return afr_sh_data_lock_rec (frame, this, start, len); + sh->data_lock_block = block; + return afr_sh_data_lock_rec (frame, this, dom, start, len); } int afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd) + int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) { afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; @@ -1255,20 +1587,20 @@ afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->loc.path, priv->children[child_index]->name, strerror (op_errno)); - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + } else { + gf_log (this->name, GF_LOG_TRACE, + "open of %s succeeded on child %s", + local->loc.path, + priv->children[child_index]->name); } - - gf_log (this->name, GF_LOG_TRACE, - "open of %s succeeded on child %s", - local->loc.path, - priv->children[child_index]->name); } UNLOCK (&frame->lock); call_count = afr_frame_return (frame); if (call_count == 0) { - if (sh->op_failed) { + if 
(is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { afr_sh_data_fail (frame, this); return 0; } @@ -1277,9 +1609,8 @@ afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, "fd for %s opened, commencing sync", local->loc.path); - afr_sh_data_lock (frame, this, 0, 0, - afr_sh_data_big_lock_success, - afr_sh_data_fail); + afr_sh_data_lock (frame, this, 0, 0, _gf_true, priv->sh_domain, + afr_sh_dom_lock_success, afr_sh_data_fail); } return 0; @@ -1316,7 +1647,7 @@ afr_sh_data_open (call_frame_t *frame, xlator_t *this) priv->children[i], priv->children[i]->fops->open, &local->loc, - O_RDWR|O_LARGEFILE, fd, 0); + O_RDWR|O_LARGEFILE, fd, NULL); if (!--call_count) break; @@ -1325,20 +1656,93 @@ afr_sh_data_open (call_frame_t *frame, xlator_t *this) return 0; } +void +afr_sh_non_reg_fix (call_frame_t *frame, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_private_t *priv = NULL; + afr_self_heal_t *sh = NULL; + afr_local_t *local = NULL; + int i = 0; + + if (op_ret < 0) { + afr_sh_data_fail (frame, this); + return; + } + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + for (i = 0; i < priv->child_count ; i++) { + if (1 == local->child_up[i]) + sh->success[i] = 1; + } + + afr_sh_erase_pending (frame, this, AFR_DATA_TRANSACTION, + afr_sh_data_erase_pending_cbk, + afr_sh_data_finish); +} int -afr_self_heal_data (call_frame_t *frame, xlator_t *this) +afr_sh_non_reg_lock_success (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; - afr_private_t *priv = this->private; local = frame->local; sh = &local->self_heal; + sh->data_lock_held = _gf_true; + afr_sh_common_lookup (frame, this, &local->loc, + afr_sh_non_reg_fix, NULL, + AFR_LOOKUP_FAIL_CONFLICTS | + AFR_LOOKUP_FAIL_MISSING_GFIDS, + NULL); + return 0; +} +gf_boolean_t +afr_can_start_data_self_heal (afr_self_heal_t *sh, afr_private_t *priv) +{ + if (sh->force_confirm_spb) + return _gf_true; if (sh->do_data_self_heal && - 
afr_data_self_heal_enabled (priv->data_self_heal)) { - afr_sh_data_open (frame, this); + afr_data_self_heal_enabled (priv->data_self_heal)) + return _gf_true; + return _gf_false; +} + +int +afr_self_heal_data (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = this->private; + int ret = -1; + + local = frame->local; + sh = &local->self_heal; + + sh->sh_type_in_action = AFR_SELF_HEAL_DATA; + + if (afr_can_start_data_self_heal (sh, priv)) { + afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED); + ret = afr_inodelk_init (&local->internal_lock.inodelk[1], + priv->sh_domain, priv->child_count); + if (ret < 0) { + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + afr_sh_data_done (frame, this); + return 0; + } + + if (IA_ISREG (sh->type)) { + afr_sh_data_open (frame, this); + } else { + afr_sh_data_lock (frame, this, 0, 0, _gf_true, + this->name, + afr_sh_non_reg_lock_success, + afr_sh_data_fail); + } } else { gf_log (this->name, GF_LOG_TRACE, "not doing data self heal on %s", diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c index 356b15e63..53491a1d7 100644 --- a/xlators/cluster/afr/src/afr-self-heal-entry.c +++ b/xlators/cluster/afr/src/afr-self-heal-entry.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. 
- - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #include <libgen.h> @@ -49,6 +40,18 @@ #include "afr-self-heal.h" #include "afr-self-heal-common.h" +#define AFR_INIT_SH_FRAME_VALS(_frame, _local, _sh, _sh_frame, _sh_local, _sh_sh)\ + do {\ + _local = _frame->local;\ + _sh = &_local->self_heal;\ + _sh_frame = _sh->sh_frame;\ + _sh_local = _sh_frame->local;\ + _sh_sh = &_sh_local->self_heal;\ + } while (0); + +int +afr_sh_entry_impunge_create_file (call_frame_t *impunge_frame, xlator_t *this, + int child_index); int afr_sh_entry_done (call_frame_t *frame, xlator_t *this) { @@ -58,10 +61,6 @@ afr_sh_entry_done (call_frame_t *frame, xlator_t *this) local = frame->local; sh = &local->self_heal; - if (sh->healing_fd) - fd_unref (sh->healing_fd); - sh->healing_fd = NULL; - sh->completion_cbk (frame, this); return 0; @@ -103,7 +102,7 @@ afr_sh_entry_finish (call_frame_t *frame, xlator_t *this) int afr_sh_entry_erase_pending_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno, dict_t *xattr) + int32_t op_errno, dict_t *xattr, dict_t *xdata) { long i = 0; int call_count = 0; @@ -158,66 +157,20 @@ afr_sh_entry_erase_pending (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int i = 0; - dict_t **erase_xattr = NULL; - int need_unwind = 0; local = frame->local; sh = &local->self_heal; - priv = this->private; - - afr_sh_pending_to_delta (priv, sh->xattr, sh->delta_matrix, sh->success, - priv->child_count, AFR_ENTRY_TRANSACTION); - - erase_xattr = GF_CALLOC (sizeof (*erase_xattr), 
priv->child_count, - gf_afr_mt_dict_t); - - for (i = 0; i < priv->child_count; i++) { - if (sh->xattr[i]) { - call_count++; - - erase_xattr[i] = get_new_dict(); - dict_ref (erase_xattr[i]); - } - } - - if (call_count == 0) - need_unwind = 1; - - afr_sh_delta_to_xattr (priv, sh->delta_matrix, erase_xattr, - priv->child_count, AFR_ENTRY_TRANSACTION); - - local->call_count = call_count; - for (i = 0; i < priv->child_count; i++) { - if (!erase_xattr[i]) - continue; - - gf_log (this->name, GF_LOG_TRACE, - "erasing pending flags from %s on %s", - local->loc.path, priv->children[i]->name); - STACK_WIND_COOKIE (frame, afr_sh_entry_erase_pending_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->loc, - GF_XATTROP_ADD_ARRAY, erase_xattr[i]); - if (!--call_count) - break; - } - - for (i = 0; i < priv->child_count; i++) { - if (erase_xattr[i]) { - dict_unref (erase_xattr[i]); - } + if (sh->entries_skipped) { + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + goto out; } - GF_FREE (erase_xattr); - - if (need_unwind) - afr_sh_entry_finish (frame, this); - + afr_sh_erase_pending (frame, this, AFR_ENTRY_TRANSACTION, + afr_sh_entry_erase_pending_cbk, + afr_sh_entry_finish); + return 0; +out: + afr_sh_entry_finish (frame, this); return 0; } @@ -298,57 +251,11 @@ next_active_sink (call_frame_t *frame, xlator_t *this, return next_active_sink; } - -int -build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name) -{ - int ret = -1; - - if (!child) { - goto out; - } - - if (strcmp (parent->path, "/") == 0) - ret = gf_asprintf ((char **)&child->path, "/%s", name); - else - ret = gf_asprintf ((char **)&child->path, "%s/%s", parent->path, - name); - - if (-1 == ret) { - gf_log (this->name, GF_LOG_ERROR, - "asprintf failed while setting child path"); - } - - if (!child->path) { - goto out; - } - - child->name = strrchr (child->path, '/'); - if (child->name) - child->name++; - - child->parent = inode_ref (parent->inode); - 
child->inode = inode_new (parent->inode->table); - - if (!child->inode) { - goto out; - } - - ret = 0; -out: - if (ret == -1) - loc_wipe (child); - - return ret; -} - - int afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this); int -afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this, - int active_src); +afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this); int afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this); @@ -376,7 +283,8 @@ int afr_sh_entry_expunge_parent_setattr_cbk (call_frame_t *expunge_frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop) + struct iatt *preop, struct iatt *postop, + dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *expunge_local = NULL; @@ -412,7 +320,7 @@ afr_sh_entry_expunge_remove_cbk (call_frame_t *expunge_frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *expunge_local = NULL; @@ -440,7 +348,6 @@ afr_sh_entry_expunge_remove_cbk (call_frame_t *expunge_frame, void *cookie, } valid = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; - afr_build_parent_loc (&expunge_sh->parent_loc, &expunge_local->loc); STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_parent_setattr_cbk, (void *) (long) active_src, @@ -448,7 +355,7 @@ afr_sh_entry_expunge_remove_cbk (call_frame_t *expunge_frame, void *cookie, priv->children[active_src]->fops->setattr, &expunge_sh->parent_loc, &expunge_sh->parentbuf, - valid); + valid, NULL); return 0; } @@ -472,7 +379,7 @@ afr_sh_entry_expunge_unlink (call_frame_t *expunge_frame, xlator_t *this, (void *) (long) active_src, priv->children[active_src], priv->children[active_src]->fops->unlink, - &expunge_local->loc); + &expunge_local->loc, 0, NULL); return 0; } @@ -497,7 +404,7 @@ afr_sh_entry_expunge_rmdir (call_frame_t *expunge_frame, xlator_t *this, (void 
*) (long) active_src, priv->children[active_src], priv->children[active_src]->fops->rmdir, - &expunge_local->loc, 1); + &expunge_local->loc, 1, NULL); return 0; } @@ -505,7 +412,8 @@ afr_sh_entry_expunge_rmdir (call_frame_t *expunge_frame, xlator_t *this, int afr_sh_entry_expunge_remove (call_frame_t *expunge_frame, xlator_t *this, - int active_src, struct iatt *buf) + int active_src, struct iatt *buf, + struct iatt *parentbuf) { afr_private_t *priv = NULL; afr_local_t *expunge_local = NULL; @@ -514,6 +422,7 @@ afr_sh_entry_expunge_remove (call_frame_t *expunge_frame, xlator_t *this, int type = 0; afr_self_heal_t *sh = NULL; afr_local_t *local = NULL; + loc_t *loc = NULL; priv = this->private; expunge_local = expunge_frame->local; @@ -521,8 +430,11 @@ afr_sh_entry_expunge_remove (call_frame_t *expunge_frame, xlator_t *this, frame = expunge_sh->sh_frame; local = frame->local; sh = &local->self_heal; + loc = &expunge_local->loc; type = buf->ia_type; + if (loc->parent && uuid_is_null (loc->parent->gfid)) + uuid_copy (loc->pargfid, parentbuf->ia_gfid); switch (type) { case IA_IFSOCK: @@ -586,7 +498,8 @@ afr_sh_entry_expunge_lookup_cbk (call_frame_t *expunge_frame, void *cookie, goto out; } - afr_sh_entry_expunge_remove (expunge_frame, this, active_src, buf); + afr_sh_entry_expunge_remove (expunge_frame, this, active_src, buf, + postparent); return 0; out: @@ -615,7 +528,7 @@ afr_sh_entry_expunge_purge (call_frame_t *expunge_frame, xlator_t *this, (void *) (long) active_src, priv->children[active_src], priv->children[active_src]->fops->lookup, - &expunge_local->loc, 0); + &expunge_local->loc, NULL); return 0; } @@ -667,7 +580,8 @@ afr_sh_entry_expunge_entry_cbk (call_frame_t *expunge_frame, void *cookie, if (need_expunge) { gf_log (this->name, GF_LOG_INFO, - "missing entry %s on %s", + "Entry %s is missing on %s and deleting from " + "replica's other bricks", expunge_local->loc.path, priv->children[source]->name); @@ -699,6 +613,19 @@ out: return 0; } +static 
gf_boolean_t +can_skip_entry_self_heal (char *name, loc_t *parent_loc) +{ + if (strcmp (name, ".") == 0) { + return _gf_true; + } else if (strcmp (name, "..") == 0) { + return _gf_true; + } else if (loc_is_root (parent_loc) && + (strcmp (name, GF_REPLICATE_TRASH_DIR) == 0)) { + return _gf_true; + } + return _gf_false; +} int afr_sh_entry_expunge_entry (call_frame_t *frame, xlator_t *this, @@ -726,21 +653,13 @@ afr_sh_entry_expunge_entry (call_frame_t *frame, xlator_t *this, sh->expunge_done = afr_sh_entry_expunge_entry_done; name = entry->d_name; - - if ((strcmp (name, ".") == 0) - || (strcmp (name, "..") == 0) - || ((strcmp (local->loc.path, "/") == 0) - && (strcmp (name, GF_REPLICATE_TRASH_DIR) == 0))) { - - gf_log (this->name, GF_LOG_TRACE, - "skipping inspection of %s under %s", - name, local->loc.path); + if (can_skip_entry_self_heal (name, &local->loc)) { op_ret = 0; goto out; } gf_log (this->name, GF_LOG_TRACE, - "inspecting existance of %s under %s", + "inspecting existence of %s under %s", name, local->loc.path); expunge_frame = copy_frame (frame); @@ -749,15 +668,17 @@ afr_sh_entry_expunge_entry (call_frame_t *frame, xlator_t *this, goto out; } - ALLOC_OR_GOTO (expunge_local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (expunge_local, out); expunge_frame->local = expunge_local; expunge_sh = &expunge_local->self_heal; expunge_sh->sh_frame = frame; expunge_sh->active_source = active_src; expunge_sh->entrybuf = entry->d_stat; + loc_copy (&expunge_sh->parent_loc, &local->loc); - ret = build_child_loc (this, &expunge_local->loc, &local->loc, name); + ret = afr_build_child_loc (this, &expunge_local->loc, &local->loc, + name); if (ret != 0) { op_errno = EINVAL; goto out; @@ -772,7 +693,7 @@ afr_sh_entry_expunge_entry (call_frame_t *frame, xlator_t *this, (void *) (long) source, priv->children[source], priv->children[source]->fops->lookup, - &expunge_local->loc, 0); + &expunge_local->loc, NULL); ret = 0; out: @@ -787,7 +708,7 @@ int 
afr_sh_entry_expunge_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - gf_dirent_t *entries) + gf_dirent_t *entries, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; @@ -855,7 +776,7 @@ afr_sh_entry_expunge_subvol (call_frame_t *frame, xlator_t *this, STACK_WIND (frame, afr_sh_entry_expunge_readdir_cbk, priv->children[active_src], priv->children[active_src]->fops->readdirp, - sh->healing_fd, sh->block_size, sh->offset); + sh->healing_fd, sh->block_size, sh->offset, NULL); return 0; } @@ -885,7 +806,7 @@ afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this) active_src = next_active_sink (frame, this, sh->active_source); sh->active_source = active_src; - if (sh->op_failed) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { goto out; } @@ -910,46 +831,58 @@ out: int afr_sh_entry_impunge_entry_done (call_frame_t *frame, xlator_t *this, - int active_src, int32_t op_ret, - int32_t op_errno) + int32_t op_ret, int32_t op_errno) { int call_count = 0; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + local = frame->local; + sh = &local->self_heal; + if (op_ret < 0) + sh->entries_skipped = _gf_true; call_count = afr_frame_return (frame); - if (call_count == 0) - afr_sh_entry_impunge_subvol (frame, this, active_src); + afr_sh_entry_impunge_subvol (frame, this); return 0; } +void +afr_sh_entry_call_impunge_done (call_frame_t *impunge_frame, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t *impunge_local = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_self_heal_t *impunge_sh = NULL; + call_frame_t *frame = NULL; + + AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh, + frame, local, sh); + + AFR_STACK_DESTROY (impunge_frame); + sh->impunge_done (frame, this, op_ret, op_errno); +} int afr_sh_entry_impunge_setattr_cbk (call_frame_t *impunge_frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iatt 
*preop, struct iatt *postop) + struct iatt *preop, struct iatt *postop, + dict_t *xdata) { int call_count = 0; afr_private_t *priv = NULL; afr_local_t *impunge_local = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_self_heal_t *impunge_sh = NULL; - call_frame_t *frame = NULL; int child_index = 0; - int32_t impunge_ret_child = 0; priv = this->private; impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - frame = impunge_sh->sh_frame; - local = frame->local; - sh = &local->self_heal; child_index = (long) cookie; if (op_ret == 0) { - gf_log (this->name, GF_LOG_TRACE, + gf_log (this->name, GF_LOG_DEBUG, "setattr done for %s on %s", impunge_local->loc.path, priv->children[child_index]->name); @@ -961,34 +894,114 @@ afr_sh_entry_impunge_setattr_cbk (call_frame_t *impunge_frame, void *cookie, strerror (op_errno)); } - LOCK (&impunge_frame->lock); - { - call_count = --impunge_local->call_count; + call_count = afr_frame_return (impunge_frame); + if (call_count == 0) { + afr_sh_entry_call_impunge_done (impunge_frame, this, + 0, op_errno); } - UNLOCK (&impunge_frame->lock); - if (call_count == 0) { - impunge_ret_child = impunge_sh->impunge_ret_child; - AFR_STACK_DESTROY (impunge_frame); - sh->impunge_done (frame, this, impunge_ret_child, op_ret, - op_errno); + return 0; +} + +int +afr_sh_entry_impunge_parent_setattr_cbk (call_frame_t *setattr_frame, + void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preop, struct iatt *postop, + dict_t *xdata) +{ + int call_count = 0; + afr_local_t *setattr_local = NULL; + + setattr_local = setattr_frame->local; + if (op_ret != 0) { + gf_log (this->name, GF_LOG_INFO, + "setattr on parent directory (%s) failed: %s", + setattr_local->loc.path, strerror (op_errno)); } + call_count = afr_frame_return (setattr_frame); + if (call_count == 0) + AFR_STACK_DESTROY (setattr_frame); return 0; } +int +afr_sh_entry_impunge_setattr (call_frame_t *impunge_frame, xlator_t 
*this) +{ + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_local_t *setattr_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + call_frame_t *setattr_frame = NULL; + int32_t valid = 0; + int32_t op_errno = 0; + int child_index = 0; + int call_count = 0; + int i = 0; + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + + gf_log (this->name, GF_LOG_DEBUG, + "setting ownership of %s on %s to %d/%d", + impunge_local->loc.path, + priv->children[child_index]->name, + impunge_sh->entrybuf.ia_uid, + impunge_sh->entrybuf.ia_gid); + + setattr_frame = copy_frame (impunge_frame); + if (!setattr_frame) { + op_errno = ENOMEM; + goto out; + } + AFR_LOCAL_ALLOC_OR_GOTO (setattr_frame->local, out); + setattr_local = setattr_frame->local; + call_count = afr_errno_count (NULL, impunge_sh->child_errno, + priv->child_count, 0); + loc_copy (&setattr_local->loc, &impunge_sh->parent_loc); + impunge_local->call_count = call_count; + setattr_local->call_count = call_count; + for (i = 0; i < priv->child_count; i++) { + if (impunge_sh->child_errno[i]) + continue; + valid = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; + STACK_WIND_COOKIE (setattr_frame, + afr_sh_entry_impunge_parent_setattr_cbk, + (void *) (long) i, priv->children[i], + priv->children[i]->fops->setattr, + &setattr_local->loc, + &impunge_sh->parentbuf, valid, NULL); + + valid = GF_SET_ATTR_UID | GF_SET_ATTR_GID | + GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; + STACK_WIND_COOKIE (impunge_frame, + afr_sh_entry_impunge_setattr_cbk, + (void *) (long) i, priv->children[i], + priv->children[i]->fops->setattr, + &impunge_local->loc, + &impunge_sh->entrybuf, valid, NULL); + call_count--; + } + GF_ASSERT (!call_count); + return 0; +out: + if (setattr_frame) + AFR_STACK_DESTROY (setattr_frame); + afr_sh_entry_call_impunge_done (impunge_frame, this, 0, op_errno); + return 0; +} int afr_sh_entry_impunge_xattrop_cbk (call_frame_t *impunge_frame, void *cookie, xlator_t 
*this, int32_t op_ret, int32_t op_errno, - dict_t *xattr) + dict_t *xattr, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *impunge_local = NULL; int child_index = 0; - struct iatt stbuf = {0}; - int32_t valid = 0; priv = this->private; impunge_local = impunge_frame->local; @@ -1001,188 +1014,227 @@ afr_sh_entry_impunge_xattrop_cbk (call_frame_t *impunge_frame, void *cookie, impunge_local->loc.path, priv->children[child_index]->name, strerror (op_errno)); + goto out; } - gf_log (this->name, GF_LOG_TRACE, - "setting ownership of %s on %s to %d/%d", - impunge_local->loc.path, - priv->children[child_index]->name, - impunge_local->cont.lookup.buf.ia_uid, - impunge_local->cont.lookup.buf.ia_gid); - - stbuf.ia_atime = impunge_local->cont.lookup.buf.ia_atime; - stbuf.ia_atime_nsec = impunge_local->cont.lookup.buf.ia_atime_nsec; - stbuf.ia_mtime = impunge_local->cont.lookup.buf.ia_mtime; - stbuf.ia_mtime_nsec = impunge_local->cont.lookup.buf.ia_mtime_nsec; - - stbuf.ia_uid = impunge_local->cont.lookup.buf.ia_uid; - stbuf.ia_gid = impunge_local->cont.lookup.buf.ia_gid; - - valid = GF_SET_ATTR_UID | GF_SET_ATTR_GID | - GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; - - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_setattr_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->setattr, - &impunge_local->loc, - &stbuf, valid); + afr_sh_entry_impunge_setattr (impunge_frame, this); + return 0; +out: + afr_sh_entry_call_impunge_done (impunge_frame, this, + -1, op_errno); return 0; } - int -afr_sh_entry_impunge_parent_setattr_cbk (call_frame_t *setattr_frame, - void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop) +afr_sh_entry_impunge_perform_xattrop (call_frame_t *impunge_frame, + xlator_t *this) { - loc_t *parent_loc = cookie; + int active_src = 0; + dict_t *xattr = NULL; + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t 
*impunge_sh = NULL; + int32_t op_errno = 0; - if (op_ret != 0) { - gf_log (this->name, GF_LOG_INFO, - "setattr on parent directory (%s) failed: %s", - parent_loc->path, strerror (op_errno)); + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + active_src = impunge_sh->active_source; + + afr_prepare_new_entry_pending_matrix (impunge_local->pending, + afr_is_errno_unset, + impunge_sh->child_errno, + &impunge_sh->entrybuf, + priv->child_count); + xattr = dict_new (); + if (!xattr) { + op_errno = ENOMEM; + goto out; } - loc_wipe (parent_loc); + afr_set_pending_dict (priv, xattr, impunge_local->pending, active_src, + LOCAL_LAST); - GF_FREE (parent_loc); + STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_xattrop_cbk, + (void *) (long) active_src, + priv->children[active_src], + priv->children[active_src]->fops->xattrop, + &impunge_local->loc, GF_XATTROP_ADD_ARRAY, xattr, NULL); - AFR_STACK_DESTROY (setattr_frame); + if (xattr) + dict_unref (xattr); + return 0; +out: + afr_sh_entry_call_impunge_done (impunge_frame, this, + -1, op_errno); return 0; } - int afr_sh_entry_impunge_newfile_cbk (call_frame_t *impunge_frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *stbuf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { int call_count = 0; afr_private_t *priv = NULL; afr_local_t *impunge_local = NULL; afr_self_heal_t *impunge_sh = NULL; - call_frame_t *frame = NULL; - int active_src = 0; int child_index = 0; - int32_t *pending_array = NULL; - dict_t *xattr = NULL; - int ret = 0; - int idx = 0; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - call_frame_t *setattr_frame = NULL; - int32_t valid = 0; - loc_t *parent_loc = NULL; - struct iatt parentbuf = {0,}; - int32_t impunge_ret_child = 0; priv = this->private; impunge_local = impunge_frame->local; impunge_sh = &impunge_local->self_heal; - frame = 
impunge_sh->sh_frame; - local = frame->local; - sh = &local->self_heal; - active_src = impunge_sh->active_source; child_index = (long) cookie; if (op_ret == -1) { - ret = -1; + impunge_sh->child_errno[child_index] = op_errno; gf_log (this->name, GF_LOG_ERROR, "creation of %s on %s failed (%s)", impunge_local->loc.path, priv->children[child_index]->name, strerror (op_errno)); - goto out; + } else { + impunge_sh->child_errno[child_index] = 0; } - inode->ia_type = stbuf->ia_type; - - xattr = dict_new (); - if (!xattr) { - ret = -1; - goto out; + call_count = afr_frame_return (impunge_frame); + if (call_count == 0) { + if (!afr_errno_count (NULL, impunge_sh->child_errno, + priv->child_count, 0)) { + // new_file creation failed every where + afr_sh_entry_call_impunge_done (impunge_frame, this, + -1, op_errno); + goto out; + } + afr_sh_entry_impunge_perform_xattrop (impunge_frame, this); } +out: + return 0; +} - pending_array = (int32_t*) GF_CALLOC (3, sizeof (*pending_array), - gf_afr_mt_int32_t); +int +afr_sh_entry_impunge_hardlink_cbk (call_frame_t *impunge_frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + int call_count = 0; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; - if (!pending_array) { - ret = -1; - goto out; - } - idx = afr_index_for_transaction_type (AFR_METADATA_TRANSACTION); - pending_array[idx] = hton32 (1); - if (IA_ISDIR (stbuf->ia_type)) - idx = afr_index_for_transaction_type (AFR_ENTRY_TRANSACTION); - else - idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION); - pending_array[idx] = hton32 (1); - - ret = dict_set_dynptr (xattr, priv->pending_key[child_index], - pending_array, - 3 * sizeof (*pending_array)); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Unable to set dict value."); - } else { - pending_array = NULL; + impunge_local = impunge_frame->local; + impunge_sh = 
&impunge_local->self_heal; + + if (IA_IFLNK == impunge_sh->entrybuf.ia_type) { + //For symlinks impunge is attempted un-conditionally + //So the file can already exist. + if ((op_ret < 0) && (op_errno == EEXIST)) + op_ret = 0; } - valid = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; - parentbuf = impunge_sh->parentbuf; - setattr_frame = copy_frame (impunge_frame); + call_count = afr_frame_return (impunge_frame); + if (call_count == 0) + afr_sh_entry_call_impunge_done (impunge_frame, this, + op_ret, op_errno); - parent_loc = GF_CALLOC (1, sizeof (*parent_loc), - gf_afr_mt_loc_t); - if (!parent_loc) { - ret = -1; - goto out; - } - afr_build_parent_loc (parent_loc, &impunge_local->loc); + return 0; +} - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_xattrop_cbk, - (void *) (long) child_index, - priv->children[active_src], - priv->children[active_src]->fops->xattrop, - &impunge_local->loc, GF_XATTROP_ADD_ARRAY, xattr); +int +afr_sh_entry_impunge_hardlink (call_frame_t *impunge_frame, xlator_t *this, + int child_index) +{ + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + loc_t *loc = NULL; + struct iatt *buf = NULL; + loc_t oldloc = {0}; - STACK_WIND_COOKIE (setattr_frame, afr_sh_entry_impunge_parent_setattr_cbk, - (void *) (long) parent_loc, - priv->children[child_index], - priv->children[child_index]->fops->setattr, - parent_loc, &parentbuf, valid); + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + loc = &impunge_local->loc; + buf = &impunge_sh->entrybuf; -out: - if (xattr) - dict_unref (xattr); + oldloc.inode = inode_ref (loc->inode); + uuid_copy (oldloc.gfid, buf->ia_gfid); + gf_log (this->name, GF_LOG_DEBUG, "linking missing file %s on %s", + loc->path, priv->children[child_index]->name); - if (ret) { - if (pending_array) - GF_FREE (pending_array); + STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_hardlink_cbk, + (void *) (long) child_index, + 
priv->children[child_index], + priv->children[child_index]->fops->link, + &oldloc, loc, NULL); + loc_wipe (&oldloc); - LOCK (&impunge_frame->lock); - { - call_count = --impunge_local->call_count; - } - UNLOCK (&impunge_frame->lock); + return 0; +} - if (call_count == 0) { - impunge_ret_child = impunge_sh->impunge_ret_child; - AFR_STACK_DESTROY (impunge_frame); - sh->impunge_done (frame, this, impunge_ret_child, -1, - op_errno); - } +int +afr_sh_nameless_lookup_cbk (call_frame_t *impunge_frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xattr, + struct iatt *postparent) +{ + if (op_ret < 0) { + afr_sh_entry_impunge_create_file (impunge_frame, this, + (long)cookie); + } else { + afr_sh_entry_impunge_hardlink (impunge_frame, this, + (long)cookie); } - return 0; } +int +afr_sh_entry_impunge_check_hardlink (call_frame_t *impunge_frame, + xlator_t *this, + int child_index, struct iatt *stbuf) +{ + afr_private_t *priv = NULL; + call_frame_t *frame = NULL; + afr_local_t *impunge_local = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *impunge_sh = NULL; + afr_self_heal_t *sh = NULL; + loc_t *loc = NULL; + dict_t *xattr_req = NULL; + loc_t oldloc = {0}; + int ret = -1; + + priv = this->private; + AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh, + frame, local, sh); + loc = &impunge_local->loc; + + xattr_req = dict_new (); + if (!xattr_req) + goto out; + oldloc.inode = inode_ref (loc->inode); + uuid_copy (oldloc.gfid, stbuf->ia_gfid); + + STACK_WIND_COOKIE (impunge_frame, afr_sh_nameless_lookup_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->lookup, + &oldloc, xattr_req); + ret = 0; +out: + if (xattr_req) + dict_unref (xattr_req); + loc_wipe (&oldloc); + if (ret) + sh->impunge_done (frame, this, -1, ENOMEM); + return 0; +} int afr_sh_entry_impunge_mknod (call_frame_t *impunge_frame, xlator_t *this, @@ -1205,11 +1257,41 @@ 
afr_sh_entry_impunge_mknod (call_frame_t *impunge_frame, xlator_t *this, if (!dict) gf_log (this->name, GF_LOG_ERROR, "Out of memory"); + GF_ASSERT (!uuid_is_null (stbuf->ia_gfid)); ret = afr_set_dict_gfid (dict, stbuf->ia_gfid); if (ret) gf_log (this->name, GF_LOG_INFO, "%s: gfid set failed", impunge_local->loc.path); + /* + * Reason for adding GLUSTERFS_INTERNAL_FOP_KEY : + * + * Problem: + * While a brick is down in a replica pair, lets say the user creates + * one file(file-A) and a hard link to that file(h-file-A). After the + * brick comes back up, entry self-heal is attempted on parent dir of + * these two files. As part of readdir in self-heal it reads both the + * entries file-A and h-file-A for both of them it does name less lookup + * to check if there are any hardlinks already present in the + * destination brick. It finds that there are no hard links already + * present for files file-A, h-file-A. Self-heal does mknods for both + * file-A and h-file-A. This leads to file-A and h-file-A not being + * hardlinks anymore. + * + * Fix: (More like shrinking of race-window, the race itself is still + * present in posix-mknod). + * If mknod comes with the presence of GLUSTERFS_INTERNAL_FOP_KEY then + * posix_mknod checks if there are already any gfid-links and does + * link() instead of mknod. There still can be a race where two + * posix_mknods same gfid see that + * gfid-link file is not present and proceeds with mknods and result in + * two different files with same gfid. 
+ */ + ret = dict_set_str (dict, GLUSTERFS_INTERNAL_FOP_KEY, "yes"); + if (ret) + gf_log (this->name, GF_LOG_INFO, "%s: %s set failed", + impunge_local->loc.path, GLUSTERFS_INTERNAL_FOP_KEY); + STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk, (void *) (long) child_index, priv->children[child_index], @@ -1217,7 +1299,7 @@ afr_sh_entry_impunge_mknod (call_frame_t *impunge_frame, xlator_t *this, &impunge_local->loc, st_mode_from_ia (stbuf->ia_prot, stbuf->ia_type), makedev (ia_major (stbuf->ia_rdev), - ia_minor (stbuf->ia_rdev)), dict); + ia_minor (stbuf->ia_rdev)), 0, dict); if (dict) dict_unref (dict); @@ -1247,6 +1329,7 @@ afr_sh_entry_impunge_mkdir (call_frame_t *impunge_frame, xlator_t *this, return 0; } + GF_ASSERT (!uuid_is_null (stbuf->ia_gfid)); ret = afr_set_dict_gfid (dict, stbuf->ia_gfid); if (ret) gf_log (this->name, GF_LOG_INFO, "%s: gfid set failed", @@ -1263,7 +1346,7 @@ afr_sh_entry_impunge_mkdir (call_frame_t *impunge_frame, xlator_t *this, priv->children[child_index]->fops->mkdir, &impunge_local->loc, st_mode_from_ia (stbuf->ia_prot, stbuf->ia_type), - dict); + 0, dict); if (dict) dict_unref (dict); @@ -1281,32 +1364,20 @@ afr_sh_entry_impunge_symlink (call_frame_t *impunge_frame, xlator_t *this, dict_t *dict = NULL; struct iatt *buf = NULL; int ret = 0; - call_frame_t *frame = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_self_heal_t *impunge_sh = NULL; - int32_t impunge_ret_child = 0; priv = this->private; impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - frame = impunge_sh->sh_frame; - local = frame->local; - sh = &local->self_heal; - buf = &impunge_local->cont.symlink.buf; + buf = &impunge_local->cont.dir_fop.buf; dict = dict_new (); if (!dict) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - impunge_ret_child = impunge_sh->impunge_ret_child; - AFR_STACK_DESTROY (impunge_frame); - sh->impunge_done (impunge_frame, this, impunge_ret_child, -1, - ENOMEM); + 
afr_sh_entry_call_impunge_done (impunge_frame, this, + -1, ENOMEM); goto out; } + GF_ASSERT (!uuid_is_null (buf->ia_gfid)); ret = afr_set_dict_gfid (dict, buf->ia_gfid); if (ret) gf_log (this->name, GF_LOG_INFO, @@ -1322,7 +1393,7 @@ afr_sh_entry_impunge_symlink (call_frame_t *impunge_frame, xlator_t *this, (void *) (long) child_index, priv->children[child_index], priv->children[child_index]->fops->symlink, - linkname, &impunge_local->loc, dict); + linkname, &impunge_local->loc, 0, dict); if (dict) dict_unref (dict); @@ -1336,24 +1407,17 @@ afr_sh_entry_impunge_symlink_unlink_cbk (call_frame_t *impunge_frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *impunge_local = NULL; afr_self_heal_t *impunge_sh = NULL; int child_index = -1; - call_frame_t *frame = NULL; int call_count = -1; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int32_t impunge_ret_child = 0; priv = this->private; impunge_local = impunge_frame->local; impunge_sh = &impunge_local->self_heal; - frame = impunge_sh->sh_frame; - local = frame->local; - sh = &local->self_heal; child_index = (long) cookie; @@ -1377,12 +1441,9 @@ out: } UNLOCK (&impunge_frame->lock); - if (call_count == 0) { - impunge_ret_child = impunge_sh->impunge_ret_child; - AFR_STACK_DESTROY (impunge_frame); - sh->impunge_done (frame, this, impunge_ret_child, op_ret, - op_errno); - } + if (call_count == 0) + afr_sh_entry_call_impunge_done (impunge_frame, this, + op_ret, op_errno); return 0; } @@ -1407,7 +1468,7 @@ afr_sh_entry_impunge_symlink_unlink (call_frame_t *impunge_frame, xlator_t *this (void *) (long) child_index, priv->children[child_index], priv->children[child_index]->fops->unlink, - &impunge_local->loc); + &impunge_local->loc, 0, NULL); return 0; } @@ -1417,25 +1478,18 @@ int afr_sh_entry_impunge_readlink_sink_cbk (call_frame_t *impunge_frame, void *cookie, 
xlator_t *this, int32_t op_ret, int32_t op_errno, - const char *linkname, struct iatt *sbuf) + const char *linkname, struct iatt *sbuf, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *impunge_local = NULL; afr_self_heal_t *impunge_sh = NULL; int child_index = -1; - call_frame_t *frame = NULL; int call_count = -1; int active_src = -1; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int32_t impunge_ret_child = 0; priv = this->private; impunge_local = impunge_frame->local; impunge_sh = &impunge_local->self_heal; - frame = impunge_sh->sh_frame; - local = frame->local; - sh = &local->self_heal; active_src = impunge_sh->active_source; child_index = (long) cookie; @@ -1480,12 +1534,9 @@ out: } UNLOCK (&impunge_frame->lock); - if (call_count == 0) { - impunge_ret_child = impunge_sh->impunge_ret_child; - AFR_STACK_DESTROY (impunge_frame); - sh->impunge_done (frame, this, impunge_ret_child, op_ret, - op_errno); - } + if (call_count == 0) + afr_sh_entry_call_impunge_done (impunge_frame, this, + op_ret, op_errno); return 0; } @@ -1509,7 +1560,7 @@ afr_sh_entry_impunge_readlink_sink (call_frame_t *impunge_frame, xlator_t *this, (void *) (long) child_index, priv->children[child_index], priv->children[child_index]->fops->readlink, - &impunge_local->loc, 4096); + &impunge_local->loc, 4096, NULL); return 0; } @@ -1519,25 +1570,18 @@ int afr_sh_entry_impunge_readlink_cbk (call_frame_t *impunge_frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - const char *linkname, struct iatt *sbuf) + const char *linkname, struct iatt *sbuf, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *impunge_local = NULL; afr_self_heal_t *impunge_sh = NULL; int child_index = -1; - call_frame_t *frame = NULL; int call_count = -1; int active_src = -1; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int32_t impunge_ret_child = 0; priv = this->private; impunge_local = impunge_frame->local; impunge_sh = &impunge_local->self_heal; - frame = 
impunge_sh->sh_frame; - local = frame->local; - sh = &local->self_heal; active_src = impunge_sh->active_source; child_index = (long) cookie; @@ -1563,12 +1607,9 @@ out: } UNLOCK (&impunge_frame->lock); - if (call_count == 0) { - impunge_ret_child = impunge_sh->impunge_ret_child; - AFR_STACK_DESTROY (impunge_frame); - sh->impunge_done (frame, this, impunge_ret_child, op_ret, - op_errno); - } + if (call_count == 0) + afr_sh_entry_call_impunge_done (impunge_frame, this, + op_ret, op_errno); return 0; } @@ -1587,36 +1628,38 @@ afr_sh_entry_impunge_readlink (call_frame_t *impunge_frame, xlator_t *this, impunge_local = impunge_frame->local; impunge_sh = &impunge_local->self_heal; active_src = impunge_sh->active_source; - impunge_local->cont.symlink.buf = *stbuf; + impunge_local->cont.dir_fop.buf = *stbuf; STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_readlink_cbk, (void *) (long) child_index, priv->children[active_src], priv->children[active_src]->fops->readlink, - &impunge_local->loc, 4096); + &impunge_local->loc, 4096, NULL); return 0; } int afr_sh_entry_impunge_create (call_frame_t *impunge_frame, xlator_t *this, - int child_index, struct iatt *buf, - struct iatt *postparent) + int child_index) { - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; + call_frame_t *frame = NULL; + afr_local_t *impunge_local = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *impunge_sh = NULL; + afr_self_heal_t *sh = NULL; afr_private_t *priv = NULL; ia_type_t type = IA_INVAL; - int ret = 0; int active_src = 0; + struct iatt *buf = NULL; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - impunge_sh->parentbuf = *postparent; + AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh, + frame, local, sh); active_src = impunge_sh->active_source; - impunge_local->cont.lookup.buf = *buf; - afr_update_loc_gfids (&impunge_local->loc, buf, postparent); + afr_update_loc_gfids (&impunge_local->loc, &impunge_sh->entrybuf, 
+ &impunge_sh->parentbuf); + buf = &impunge_sh->entrybuf; type = buf->ia_type; switch (type) { @@ -1625,12 +1668,9 @@ afr_sh_entry_impunge_create (call_frame_t *impunge_frame, xlator_t *this, case IA_IFBLK: case IA_IFCHR: case IA_IFIFO: - afr_sh_entry_impunge_mknod (impunge_frame, this, - child_index, buf); - break; case IA_IFLNK: - afr_sh_entry_impunge_readlink (impunge_frame, this, - child_index, buf); + afr_sh_entry_impunge_check_hardlink (impunge_frame, this, + child_index, buf); break; case IA_IFDIR: afr_sh_entry_impunge_mkdir (impunge_frame, this, @@ -1641,286 +1681,270 @@ afr_sh_entry_impunge_create (call_frame_t *impunge_frame, xlator_t *this, "%s has unknown file type on %s: 0%o", impunge_local->loc.path, priv->children[active_src]->name, type); - ret = -1; + sh->impunge_done (frame, this, -1, EINVAL); break; } - return ret; + return 0; } int -afr_sh_entry_impunge_recreate_lookup_cbk (call_frame_t *impunge_frame, - void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, - dict_t *xattr,struct iatt *postparent) +afr_sh_entry_impunge_create_file (call_frame_t *impunge_frame, xlator_t *this, + int child_index) { + call_frame_t *frame = NULL; + afr_local_t *impunge_local = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *impunge_sh = NULL; + afr_self_heal_t *sh = NULL; afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *impunge_sh = NULL; - afr_self_heal_t *sh = NULL; - int active_src = 0; - int child_index = 0; - call_frame_t *frame = NULL; - int call_count = 0; - int ret = 0; - int32_t impunge_ret_child = 0; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - frame = impunge_sh->sh_frame; - local = frame->local; - sh = &local->self_heal; - - child_index = (long) cookie; + ia_type_t type = IA_INVAL; + int active_src = 0; + struct iatt *buf = NULL; + AFR_INIT_SH_FRAME_VALS (impunge_frame, 
impunge_local, impunge_sh, + frame, local, sh); active_src = impunge_sh->active_source; + buf = &impunge_sh->entrybuf; + type = buf->ia_type; - if (op_ret != 0) { - gf_log (this->name, GF_LOG_DEBUG, - "looking up %s on %s (for %s) failed (%s)", + switch (type) { + case IA_IFSOCK: + case IA_IFREG: + case IA_IFBLK: + case IA_IFCHR: + case IA_IFIFO: + afr_sh_entry_impunge_mknod (impunge_frame, this, + child_index, buf); + break; + case IA_IFLNK: + afr_sh_entry_impunge_readlink (impunge_frame, this, + child_index, buf); + break; + default: + gf_log (this->name, GF_LOG_ERROR, + "%s has unknown file type on %s: 0%o", impunge_local->loc.path, - priv->children[active_src]->name, - priv->children[child_index]->name, - strerror (op_errno)); - goto out; + priv->children[active_src]->name, type); + sh->impunge_done (frame, this, -1, EINVAL); + break; } - ret = afr_sh_entry_impunge_create (impunge_frame, this, child_index, buf, - postparent); - if (ret) + return 0; +} + +gf_boolean_t +afr_sh_need_recreate (afr_self_heal_t *impunge_sh, unsigned int child, + unsigned int child_count) +{ + gf_boolean_t recreate = _gf_false; + + GF_ASSERT (impunge_sh->child_errno); + + if (child == impunge_sh->active_source) goto out; - return 0; + if (IA_IFLNK == impunge_sh->entrybuf.ia_type) { + recreate = _gf_true; + goto out; + } + if (impunge_sh->child_errno[child] == ENOENT) + recreate = _gf_true; out: - LOCK (&impunge_frame->lock); - { - call_count = --impunge_local->call_count; - } - UNLOCK (&impunge_frame->lock); + return recreate; +} - if (call_count == 0) { - impunge_ret_child = impunge_sh->impunge_ret_child; - AFR_STACK_DESTROY (impunge_frame); - sh->impunge_done (frame, this, impunge_ret_child, op_ret, - op_errno); +unsigned int +afr_sh_recreate_count (afr_self_heal_t *impunge_sh, int *sources, + unsigned int child_count) +{ + int count = 0; + int i = 0; + + for (i = 0; i < child_count; i++) { + if (afr_sh_need_recreate (impunge_sh, i, child_count)) + count++; } - return 0; + return 
count; } - int -afr_sh_entry_impunge_recreate (call_frame_t *impunge_frame, xlator_t *this, - int child_index) +afr_sh_entry_call_impunge_recreate (call_frame_t *impunge_frame, + xlator_t *this) { afr_private_t *priv = NULL; afr_local_t *impunge_local = NULL; afr_self_heal_t *impunge_sh = NULL; + call_frame_t *frame = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + unsigned int recreate_count = 0; + int i = 0; int active_src = 0; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - + priv = this->private; + AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh, + frame, local, sh); active_src = impunge_sh->active_source; - - STACK_WIND_COOKIE (impunge_frame, - afr_sh_entry_impunge_recreate_lookup_cbk, - (void *) (long) child_index, - priv->children[active_src], - priv->children[active_src]->fops->lookup, - &impunge_local->loc, 0); - + impunge_sh->entrybuf = impunge_sh->buf[active_src]; + impunge_sh->parentbuf = impunge_sh->parentbufs[active_src]; + recreate_count = afr_sh_recreate_count (impunge_sh, sh->sources, + priv->child_count); + if (!recreate_count) { + afr_sh_entry_call_impunge_done (impunge_frame, this, 0, 0); + goto out; + } + impunge_local->call_count = recreate_count; + for (i = 0; i < priv->child_count; i++) { + if (!impunge_local->child_up[i]) { + impunge_sh->child_errno[i] = ENOTCONN; + continue; + } + if (!afr_sh_need_recreate (impunge_sh, i, priv->child_count)) { + impunge_sh->child_errno[i] = EEXIST; + continue; + } + } + for (i = 0; i < priv->child_count; i++) { + if (!afr_sh_need_recreate (impunge_sh, i, priv->child_count)) + continue; + (void)afr_sh_entry_impunge_create (impunge_frame, this, i); + recreate_count--; + } + GF_ASSERT (!recreate_count); +out: return 0; } - -int -afr_sh_entry_impunge_entry_cbk (call_frame_t *impunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, dict_t *x, - struct 
iatt *postparent) +void +afr_sh_entry_common_lookup_done (call_frame_t *impunge_frame, xlator_t *this, + int32_t op_ret, int32_t op_errno) { afr_private_t *priv = NULL; afr_local_t *impunge_local = NULL; afr_self_heal_t *impunge_sh = NULL; - int call_count = 0; - int child_index = 0; call_frame_t *frame = NULL; afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; - int32_t impunge_ret_child = 0; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - frame = impunge_sh->sh_frame; - local = frame->local; - sh = &local->self_heal; - child_index = (long) cookie; - - if ((op_ret == -1 && op_errno == ENOENT) - || (IA_ISLNK (impunge_sh->impunging_entry_mode))) { - - /* - * A symlink's target might have changed, so - * always go down the recreate path for them. - */ - - /* decrease call_count in recreate-callback */ - - gf_log (this->name, GF_LOG_TRACE, - "missing entry %s on %s", - impunge_local->loc.path, - priv->children[child_index]->name); - - afr_sh_entry_impunge_recreate (impunge_frame, this, - child_index); - return 0; - } + unsigned int gfid_miss_count = 0; + unsigned int children_up_count = 0; + uuid_t gfid = {0}; + int active_src = 0; - if (op_ret == 0) { - gf_log (this->name, GF_LOG_TRACE, - "%s exists under %s", - impunge_local->loc.path, - priv->children[child_index]->name); + priv = this->private; + AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh, + frame, local, sh); + active_src = impunge_sh->active_source; - impunge_sh->parentbuf = *postparent; + if (op_ret < 0) + goto done; + if (impunge_sh->child_errno[active_src]) { + op_ret = -1; + op_errno = impunge_sh->child_errno[active_src]; + goto done; + } + + gfid_miss_count = afr_gfid_missing_count (this->name, + impunge_sh->success_children, + impunge_sh->buf, priv->child_count, + impunge_local->loc.path); + children_up_count = afr_up_children_count (impunge_local->child_up, + priv->child_count); + if ((gfid_miss_count == 
children_up_count) && + (children_up_count < priv->child_count)) { + op_ret = -1; + op_errno = ENODATA; + gf_log (this->name, GF_LOG_ERROR, "Not all children are up, " + "gfid should not be assigned in this state for %s", + impunge_local->loc.path); + goto done; + } + + if (gfid_miss_count) { + afr_update_gfid_from_iatts (gfid, impunge_sh->buf, + impunge_sh->success_children, + priv->child_count); + if (uuid_is_null (gfid)) { + sh->entries_skipped = _gf_true; + gf_log (this->name, GF_LOG_INFO, "%s: Skipping entry " + "self-heal because of gfid absence", + impunge_local->loc.path); + goto done; + } + afr_sh_common_lookup (impunge_frame, this, &impunge_local->loc, + afr_sh_entry_common_lookup_done, gfid, + AFR_LOOKUP_FAIL_CONFLICTS | + AFR_LOOKUP_FAIL_MISSING_GFIDS, + NULL); } else { - gf_log (this->name, GF_LOG_WARNING, - "looking up %s under %s failed (%s)", - impunge_local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); + afr_sh_entry_call_impunge_recreate (impunge_frame, this); } - - LOCK (&impunge_frame->lock); - { - call_count = --impunge_local->call_count; - } - UNLOCK (&impunge_frame->lock); - - if (call_count == 0) { - impunge_ret_child = impunge_sh->impunge_ret_child; - AFR_STACK_DESTROY (impunge_frame); - sh->impunge_done (frame, this, impunge_ret_child, op_ret, - op_errno); - } - - return 0; + return; +done: + afr_sh_entry_call_impunge_done (impunge_frame, this, + op_ret, op_errno); + return; } - int afr_sh_entry_impunge_entry (call_frame_t *frame, xlator_t *this, gf_dirent_t *entry) { - afr_private_t *priv = NULL; afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; + afr_self_heal_t *impunge_sh = NULL; int ret = -1; call_frame_t *impunge_frame = NULL; afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; int active_src = 0; - int i = 0; - int call_count = 0; int op_errno = 0; int op_ret = -1; - priv = this->private; local = frame->local; sh = &local->self_heal; active_src = sh->active_source; sh->impunge_done 
= afr_sh_entry_impunge_entry_done; - if ((strcmp (entry->d_name, ".") == 0) - || (strcmp (entry->d_name, "..") == 0) - || ((strcmp (local->loc.path, "/") == 0) - && (strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR) == 0))) { - - gf_log (this->name, GF_LOG_TRACE, - "skipping inspection of %s under %s", - entry->d_name, local->loc.path); + if (can_skip_entry_self_heal (entry->d_name, &local->loc)) { op_ret = 0; goto out; } gf_log (this->name, GF_LOG_TRACE, - "inspecting existance of %s under %s", + "inspecting existence of %s under %s", entry->d_name, local->loc.path); - impunge_frame = copy_frame (frame); - if (!impunge_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - op_errno = ENOMEM; + ret = afr_impunge_frame_create (frame, this, active_src, + &impunge_frame); + if (ret) { + op_errno = -ret; goto out; } - ALLOC_OR_GOTO (impunge_local, afr_local_t, out); - - impunge_frame->local = impunge_local; + impunge_local = impunge_frame->local; impunge_sh = &impunge_local->self_heal; - impunge_sh->sh_frame = frame; - impunge_sh->active_source = active_src; - impunge_sh->impunge_ret_child = active_src; - - impunge_sh->impunging_entry_mode = - st_mode_from_ia (entry->d_stat.ia_prot, entry->d_stat.ia_type); - - ret = build_child_loc (this, &impunge_local->loc, &local->loc, entry->d_name); + ret = afr_build_child_loc (this, &impunge_local->loc, &local->loc, + entry->d_name); + loc_copy (&impunge_sh->parent_loc, &local->loc); if (ret != 0) { op_errno = ENOMEM; goto out; } - for (i = 0; i < priv->child_count; i++) { - if (i == active_src) - continue; - if (local->child_up[i] == 0) - continue; - if (sh->sources[i] == 1) - continue; - call_count++; - } - - impunge_local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (i == active_src) - continue; - if (local->child_up[i] == 0) - continue; - if (sh->sources[i] == 1) - continue; - - gf_log (this->name, GF_LOG_TRACE, - "looking up %s on %s", impunge_local->loc.path, - 
priv->children[i]->name); - - STACK_WIND_COOKIE (impunge_frame, - afr_sh_entry_impunge_entry_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->lookup, - &impunge_local->loc, 0); - - if (!--call_count) - break; - } + afr_sh_common_lookup (impunge_frame, this, &impunge_local->loc, + afr_sh_entry_common_lookup_done, NULL, + AFR_LOOKUP_FAIL_CONFLICTS, NULL); - ret = 0; + op_ret = 0; out: - if (ret == -1) - sh->impunge_done (frame, this, active_src, op_ret, op_errno); + if (ret) { + if (impunge_frame) + AFR_STACK_DESTROY (impunge_frame); + sh->impunge_done (frame, this, op_ret, op_errno); + } return 0; } @@ -1930,7 +1954,7 @@ int afr_sh_entry_impunge_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - gf_dirent_t *entries) + gf_dirent_t *entries, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; @@ -1953,6 +1977,7 @@ afr_sh_entry_impunge_readdir_cbk (call_frame_t *frame, void *cookie, local->loc.path, priv->children[active_src]->name, strerror (op_errno)); + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); } else { gf_log (this->name, GF_LOG_TRACE, "readdir of %s on subvolume %s complete", @@ -1969,7 +1994,7 @@ afr_sh_entry_impunge_readdir_cbk (call_frame_t *frame, void *cookie, entry_count++; } - gf_log (this->name, GF_LOG_TRACE, + gf_log (this->name, GF_LOG_DEBUG, "readdir'ed %d entries from %s", entry_count, priv->children[active_src]->name); @@ -1985,21 +2010,24 @@ afr_sh_entry_impunge_readdir_cbk (call_frame_t *frame, void *cookie, int -afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this, - int active_src) +afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this) { afr_private_t *priv = NULL; afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; + int32_t active_src = 0; priv = this->private; local = frame->local; sh = &local->self_heal; + active_src = sh->active_source; + gf_log (this->name, GF_LOG_DEBUG, "%s: readdir from offset %zd", + 
local->loc.path, sh->offset); STACK_WIND (frame, afr_sh_entry_impunge_readdir_cbk, priv->children[active_src], priv->children[active_src]->fops->readdirp, - sh->healing_fd, sh->block_size, sh->offset); + sh->healing_fd, sh->block_size, sh->offset, NULL); return 0; } @@ -2022,7 +2050,7 @@ afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this) active_src = next_active_source (frame, this, sh->active_source); sh->active_source = active_src; - if (sh->op_failed) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { afr_sh_entry_finish (frame, this); return 0; } @@ -2037,7 +2065,7 @@ afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this) "impunging entries of %s on %s to other sinks", local->loc.path, priv->children[active_src]->name); - afr_sh_entry_impunge_subvol (frame, this, active_src); + afr_sh_entry_impunge_subvol (frame, this); return 0; } @@ -2045,7 +2073,7 @@ afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this) int afr_sh_entry_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd) + int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) { afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; @@ -2071,7 +2099,7 @@ afr_sh_entry_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->loc.path, priv->children[child_index]->name, strerror (op_errno)); - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); } } UNLOCK (&frame->lock); @@ -2079,7 +2107,7 @@ afr_sh_entry_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, call_count = afr_frame_return (frame); if (call_count == 0) { - if (sh->op_failed) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { afr_sh_entry_finish (frame, this); return 0; } @@ -2117,7 +2145,7 @@ afr_sh_entry_open (call_frame_t *frame, xlator_t *this) source = local->self_heal.source; sources = local->self_heal.sources; - sh->block_size = 65536; //131072 + sh->block_size = priv->sh_readdir_size; sh->offset 
= 0; call_count = sh->active_sinks; @@ -2139,7 +2167,7 @@ afr_sh_entry_open (call_frame_t *frame, xlator_t *this) (void *) (long) source, priv->children[source], priv->children[source]->fops->opendir, - &local->loc, fd); + &local->loc, fd, NULL); call_count--; } @@ -2156,7 +2184,7 @@ afr_sh_entry_open (call_frame_t *frame, xlator_t *this) (void *) (long) i, priv->children[i], priv->children[i]->fops->opendir, - &local->loc, fd); + &local->loc, fd, NULL); if (!--call_count) break; @@ -2211,26 +2239,36 @@ afr_sh_entry_sync_prepare (call_frame_t *frame, xlator_t *this) "merging all entries as a conservative decision", local->loc.path); + sh->actual_sh_started = _gf_true; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_SYNC_BEGIN); afr_sh_entry_open (frame, this); return 0; } -int -afr_sh_entry_fix (call_frame_t *frame, xlator_t *this) +void +afr_sh_entry_fix (call_frame_t *frame, xlator_t *this, + int32_t op_ret, int32_t op_errno) { afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; afr_private_t *priv = NULL; int source = 0; - - int nsources = 0; + int nsources = 0; + int32_t subvol_status = 0; local = frame->local; sh = &local->self_heal; priv = this->private; + if (op_ret < 0) { + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + afr_sh_set_error (sh, op_errno); + afr_sh_entry_finish (frame, this); + goto out; + } + if (sh->forced_merge) { sh->source = -1; goto heal; @@ -2239,56 +2277,39 @@ afr_sh_entry_fix (call_frame_t *frame, xlator_t *this) nsources = afr_build_sources (this, sh->xattr, sh->buf, sh->pending_matrix, sh->sources, sh->success_children, - AFR_ENTRY_TRANSACTION); - if (nsources == 0) { + AFR_ENTRY_TRANSACTION, &subvol_status, + _gf_true); + if ((subvol_status & ALL_FOOLS) || + (subvol_status & SPLIT_BRAIN)) { + gf_log (this->name, GF_LOG_INFO, "%s: Performing conservative " + "merge", local->loc.path); + source = -1; + memset (sh->sources, 0, + sizeof (*sh->sources) * priv->child_count); + } else if (nsources == 0) { gf_log (this->name, 
GF_LOG_TRACE, "No self-heal needed for %s", local->loc.path); afr_sh_entry_finish (frame, this); - return 0; + return; + } else { + source = afr_sh_select_source (sh->sources, priv->child_count); } - source = afr_sh_select_source (sh->sources, priv->child_count); - sh->source = source; afr_reset_children (sh->fresh_children, priv->child_count); afr_get_fresh_children (sh->success_children, sh->sources, sh->fresh_children, priv->child_count); - afr_inode_set_read_ctx (this, sh->inode, sh->source, - sh->fresh_children); - + if (sh->source >= 0) + afr_inode_set_read_ctx (this, sh->inode, sh->source, + sh->fresh_children); heal: afr_sh_entry_sync_prepare (frame, this); - - return 0; -} - - - -int -afr_sh_entry_lookup_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, dict_t *xattr, - struct iatt *postparent) -{ - int call_count = 0; - afr_local_t *local = NULL; - - local = frame->local; - afr_sh_common_lookup_resp_handler (frame, cookie, this, op_ret, - op_errno, inode, buf, xattr, - postparent, &local->loc); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - afr_sh_entry_fix (frame, this); - } - - return 0; +out: + return; } int @@ -2305,14 +2326,17 @@ afr_sh_post_nonblocking_entry_cbk (call_frame_t *frame, xlator_t *this) if (int_lock->lock_op_ret < 0) { gf_log (this->name, GF_LOG_ERROR, "Non Blocking entrylks " "failed for %s.", local->loc.path); - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_entry_done (frame, this); } else { gf_log (this->name, GF_LOG_DEBUG, "Non Blocking entrylks done " "for %s. 
Proceeding to FOP", local->loc.path); afr_sh_common_lookup (frame, this, &local->loc, - afr_sh_entry_lookup_cbk, _gf_false); + afr_sh_entry_fix, NULL, + AFR_LOOKUP_FAIL_CONFLICTS | + AFR_LOOKUP_FAIL_MISSING_GFIDS, + NULL); } return 0; @@ -2321,14 +2345,18 @@ afr_sh_post_nonblocking_entry_cbk (call_frame_t *frame, xlator_t *this) int afr_self_heal_entry (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; + afr_local_t *local = NULL; afr_private_t *priv = NULL; - + afr_self_heal_t *sh = NULL; priv = this->private; local = frame->local; + sh = &local->self_heal; + + sh->sh_type_in_action = AFR_SELF_HEAL_ENTRY; if (local->self_heal.do_entry_self_heal && priv->entry_self_heal) { + afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED); afr_sh_entrylk (frame, this, &local->loc, NULL, afr_sh_post_nonblocking_entry_cbk); } else { diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c index 9999fdcdb..fd5da6cfd 100644 --- a/xlators/cluster/afr/src/afr-self-heal-metadata.c +++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. 
+ This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #include <libgen.h> @@ -54,60 +45,26 @@ afr_sh_metadata_done (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; local = frame->local; sh = &local->self_heal; - priv = this->private; -// memset (sh->child_errno, 0, sizeof (int) * priv->child_count); - memset (sh->buf, 0, sizeof (struct iatt) * priv->child_count); - memset (sh->success, 0, sizeof (*sh->success) * priv->child_count); - - afr_reset_xattr (sh->xattr, priv->child_count); - if (local->govinda_gOvinda) { - gf_log (this->name, GF_LOG_INFO, - "split-brain detected, aborting selfheal of %s", + afr_sh_reset (frame, this); + if (IA_ISDIR (sh->type)) { + gf_log (this->name, GF_LOG_DEBUG, + "proceeding to entry check on %s", local->loc.path); - sh->op_failed = 1; - sh->completion_cbk (frame, this); + afr_self_heal_entry (frame, this); } else { - if (IA_ISREG (sh->type)) { - gf_log (this->name, GF_LOG_DEBUG, - "proceeding to data check on %s", - local->loc.path); - afr_self_heal_data (frame, this); - return 0; - } - - if (IA_ISDIR (sh->type)) { - gf_log (this->name, GF_LOG_DEBUG, - "proceeding to entry check on %s", - local->loc.path); - afr_self_heal_entry (frame, this); - return 0; - } - sh->completion_cbk (frame, this); + gf_log (this->name, GF_LOG_DEBUG, + "proceeding to data check on %s", + local->loc.path); + afr_self_heal_data (frame, this); } return 0; } - -int -afr_sh_metadata_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - int call_count = 0; - - call_count = afr_frame_return (frame); - - if (call_count == 0) - afr_sh_metadata_done (frame, this); - - return 0; -} - int afr_sh_inode_unlock (call_frame_t *frame, xlator_t 
*this) { @@ -131,11 +88,24 @@ afr_sh_metadata_finish (call_frame_t *frame, xlator_t *this) return 0; } +int +afr_sh_metadata_fail (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + + local = frame->local; + sh = &local->self_heal; + + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + afr_sh_metadata_finish (frame, this); + return 0; +} int afr_sh_metadata_erase_pending_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno, dict_t *xattr) + int32_t op_errno, dict_t *xattr, dict_t *xdata) { afr_local_t *local = NULL; int call_count = 0; @@ -167,85 +137,19 @@ afr_sh_metadata_erase_pending_cbk (call_frame_t *frame, void *cookie, return 0; } - int afr_sh_metadata_erase_pending (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int i = 0; - dict_t **erase_xattr = NULL; - - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - afr_sh_pending_to_delta (priv, sh->xattr, sh->delta_matrix, - sh->success, priv->child_count, - AFR_METADATA_TRANSACTION); - - erase_xattr = GF_CALLOC (sizeof (*erase_xattr), priv->child_count, - gf_afr_mt_dict_t); - if (!erase_xattr) - return -ENOMEM; - - for (i = 0; i < priv->child_count; i++) { - if (sh->xattr[i]) { - call_count++; - - erase_xattr[i] = get_new_dict(); - dict_ref (erase_xattr[i]); - } - } - - afr_sh_delta_to_xattr (priv, sh->delta_matrix, erase_xattr, - priv->child_count, AFR_METADATA_TRANSACTION); - - local->call_count = call_count; - - if (call_count == 0) { - gf_log (this->name, GF_LOG_INFO, - "metadata of %s not healed on any subvolume", - local->loc.path); - - afr_sh_metadata_finish (frame, this); - } - - for (i = 0; i < priv->child_count; i++) { - if (!erase_xattr[i]) - continue; - - gf_log (this->name, GF_LOG_TRACE, - "erasing pending flags from %s on %s", - local->loc.path, priv->children[i]->name); - - 
STACK_WIND_COOKIE (frame, afr_sh_metadata_erase_pending_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->loc, - GF_XATTROP_ADD_ARRAY, erase_xattr[i]); - if (!--call_count) - break; - } - - for (i = 0; i < priv->child_count; i++) { - if (erase_xattr[i]) { - dict_unref (erase_xattr[i]); - } - } - GF_FREE (erase_xattr); - - return 0; + afr_sh_erase_pending (frame, this, AFR_METADATA_TRANSACTION, + afr_sh_metadata_erase_pending_cbk, + afr_sh_metadata_finish); + return 0; } int afr_sh_metadata_sync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; @@ -276,8 +180,13 @@ afr_sh_metadata_sync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, call_count = afr_frame_return (frame); - if (call_count == 0) + if (call_count == 0) { + if (local->xattr_req) { + dict_unref (local->xattr_req); + local->xattr_req = NULL; + } afr_sh_metadata_erase_pending (frame, this); + } return 0; } @@ -286,9 +195,9 @@ afr_sh_metadata_sync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int afr_sh_metadata_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop) + struct iatt *preop, struct iatt *postop, dict_t *xdata) { - afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno); + afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno, xdata); return 0; } @@ -296,13 +205,93 @@ afr_sh_metadata_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int afr_sh_metadata_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno); + afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno, xdata); return 0; } +int 
+afr_sh_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + dict_t *xdata) +{ + int i = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + if (op_ret < 0) { + afr_sh_metadata_sync_cbk (frame, cookie, + this, -1, op_errno, xdata); + goto out; + } + + i = (long) cookie; + + STACK_WIND_COOKIE (frame, afr_sh_metadata_xattr_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->setxattr, + &local->loc, local->xattr_req, 0, NULL); + + out: + return 0; +} + +inline void +afr_prune_special_keys (dict_t *xattr_dict) +{ + dict_del (xattr_dict, GF_SELINUX_XATTR_KEY); +} + +inline void +afr_prune_pending_keys (dict_t *xattr_dict, afr_private_t *priv) +{ + int i = 0; + + for (; i < priv->child_count; i++) { + dict_del (xattr_dict, priv->pending_key[i]); + } +} + +int +afr_sh_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xattr, + dict_t *xdata) +{ + int i = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + if (op_ret < 0) { + afr_sh_metadata_sync_cbk (frame, cookie, + this, -1, op_errno, xdata); + goto out; + } + + afr_prune_pending_keys (xattr, priv); + + afr_prune_special_keys (xattr); + + i = (long) cookie; + + /* send removexattr in bulk via xdata */ + STACK_WIND_COOKIE (frame, afr_sh_removexattr_cbk, + cookie, + priv->children[i], + priv->children[i]->fops->removexattr, + &local->loc, "", xattr); + + out: + return 0; +} int afr_sh_metadata_sync (call_frame_t *frame, xlator_t *this, dict_t *xattr) @@ -328,9 +317,10 @@ afr_sh_metadata_sync (call_frame_t *frame, xlator_t *this, dict_t *xattr) /* * 2 calls per sink - setattr, setxattr */ - if (xattr) + if (xattr) { call_count = active_sinks * 2; - else + local->xattr_req = dict_ref (xattr); + } else call_count = active_sinks; local->call_count = call_count; @@ 
-366,18 +356,18 @@ afr_sh_metadata_sync (call_frame_t *frame, xlator_t *this, dict_t *xattr) (void *) (long) i, priv->children[i], priv->children[i]->fops->setattr, - &local->loc, &stbuf, valid); + &local->loc, &stbuf, valid, NULL); call_count--; if (!xattr) continue; - STACK_WIND_COOKIE (frame, afr_sh_metadata_xattr_cbk, + STACK_WIND_COOKIE (frame, afr_sh_getxattr_cbk, (void *) (long) i, priv->children[i], - priv->children[i]->fops->setxattr, - &local->loc, xattr, 0); + priv->children[i]->fops->getxattr, + &local->loc, NULL, NULL); call_count--; } @@ -386,17 +376,15 @@ afr_sh_metadata_sync (call_frame_t *frame, xlator_t *this, dict_t *xattr) int -afr_sh_metadata_getxattr_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xattr) +afr_sh_metadata_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xattr, + dict_t *xdata) { afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; afr_private_t *priv = NULL; int source = 0; - int i; - local = frame->local; sh = &local->self_heal; priv = this->private; @@ -411,16 +399,147 @@ afr_sh_metadata_getxattr_cbk (call_frame_t *frame, void *cookie, afr_sh_metadata_sync (frame, this, NULL); } else { - for (i = 0; i < priv->child_count; i++) { - dict_del (xattr, priv->pending_key[i]); - } - + afr_prune_pending_keys (xattr, priv); afr_sh_metadata_sync (frame, this, xattr); } return 0; } +static void +afr_set_metadata_sh_info_str (afr_local_t *local, afr_self_heal_t *sh, + xlator_t *this) +{ + afr_private_t *priv = NULL; + int i = 0; + char num[1024] = {0}; + size_t len = 0; + char *string = NULL; + size_t off = 0; + char *source_child = " from source %s to"; + char *format = " %s, "; + char *string_msg = " metadata self heal"; + char *pending_matrix_str = NULL; + int down_child_present = 0; + int unknown_child_present = 0; + char *down_subvol_1 = " down subvolume is "; + char *unknown_subvol_1 = " unknown subvolume is"; + char 
*down_subvol_2 = " down subvolumes are "; + char *unknown_subvol_2 = " unknown subvolumes are "; + int down_count = 0; + int unknown_count = 0; + + priv = this->private; + + pending_matrix_str = afr_get_pending_matrix_str (sh->pending_matrix, + this); + + if (!pending_matrix_str) + pending_matrix_str = ""; + + len += snprintf (num, sizeof (num), "%s", string_msg); + + for (i = 0; i < priv->child_count; i++) { + if ((sh->source == i) && (local->child_up[i] == 1)) { + len += snprintf (num, sizeof (num), source_child, + priv->children[i]->name); + } else if ((local->child_up[i] == 1) && (sh->sources[i] == 0)) { + len += snprintf (num, sizeof (num), format, + priv->children[i]->name); + } else if (local->child_up[i] == 0) { + len += snprintf (num, sizeof (num), format, + priv->children[i]->name); + if (!down_child_present) + down_child_present = 1; + down_count++; + } else if (local->child_up[i] == -1) { + len += snprintf (num, sizeof (num), format, + priv->children[i]->name); + if (!unknown_child_present) + unknown_child_present = 1; + unknown_count++; + } + } + + if (down_child_present) { + if (down_count > 1) { + len += snprintf (num, sizeof (num), "%s", + down_subvol_2); + } else { + len += snprintf (num, sizeof (num), "%s", + down_subvol_1); + } + } + if (unknown_child_present) { + if (unknown_count > 1) { + len += snprintf (num, sizeof (num), "%s", + unknown_subvol_2); + } else { + len += snprintf (num, sizeof (num), "%s", + unknown_subvol_1); + } + } + + len ++; + + string = GF_CALLOC (len, sizeof (char), gf_common_mt_char); + if (!string) + return; + + off += snprintf (string + off, len - off, "%s", string_msg); + for (i=0; i < priv->child_count; i++) { + if ((sh->source == i) && (local->child_up[i] == 1)) + off += snprintf (string + off, len - off, source_child, + priv->children[i]->name); + } + + for (i = 0; i < priv->child_count; i++) { + if ((local->child_up[i] == 1)&& (sh->sources[i] == 0)) + off += snprintf (string + off, len - off, format, + 
priv->children[i]->name); + } + + if (down_child_present) { + if (down_count > 1) { + off += snprintf (string + off, len - off, "%s", + down_subvol_2); + } else { + off += snprintf (string + off, len - off, "%s", + down_subvol_1); + } + } + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i] == 0) + off += snprintf (string + off, len - off, format, + priv->children[i]->name); + } + + if (unknown_child_present) { + if (unknown_count > 1) { + off += snprintf (string + off, len - off, "%s", + unknown_subvol_2); + } else { + off += snprintf (string + off, len - off, "%s", + unknown_subvol_1); + } + } + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i] == -1) + off += snprintf (string + off, len - off, format, + priv->children[i]->name); + } + + gf_asprintf (&sh->metadata_sh_info, "%s metadata %s,", string, + pending_matrix_str); + + if (pending_matrix_str && strcmp (pending_matrix_str, "")) + GF_FREE (pending_matrix_str); + + if (string && strcmp (string, "")) + GF_FREE (string); +} int afr_sh_metadata_sync_prepare (call_frame_t *frame, xlator_t *this) @@ -450,17 +569,21 @@ afr_sh_metadata_sync_prepare (call_frame_t *frame, xlator_t *this) local->loc.path, priv->children[source]->name, sh->active_sinks); + sh->actual_sh_started = _gf_true; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_SYNC_BEGIN); + afr_set_metadata_sh_info_str (local, sh, this); STACK_WIND (frame, afr_sh_metadata_getxattr_cbk, priv->children[source], priv->children[source]->fops->getxattr, - &local->loc, NULL); + &local->loc, NULL, NULL); return 0; } -int -afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this) +void +afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this, + int32_t op_ret, int32_t op_errno) { afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; @@ -473,19 +596,16 @@ afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this) sh = &local->self_heal; priv = this->private; + if (op_ret < 0) { + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); 
+ afr_sh_set_error (sh, op_errno); + afr_sh_metadata_finish (frame, this); + goto out; + } nsources = afr_build_sources (this, sh->xattr, sh->buf, sh->pending_matrix, sh->sources, sh->success_children, - AFR_METADATA_TRANSACTION); - if (nsources == 0) { - gf_log (this->name, GF_LOG_TRACE, - "No self-heal needed for %s", - local->loc.path); - - afr_sh_metadata_finish (frame, this); - return 0; - } - + AFR_METADATA_TRANSACTION, NULL, _gf_false); if ((nsources == -1) && (priv->favorite_child != -1) && (sh->child_errno[priv->favorite_child] == 0)) { @@ -502,15 +622,21 @@ afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this) } if (nsources == -1) { - gf_log (this->name, GF_LOG_ERROR, - "Unable to self-heal permissions/ownership of '%s' " - "(possible split-brain). Please fix the file on " - "all backend volumes", local->loc.path); + afr_sh_print_split_brain_log (sh->pending_matrix, this, + local->loc.path); + afr_set_split_brain (this, sh->inode, SPB, DONT_KNOW); + afr_sh_metadata_fail (frame, this); + goto out; + } - local->govinda_gOvinda = 1; + afr_set_split_brain (this, sh->inode, NO_SPB, DONT_KNOW); + if (nsources == 0) { + gf_log (this->name, GF_LOG_TRACE, + "No self-heal needed for %s", + local->loc.path); afr_sh_metadata_finish (frame, this); - return 0; + goto out; } source = afr_sh_select_source (sh->sources, priv->child_count); @@ -520,7 +646,7 @@ afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this) "No active sources found."); afr_sh_metadata_finish (frame, this); - return 0; + goto out; } sh->source = source; @@ -546,33 +672,12 @@ afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this) sh->fresh_children); } - afr_sh_metadata_sync_prepare (frame, this); - - return 0; -} - - -int -afr_sh_metadata_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, dict_t *xattr, - struct iatt *postparent) -{ - afr_local_t *local = NULL; - int call_count = 0; - - local = 
frame->local; - - afr_sh_common_lookup_resp_handler (frame, cookie, this, op_ret, - op_errno, inode, buf, xattr, - postparent, &local->loc); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - afr_sh_metadata_fix (frame, this); - - return 0; + if (sh->do_metadata_self_heal && priv->metadata_self_heal) + afr_sh_metadata_sync_prepare (frame, this); + else + afr_sh_metadata_finish (frame, this); +out: + return; } int @@ -586,9 +691,9 @@ afr_sh_metadata_post_nonblocking_inodelk_cbk (call_frame_t *frame, int_lock = &local->internal_lock; if (int_lock->lock_op_ret < 0) { - gf_log (this->name, GF_LOG_ERROR, "Non Blocking metadata " + gf_log (this->name, GF_LOG_DEBUG, "Non Blocking metadata " "inodelks failed for %s.", local->loc.path); - gf_log (this->name, GF_LOG_ERROR, "Metadata self-heal " + gf_log (this->name, GF_LOG_DEBUG, "Metadata self-heal " "failed for %s.", local->loc.path); afr_sh_metadata_done (frame, this); } else { @@ -597,7 +702,10 @@ afr_sh_metadata_post_nonblocking_inodelk_cbk (call_frame_t *frame, "inodelks done for %s. 
Proceeding to FOP", local->loc.path); afr_sh_common_lookup (frame, this, &local->loc, - afr_sh_metadata_lookup_cbk, _gf_false); + afr_sh_metadata_fix, NULL, + AFR_LOOKUP_FAIL_CONFLICTS | + AFR_LOOKUP_FAIL_MISSING_GFIDS, + NULL); } return 0; @@ -607,19 +715,22 @@ int afr_sh_metadata_lock (call_frame_t *frame, xlator_t *this) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; local = frame->local; int_lock = &local->internal_lock; + int_lock->domain = this->name; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); int_lock->transaction_lk_type = AFR_SELFHEAL_LK; int_lock->selfheal_lk_type = AFR_METADATA_SELF_HEAL_LK; afr_set_lock_number (frame, this); - int_lock->lk_flock.l_start = 0; - int_lock->lk_flock.l_len = 0; - int_lock->lk_flock.l_type = F_WRLCK; + inodelk->flock.l_start = LLONG_MAX - 1; + inodelk->flock.l_len = 0; + inodelk->flock.l_type = F_WRLCK; int_lock->lock_cbk = afr_sh_metadata_post_nonblocking_inodelk_cbk; afr_nonblocking_inodelk (frame, this); @@ -627,17 +738,29 @@ afr_sh_metadata_lock (call_frame_t *frame, xlator_t *this) return 0; } +gf_boolean_t +afr_can_start_metadata_self_heal (afr_self_heal_t *sh, afr_private_t *priv) +{ + if (sh->force_confirm_spb) + return _gf_true; + if (sh->do_metadata_self_heal && priv->metadata_self_heal) + return _gf_true; + return _gf_false; +} int afr_self_heal_metadata (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_private_t *priv = this->private; - + afr_self_heal_t *sh = &local->self_heal; local = frame->local; + sh = &local->self_heal; + sh->sh_type_in_action = AFR_SELF_HEAL_METADATA; - if (local->self_heal.do_metadata_self_heal && priv->metadata_self_heal) { + if (afr_can_start_metadata_self_heal (sh, priv)) { + afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED); afr_sh_metadata_lock (frame, this); } else { afr_sh_metadata_done (frame, this); diff --git a/xlators/cluster/afr/src/afr-self-heal.h 
b/xlators/cluster/afr/src/afr-self-heal.h index f40c06faa..7c9bc8111 100644 --- a/xlators/cluster/afr/src/afr-self-heal.h +++ b/xlators/cluster/afr/src/afr-self-heal.h @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ #ifndef __AFR_SELF_HEAL_H__ @@ -30,13 +21,6 @@ #define SIZE_GREATER(buf1,buf2) ((buf1)->ia_size > (buf2)->ia_size) int -afr_sh_has_metadata_pending (dict_t *xattr, xlator_t *this); -int -afr_sh_has_entry_pending (dict_t *xattr, xlator_t *this); -int -afr_sh_has_data_pending (dict_t *xattr, xlator_t *this); - -int afr_self_heal_entry (call_frame_t *frame, xlator_t *this); int @@ -54,5 +38,6 @@ afr_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode); int afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local, dict_t **xattr, - afr_transaction_type txn_type); + afr_transaction_type txn_type, + uuid_t gfid); #endif /* __AFR_SELF_HEAL_H__ */ diff --git a/xlators/cluster/afr/src/afr-self-heald.c b/xlators/cluster/afr/src/afr-self-heald.c index d27d9e09b..1b48a1bca 100644 --- a/xlators/cluster/afr/src/afr-self-heald.c +++ b/xlators/cluster/afr/src/afr-self-heald.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. 
+ + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef _CONFIG_H @@ -24,6 +15,1076 @@ #include "afr.h" #include "syncop.h" #include "afr-self-heald.h" +#include "afr-self-heal-common.h" +#include "protocol-common.h" +#include "event-history.h" + +typedef enum { + STOP_CRAWL_ON_SINGLE_SUBVOL = 1 +} afr_crawl_flags_t; + +typedef enum { + HEAL = 1, + INFO, + STATISTICS_TO_BE_HEALED, +} shd_crawl_op; + +typedef struct shd_dump { + dict_t *dict; + xlator_t *this; + int child; +} shd_dump_t; + +typedef struct shd_event_ { + int child; + char *path; +} shd_event_t; + +typedef struct shd_pos_ { + int child; + xlator_t *this; + afr_child_pos_t pos; +} shd_pos_t; + +typedef int +(*afr_crawl_done_cbk_t) (int ret, call_frame_t *sync_frame, void *crawl_data); + +void +afr_start_crawl (xlator_t *this, int idx, afr_crawl_type_t crawl, + process_entry_cbk_t process_entry, void *op_data, + gf_boolean_t exclusive, int crawl_flags, + afr_crawl_done_cbk_t crawl_done); + +static int +_crawl_directory (fd_t *fd, loc_t *loc, afr_crawl_data_t *crawl_data); + +/* For calling straight through (e.g. already in a synctask). */ +int +afr_find_child_position (xlator_t *this, int child, afr_child_pos_t *pos); + +/* For deferring through a new synctask. 
*/ +int +afr_syncop_find_child_position (void *data); + +static int +_loc_assign_gfid_path (loc_t *loc) +{ + int ret = -1; + char gfid_path[64] = {0}; + + if (loc->inode && !uuid_is_null (loc->inode->gfid)) { + ret = inode_path (loc->inode, NULL, (char**)&loc->path); + } else if (!uuid_is_null (loc->gfid)) { + snprintf (gfid_path, sizeof (gfid_path), "<gfid:%s>", + uuid_utoa (loc->gfid)); + loc->path = gf_strdup (gfid_path); + if (loc->path) + ret = 0; + } + return ret; +} + +void +_destroy_crawl_event_data (void *data) +{ + shd_crawl_event_t *crawl_event = NULL; + + if (!data) + goto out; + + crawl_event = (shd_crawl_event_t *)data; + GF_FREE (crawl_event->start_time_str); + GF_FREE (crawl_event->end_time_str); + +out: + return; +} + +void +_destroy_shd_event_data (void *data) +{ + shd_event_t *event = NULL; + if (!data) + goto out; + event = (shd_event_t*)data; + GF_FREE (event->path); +out: + return; +} +void +shd_cleanup_event (void *event) +{ + shd_event_t *shd_event = event; + + if (!shd_event) + goto out; + GF_FREE (shd_event->path); + GF_FREE (shd_event); +out: + return; +} + +int +afr_get_local_child (afr_self_heald_t *shd, unsigned int child_count) +{ + int i = 0; + int ret = -1; + for (i = 0; i < child_count; i++) { + if (shd->pos[i] == AFR_POS_LOCAL) { + ret = i; + break; + } + } + return ret; +} + +static int +_build_index_loc (xlator_t *this, loc_t *loc, char *name, loc_t *parent) +{ + int ret = 0; + + uuid_copy (loc->pargfid, parent->inode->gfid); + loc->path = ""; + loc->name = name; + loc->parent = inode_ref (parent->inode); + if (!loc->parent) { + loc->path = NULL; + loc_wipe (loc); + ret = -1; + } + return ret; +} + +int +_add_crawl_stats_to_dict (xlator_t *this, dict_t *output, int child, + shd_crawl_event_t *shd_event, struct timeval *tv) +{ + int ret = 0; + uint64_t count = 0; + char key[256] = {0}; + int xl_id = 0; + uint64_t healed_count = 0; + uint64_t split_brain_count = 0; + uint64_t heal_failed_count = 0; + char *start_time_str = NULL; + 
char *end_time_str = NULL; + char *crawl_type = NULL; + int progress = -1; + + healed_count = shd_event->healed_count; + split_brain_count = shd_event->split_brain_count; + heal_failed_count = shd_event->heal_failed_count; + start_time_str = shd_event->start_time_str; + end_time_str = shd_event->end_time_str; + crawl_type = shd_event->crawl_type; + + if (!start_time_str) { + ret = -1; + goto out; + } + + + ret = dict_get_int32 (output, this->name, &xl_id); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "xl does not have id"); + goto out; + } + + snprintf (key, sizeof (key), "statistics-%d-%d-count", xl_id, child); + ret = dict_get_uint64 (output, key, &count); + + snprintf (key, sizeof (key), "statistics_healed_cnt-%d-%d-%"PRIu64, + xl_id, child, count); + ret = dict_set_uint64(output, key, healed_count); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" + "healed_count to outout"); + goto out; + } + snprintf (key, sizeof (key), "statistics_sb_cnt-%d-%d-%"PRIu64, + xl_id, child, count); + ret = dict_set_uint64 (output, key, split_brain_count); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" + "split_brain_count to outout"); + goto out; + } + snprintf (key, sizeof (key), "statistics_crawl_type-%d-%d-%"PRIu64, + xl_id, child, count); + ret = dict_set_dynstr (output, key, gf_strdup (crawl_type)); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" + "crawl_type to output"); + goto out; + } + snprintf (key, sizeof (key), "statistics_heal_failed_cnt-%d-%d-%"PRIu64, + xl_id, child, count); + ret = dict_set_uint64 (output, key, heal_failed_count); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" + "healed_failed_count to outout"); + goto out; + } + snprintf (key, sizeof (key), "statistics_strt_time-%d-%d-%"PRIu64, + xl_id, child, count); + ret = dict_set_dynstr (output, key, gf_strdup(start_time_str)); + + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not 
add statistics_" + "crawl_start_time to outout"); + goto out; + } + + snprintf (key, sizeof (key), "statistics_end_time-%d-%d-%"PRIu64, + xl_id, child, count); + + if (!end_time_str) + end_time_str = "Could not determine the end time"; + ret = dict_set_dynstr (output, key, gf_strdup(end_time_str)); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" + "crawl_end_time to outout"); + goto out; + } + snprintf (key, sizeof (key), "statistics_inprogress-%d-%d-%"PRIu64, + xl_id, child, count); + + if (shd_event->crawl_inprogress == _gf_true) + progress = 1; + else + progress = 0; + + ret = dict_set_int32 (output, key, progress); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" + "inprogress to outout"); + goto out; + } + + snprintf (key, sizeof (key), "statistics-%d-%d-count",xl_id, child); + ret = dict_set_uint64 (output, key, count + 1); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not increment the " + "counter."); + goto out; + } +out: + return ret; +} + +int +_add_path_to_dict (xlator_t *this, dict_t *output, int child, char *path, + struct timeval *tv, gf_boolean_t dyn) +{ + //subkey not used for now + int ret = -1; + uint64_t count = 0; + char key[256] = {0}; + int xl_id = 0; + + ret = dict_get_int32 (output, this->name, &xl_id); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "xl does not have id"); + goto out; + } + + snprintf (key, sizeof (key), "%d-%d-count", xl_id, child); + ret = dict_get_uint64 (output, key, &count); + + snprintf (key, sizeof (key), "%d-%d-%"PRIu64, xl_id, child, count); + if (dyn) + ret = dict_set_dynstr (output, key, path); + else + ret = dict_set_str (output, key, path); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "%s: Could not add to output", + path); + goto out; + } + + if (!tv) + goto inc_count; + snprintf (key, sizeof (key), "%d-%d-%"PRIu64"-time", xl_id, + child, count); + ret = dict_set_uint32 (output, key, tv->tv_sec); + if (ret) { + gf_log (this->name, 
GF_LOG_ERROR, "%s: Could not set time", + path); + goto out; + } + +inc_count: + snprintf (key, sizeof (key), "%d-%d-count", xl_id, child); + ret = dict_set_uint64 (output, key, count + 1); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not increment count"); + goto out; + } + ret = 0; +out: + return ret; +} + +int +_get_path_from_gfid_loc (xlator_t *this, xlator_t *readdir_xl, loc_t *child, + char **fpath, gf_boolean_t *missing) +{ + dict_t *xattr = NULL; + char *path = NULL; + int ret = -1; + + ret = syncop_getxattr (readdir_xl, child, &xattr, GFID_TO_PATH_KEY); + if (ret < 0) { + if ((errno == ENOENT) && missing) + *missing = _gf_true; + goto out; + } + ret = dict_get_str (xattr, GFID_TO_PATH_KEY, &path); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get path for " + "gfid %s", uuid_utoa (child->gfid)); + goto out; + } + path = gf_strdup (path); + if (!path) { + ret = -1; + goto out; + } + ret = 0; +out: + if (!ret) + *fpath = path; + if (xattr) + dict_unref (xattr); + return ret; +} + +int +_add_event_to_dict (circular_buffer_t *cb, void *data) +{ + int ret = 0; + shd_dump_t *dump_data = NULL; + shd_event_t *shd_event = NULL; + + dump_data = data; + shd_event = cb->data; + if (shd_event->child != dump_data->child) + goto out; + ret = _add_path_to_dict (dump_data->this, dump_data->dict, + dump_data->child, shd_event->path, &cb->tv, + _gf_false); +out: + return ret; +} + +int +_add_crawl_event_statistics_to_dict (circular_buffer_t *cb, void *data) +{ + int ret = 0; + shd_dump_t *dump_data = NULL; + shd_crawl_event_t *shd_event = NULL; + + dump_data = data; + shd_event = cb->data; + ret = _add_crawl_stats_to_dict (dump_data->this, dump_data->dict, + dump_data->child, shd_event, &cb->tv); + return ret; +} + +int +_add_eh_to_dict (xlator_t *this, eh_t *eh, dict_t *dict, int child) +{ + shd_dump_t dump_data = {0}; + + dump_data.this = this; + dump_data.dict = dict; + dump_data.child = child; + eh_dump (eh, &dump_data, _add_event_to_dict); + 
return 0; +} + + +int +_add_statistics_to_dict (xlator_t *this, dict_t *dict, int child) +{ + shd_dump_t dump_data = {0}; + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + + priv = this->private; + shd = &priv->shd; + + dump_data.this = this; + dump_data.dict = dict; + dump_data.child = child; + eh_dump (shd->statistics[child], &dump_data, + _add_crawl_event_statistics_to_dict); + return 0; + +} + +void +_remove_stale_index (xlator_t *this, xlator_t *readdir_xl, + loc_t *parent, char *fname) +{ + int ret = 0; + loc_t index_loc = {0}; + + ret = _build_index_loc (this, &index_loc, fname, parent); + if (ret) + goto out; + gf_log (this->name, GF_LOG_DEBUG, "Removing stale index " + "for %s on %s", index_loc.name, readdir_xl->name); + ret = syncop_unlink (readdir_xl, &index_loc); + if(ret && (errno != ENOENT)) { + gf_log(this->name, GF_LOG_ERROR, "%s: Failed to remove index " + "on %s - %s",index_loc.name, readdir_xl->name, + strerror (errno)); + } + index_loc.path = NULL; + loc_wipe (&index_loc); +out: + return; +} + +int +_count_hard_links_under_base_indices_dir (xlator_t *this, + afr_crawl_data_t *crawl_data, + gf_dirent_t *entry, loc_t *childloc, + loc_t *parentloc, struct iatt *iattr) +{ + xlator_t *readdir_xl = crawl_data->readdir_xl; + struct iatt parent = {0}; + int ret = 0; + dict_t *output = NULL; + int xl_id = 0; + char key[256] = {0}; + int child = -1; + uint64_t hardlinks = 0; + + output = crawl_data->op_data; + child = crawl_data->child; + + ret = syncop_lookup (readdir_xl, childloc, NULL, iattr, NULL, &parent); + if (ret) + goto out; + + ret = dict_get_int32 (output, this->name, &xl_id); + if (ret) + goto out; + + snprintf (key, sizeof (key), "%d-%d-hardlinks", xl_id, child); + ret = dict_get_uint64 (output, key, &hardlinks); + + /*Removing the count of base_entry under indices/base_indicies and + * entry under indices/xattrop */ + hardlinks = hardlinks + iattr->ia_nlink - 2; + ret = dict_set_uint64 (output, key, hardlinks); + if (ret) + 
goto out; + +out: + return ret; +} + +int +_add_summary_to_dict (xlator_t *this, afr_crawl_data_t *crawl_data, + gf_dirent_t *entry, + loc_t *childloc, loc_t *parentloc, struct iatt *iattr) +{ + dict_t *output = NULL; + xlator_t *readdir_xl = NULL; + int ret = -1; + char *path = NULL; + gf_boolean_t missing = _gf_false; + char gfid_str[64] = {0}; + + if (uuid_is_null (childloc->gfid)) + goto out; + + output = crawl_data->op_data; + readdir_xl = crawl_data->readdir_xl; + + ret = _get_path_from_gfid_loc (this, readdir_xl, childloc, &path, + &missing); + if (ret == 0) { + ret = _add_path_to_dict (this, output, crawl_data->child, path, + NULL, _gf_true); + } else if (missing) { + _remove_stale_index (this, readdir_xl, parentloc, + uuid_utoa_r (childloc->gfid, gfid_str)); + } + +out: + if (ret && path) + GF_FREE (path); + return ret; +} + +void +_crawl_post_sh_action (xlator_t *this, loc_t *parent, loc_t *child, + int32_t op_ret, int32_t op_errno, dict_t *xattr_rsp, + afr_crawl_data_t *crawl_data) +{ + int ret = 0; + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + eh_t *eh = NULL; + char *path = NULL; + char gfid_str[64] = {0}; + shd_event_t *event = NULL; + int32_t sh_failed = 0; + gf_boolean_t split_brain = 0; + int32_t actual_sh_done = 0; + shd_crawl_event_t **shd_crawl_event = NULL; + + priv = this->private; + shd = &priv->shd; + if (crawl_data->crawl == INDEX) { + if ((op_ret < 0) && (op_errno == ENOENT)) { + _remove_stale_index (this, crawl_data->readdir_xl, + parent, uuid_utoa_r (child->gfid, + gfid_str)); + goto out; + } + ret = _get_path_from_gfid_loc (this, crawl_data->readdir_xl, + child, &path, NULL); + if (ret) + goto out; + } else { + path = gf_strdup (child->path); + if (!path) { + ret = -1; + goto out; + } + } + + if (xattr_rsp) { + ret = dict_get_int32 (xattr_rsp, "sh-failed", &sh_failed); + ret = dict_get_int32 (xattr_rsp, "actual-sh-done", &actual_sh_done); + } + + shd_crawl_event = (shd_crawl_event_t**)(shd->crawl_events); + + 
split_brain = afr_is_split_brain (this, child->inode); + if ((op_ret < 0 && op_errno == EIO) || split_brain) { + eh = shd->split_brain; + shd_crawl_event[crawl_data->child]->split_brain_count += 1; + } else if ((op_ret < 0) || sh_failed) { + eh = shd->heal_failed; + shd_crawl_event[crawl_data->child]->heal_failed_count += 1; + } else if (actual_sh_done == 1) { + eh = shd->healed; + shd_crawl_event[crawl_data->child]->healed_count += 1; + } + ret = -1; + + if (eh != NULL) { + event = GF_CALLOC (1, sizeof (*event), gf_afr_mt_shd_event_t); + if (!event) + goto out; + event->child = crawl_data->child; + event->path = path; + + ret = eh_save_history (eh, event); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "%s:Failed to save " + "to event history, (%d, %s)", path, op_ret, + strerror (op_errno)); + + goto out; + } + } else { + gf_log (this->name, GF_LOG_DEBUG, "%s:Self heal already done ", + path); + + } + ret = 0; +out: + if (ret && path) + GF_FREE (path); + return; +} + +int +_link_inode_update_loc (xlator_t *this, loc_t *loc, struct iatt *iattr) +{ + inode_t *link_inode = NULL; + int ret = -1; + + link_inode = inode_link (loc->inode, NULL, NULL, iattr); + if (link_inode == NULL) { + gf_log (this->name, GF_LOG_ERROR, "inode link failed " + "on the inode (%s)", uuid_utoa (iattr->ia_gfid)); + goto out; + } + inode_unref (loc->inode); + loc->inode = link_inode; + ret = 0; +out: + return ret; +} + +int +_self_heal_entry (xlator_t *this, afr_crawl_data_t *crawl_data, gf_dirent_t *entry, + loc_t *child, loc_t *parent, struct iatt *iattr) +{ + struct iatt parentbuf = {0}; + int ret = 0; + dict_t *xattr_rsp = NULL; + dict_t *xattr_req = NULL; + + xattr_req = dict_new (); + if (!xattr_req) { + errno = ENOMEM; + ret = -1; + goto out; + } + + ret = dict_set_int32 (xattr_req, "allow-sh-for-running-transaction", 1); + + gf_log (this->name, GF_LOG_DEBUG, "lookup %s", child->path); + + ret = syncop_lookup (this, child, xattr_req, + iattr, &xattr_rsp, &parentbuf); + 
_crawl_post_sh_action (this, parent, child, ret, errno, xattr_rsp, + crawl_data); + if (xattr_rsp) + dict_unref (xattr_rsp); + if (ret == 0) + ret = _link_inode_update_loc (this, child, iattr); + +out: + if (xattr_req) + dict_unref(xattr_req); + return ret; +} + +static int +afr_crawl_done (int ret, call_frame_t *sync_frame, void *data) +{ + GF_FREE (data); + STACK_DESTROY (sync_frame->root); + return 0; +} + +void +_do_self_heal_on_subvol (xlator_t *this, int child, afr_crawl_type_t crawl) +{ + afr_start_crawl (this, child, crawl, _self_heal_entry, + NULL, _gf_true, STOP_CRAWL_ON_SINGLE_SUBVOL, + afr_crawl_done); +} + +gf_boolean_t +_crawl_proceed (xlator_t *this, int child, int crawl_flags, char **reason) +{ + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + gf_boolean_t proceed = _gf_false; + char *msg = NULL; + + priv = this->private; + shd = &priv->shd; + if (!shd->enabled) { + msg = "Self-heal daemon is not enabled"; + gf_log (this->name, GF_LOG_DEBUG, "%s", msg); + goto out; + } + if (!priv->child_up[child]) { + gf_log (this->name, GF_LOG_DEBUG, "Stopping crawl for %s , " + "subvol went down", priv->children[child]->name); + msg = "Brick is Not connected"; + goto out; + } + + if (crawl_flags & STOP_CRAWL_ON_SINGLE_SUBVOL) { + if (afr_up_children_count (priv->child_up, + priv->child_count) < 2) { + gf_log (this->name, GF_LOG_DEBUG, "Stopping crawl as " + "< 2 children are up"); + msg = "< 2 bricks in replica are running"; + goto out; + } + } + proceed = _gf_true; +out: + if (reason) + *reason = msg; + return proceed; +} + +int +_do_crawl_op_on_local_subvols (xlator_t *this, afr_crawl_type_t crawl, + shd_crawl_op op, dict_t *output) +{ + afr_private_t *priv = NULL; + char *status = NULL; + char *subkey = NULL; + char key[256] = {0}; + shd_pos_t pos_data = {0}; + int op_ret = -1; + int xl_id = -1; + int i = 0; + int ret = 0; + int crawl_flags = 0; + + priv = this->private; + if (op == HEAL) + crawl_flags |= STOP_CRAWL_ON_SINGLE_SUBVOL; + + if 
(output) { + ret = dict_get_int32 (output, this->name, &xl_id); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Invalid input, " + "translator-id is not available"); + goto out; + } + } + pos_data.this = this; + subkey = "status"; + for (i = 0; i < priv->child_count; i++) { + if (_crawl_proceed (this, i, crawl_flags, &status)) { + pos_data.child = i; + /* + * We're already in a synctask in this case, so we + * don't need to defer through a second (and in fact + * that can cause deadlock). Just call straight + * through instead. + */ + ret = afr_find_child_position(pos_data.this, + pos_data.child, + &pos_data.pos); + if (ret) { + status = "Not able to find brick location"; + } else if (pos_data.pos == AFR_POS_REMOTE) { + status = "brick is remote"; + } else { + op_ret = 0; + if (op == HEAL) { + status = "Started self-heal"; + _do_self_heal_on_subvol (this, i, + crawl); + } else if (output && (op == INFO)) { + status = ""; + afr_start_crawl (this, i, INDEX, + _add_summary_to_dict, + output, _gf_false, 0, + NULL); + } else if (output && + (op == STATISTICS_TO_BE_HEALED)) { + status = ""; + afr_start_crawl (this, i, + INDEX_TO_BE_HEALED, + _count_hard_links_under_base_indices_dir, + output, _gf_false, + 0, NULL); + } + } + if (output) { + snprintf (key, sizeof (key), "%d-%d-%s", xl_id, + i, subkey); + ret = dict_set_str (output, key, status); + } + if (!op_ret && (crawl == FULL)) + break; + } + if (output) { + snprintf (key, sizeof (key), "%d-%d-%s", xl_id, i, + subkey); + ret = dict_set_str (output, key, status); + } + } +out: + return op_ret; +} + +int +_do_self_heal_on_local_subvols (xlator_t *this, afr_crawl_type_t crawl, + dict_t *output) +{ + return _do_crawl_op_on_local_subvols (this, crawl, HEAL, output); +} + +int +_get_index_summary_on_local_subvols (xlator_t *this, dict_t *output) +{ + return _do_crawl_op_on_local_subvols (this, INDEX, INFO, output); +} + +void +afr_fill_completed_crawl_statistics_to_dict (xlator_t *this, dict_t *dict) +{ + afr_private_t 
*priv = NULL; + afr_self_heald_t *shd = NULL; + int i = 0; + priv = this->private; + shd= &priv->shd; + for (i = 0; i < priv->child_count; i++) { + if (shd->pos[i] != AFR_POS_LOCAL) + continue; + _add_statistics_to_dict (this, dict, i); + } + + return ; +} + +static void +reset_crawl_event (shd_crawl_event_t *crawl_event) +{ + crawl_event->healed_count = 0; + crawl_event->split_brain_count = 0; + crawl_event->heal_failed_count = 0; + GF_FREE (crawl_event->start_time_str); + crawl_event->start_time_str = NULL; + crawl_event->end_time_str = NULL; + crawl_event->crawl_type = NULL; + crawl_event->crawl_inprogress = _gf_false; + return; +} + +static void +afr_copy_crawl_event_struct (shd_crawl_event_t *src, shd_crawl_event_t *dst) +{ + dst->healed_count = src->healed_count; + dst->split_brain_count = src->split_brain_count; + dst->heal_failed_count = src->heal_failed_count; + dst->start_time_str = gf_strdup (src->start_time_str); + dst->end_time_str = "Crawl is already in progress"; + dst->crawl_type = src->crawl_type; + dst->crawl_inprogress = _gf_true; + return; +} + +static int +afr_fill_crawl_statistics_of_running_crawl(xlator_t *this, dict_t *dict) +{ + shd_crawl_event_t *evnt = NULL; + int ret = 0; + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + int i = 0; + priv = this->private; + shd = &priv->shd; + + evnt = GF_CALLOC (1, sizeof (shd_crawl_event_t), + gf_afr_mt_shd_crawl_event_t); + if (!evnt) { + ret = -1; + goto out; + } + LOCK (&priv->lock); + { + for (i = 0; i < priv->child_count; i++) { + if (shd->pos[i] != AFR_POS_LOCAL) + continue; + + reset_crawl_event (evnt); + + if (!shd->crawl_events[i]) { + continue; + } + + afr_copy_crawl_event_struct (shd->crawl_events[i], + evnt); + _add_crawl_stats_to_dict (this, dict, i, evnt, NULL); + + } + } + UNLOCK (&priv->lock); + reset_crawl_event (evnt); + GF_FREE (evnt); + +out: + return ret; +} + +static int +_add_local_subvols_crawl_statistics_to_dict (xlator_t *this, dict_t *dict) +{ + int ret = 0; + 
afr_fill_completed_crawl_statistics_to_dict (this, dict); + ret = afr_fill_crawl_statistics_of_running_crawl (this, dict); + return ret; +} +int +_add_local_subvols_eh_to_dict (xlator_t *this, eh_t *eh, dict_t *dict) +{ + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + int i = 0; + + priv = this->private; + shd = &priv->shd; + + for (i = 0; i < priv->child_count; i++) { + if (shd->pos[i] != AFR_POS_LOCAL) + continue; + _add_eh_to_dict (this, eh, dict, i); + } + return 0; +} + +int +afr_xl_op (xlator_t *this, dict_t *input, dict_t *output) +{ + gf_xl_afr_op_t op = GF_AFR_OP_INVALID; + int ret = 0; + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + int xl_id = 0; + + priv = this->private; + shd = &priv->shd; + + ret = dict_get_int32 (input, "xl-op", (int32_t*)&op); + if (ret) + goto out; + ret = dict_get_int32 (input, this->name, &xl_id); + if (ret) + goto out; + ret = dict_set_int32 (output, this->name, xl_id); + if (ret) + goto out; + switch (op) { + case GF_AFR_OP_HEAL_INDEX: + ret = _do_self_heal_on_local_subvols (this, INDEX, output); + break; + case GF_AFR_OP_HEAL_FULL: + ret = _do_self_heal_on_local_subvols (this, FULL, output); + break; + case GF_AFR_OP_INDEX_SUMMARY: + (void)_get_index_summary_on_local_subvols (this, output); + ret = 0; + break; + case GF_AFR_OP_HEALED_FILES: + ret = _add_local_subvols_eh_to_dict (this, shd->healed, output); + break; + case GF_AFR_OP_HEAL_FAILED_FILES: + ret = _add_local_subvols_eh_to_dict (this, shd->heal_failed, + output); + break; + case GF_AFR_OP_SPLIT_BRAIN_FILES: + ret = _add_local_subvols_eh_to_dict (this, shd->split_brain, + output); + break; + case GF_AFR_OP_STATISTICS: + ret = _add_local_subvols_crawl_statistics_to_dict (this, output); + break; + case GF_AFR_OP_STATISTICS_HEAL_COUNT: + case GF_AFR_OP_STATISTICS_HEAL_COUNT_PER_REPLICA: + ret = _do_crawl_op_on_local_subvols (this, INDEX_TO_BE_HEALED, + STATISTICS_TO_BE_HEALED, + output); + break; + default: + gf_log (this->name, 
GF_LOG_ERROR, "Unknown set op %d", op); + break; + } +out: + dict_del (output, this->name); + return ret; +} + +void +afr_poll_self_heal (void *data) +{ + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + struct timespec timeout = {0}; + xlator_t *this = NULL; + long child = (long)data; + gf_timer_t *old_timer = NULL; + gf_timer_t *new_timer = NULL; + shd_pos_t pos_data = {0}; + int ret = 0; + + this = THIS; + priv = this->private; + shd = &priv->shd; + + if (shd->pos[child] == AFR_POS_UNKNOWN) { + pos_data.this = this; + pos_data.child = child; + ret = synctask_new (this->ctx->env, + afr_syncop_find_child_position, + NULL, NULL, &pos_data); + if (!ret) + shd->pos[child] = pos_data.pos; + } + if (shd->enabled && (shd->pos[child] == AFR_POS_LOCAL)) + _do_self_heal_on_subvol (this, child, INDEX); + timeout.tv_sec = shd->timeout; + timeout.tv_nsec = 0; + //notify and previous timer should be synchronized. + LOCK (&priv->lock); + { + old_timer = shd->timer[child]; + if (shd->pos[child] == AFR_POS_REMOTE) + goto unlock; + shd->timer[child] = gf_timer_call_after (this->ctx, timeout, + afr_poll_self_heal, + data); + new_timer = shd->timer[child]; + } +unlock: + UNLOCK (&priv->lock); + + if (old_timer) + gf_timer_call_cancel (this->ctx, old_timer); + if (!new_timer && (shd->pos[child] != AFR_POS_REMOTE)) { + gf_log (this->name, GF_LOG_WARNING, + "Could not create self-heal polling timer for %s", + priv->children[child]->name); + } + return; +} + +static int +afr_handle_child_up (int ret, call_frame_t *sync_frame, void *data) +{ + afr_self_heald_t *shd = NULL; + shd_pos_t *pos_data = data; + afr_private_t *priv = NULL; + + if (ret) + goto out; + + priv = pos_data->this->private; + shd = &priv->shd; + shd->pos[pos_data->child] = pos_data->pos; + if (pos_data->pos != AFR_POS_REMOTE) + afr_poll_self_heal ((void*)(long)pos_data->child); + _do_self_heal_on_local_subvols (THIS, INDEX, NULL); +out: + GF_FREE (data); + return 0; +} + +void +afr_proactive_self_heal (void 
*data) +{ + xlator_t *this = NULL; + long child = (long)data; + shd_pos_t *pos_data = NULL; + int ret = 0; + + this = THIS; + + //Position of brick could have changed and it could be local now. + //Compute the position again + pos_data = GF_CALLOC (1, sizeof (*pos_data), gf_afr_mt_pos_data_t); + if (!pos_data) + goto out; + pos_data->this = this; + pos_data->child = child; + ret = synctask_new (this->ctx->env, afr_syncop_find_child_position, + afr_handle_child_up, NULL, pos_data); + if (ret) + goto out; +out: + return; +} static int get_pathinfo_host (char *pathinfo, char *hostname, size_t size) @@ -81,380 +1142,634 @@ out: return ret; } -static int -_crawl_directory (loc_t *loc, pid_t pid) +int +afr_crawl_build_start_loc (xlator_t *this, afr_crawl_data_t *crawl_data, + loc_t *dirloc) { - xlator_t *this = NULL; - afr_private_t *priv = NULL; - fd_t *fd = NULL; - off_t offset = 0; - loc_t entry_loc = {0}; - gf_dirent_t *entry = NULL; - gf_dirent_t *tmp = NULL; - gf_dirent_t entries; - struct iatt iatt = {0}; - struct iatt parent = {0};; - char *file_path = NULL; - int ret = 0; - gf_boolean_t free_entries = _gf_false; + afr_private_t *priv = NULL; + dict_t *xattr = NULL; + void *index_gfid = NULL; + void *base_indices_holder_vgfid = NULL; + loc_t rootloc = {0}; + struct iatt iattr = {0}; + struct iatt parent = {0}; + int ret = 0; + xlator_t *readdir_xl = crawl_data->readdir_xl; - INIT_LIST_HEAD (&entries.list); - this = THIS; priv = this->private; + if (crawl_data->crawl == FULL) { + afr_build_root_loc (this, dirloc); + } else if (crawl_data->crawl == INDEX) { + afr_build_root_loc (this, &rootloc); + ret = syncop_getxattr (readdir_xl, &rootloc, &xattr, + GF_XATTROP_INDEX_GFID); + if (ret < 0) + goto out; + ret = dict_get_ptr (xattr, GF_XATTROP_INDEX_GFID, &index_gfid); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "failed to get index " + "dir gfid on %s", readdir_xl->name); + goto out; + } + if (!index_gfid) { + gf_log (this->name, GF_LOG_ERROR, "index gfid 
empty " + "on %s", readdir_xl->name); + ret = -1; + goto out; + } + uuid_copy (dirloc->gfid, index_gfid); + dirloc->path = ""; + dirloc->inode = inode_new (priv->root_inode->table); + ret = syncop_lookup (readdir_xl, dirloc, NULL, + &iattr, NULL, &parent); + if (ret < 0) { + if (errno != ENOENT) { + gf_log (this->name, GF_LOG_ERROR, "lookup " + "failed on index dir on %s - (%s)", + readdir_xl->name, strerror (errno)); + } + goto out; + } + ret = _link_inode_update_loc (this, dirloc, &iattr); + if (ret) + goto out; + } else if (crawl_data->crawl == INDEX_TO_BE_HEALED) { + afr_build_root_loc (this, &rootloc); + ret = syncop_getxattr (readdir_xl, &rootloc, &xattr, + GF_BASE_INDICES_HOLDER_GFID); + if (ret < 0) + goto out; + ret = dict_get_ptr (xattr, GF_BASE_INDICES_HOLDER_GFID, + &base_indices_holder_vgfid); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "index gfid empty " + "on %s", readdir_xl->name); + ret = -1; + goto out; + } + if (!base_indices_holder_vgfid) { + gf_log (this->name, GF_LOG_ERROR, "Base indices holder" + "virtual gfid is null on %s", readdir_xl->name); + ret = -1; + goto out; + } + uuid_copy (dirloc->gfid, base_indices_holder_vgfid); + dirloc->path = ""; + dirloc->inode = inode_new (priv->root_inode->table); + ret = syncop_lookup (readdir_xl, dirloc, NULL, &iattr, NULL, + &parent); + if (ret < 0) { + if (errno != ENOENT) { + gf_log (this->name, GF_LOG_ERROR, "lookup " + "failed for base_indices_holder dir" + " on %s - (%s)", readdir_xl->name, + strerror (errno)); - GF_ASSERT (loc->inode); + } else { + gf_log (this->name, GF_LOG_ERROR, "base_indices" + "_holder is not yet created."); + } + goto out; + } + ret = _link_inode_update_loc (this, dirloc, &iattr); + if (ret) + goto out; + } + ret = 0; +out: + if (xattr) + dict_unref (xattr); + loc_wipe (&rootloc); + return ret; +} - gf_log (this->name, GF_LOG_DEBUG, "crawling %s", loc->path); - fd = fd_create (loc->inode, pid); - if (!fd) { - gf_log (this->name, GF_LOG_ERROR, - "Failed to create fd 
for %s", loc->path); - goto out; +int +afr_crawl_opendir (xlator_t *this, afr_crawl_data_t *crawl_data, fd_t **dirfd, + loc_t *dirloc) +{ + fd_t *fd = NULL; + int ret = 0; + + if (crawl_data->crawl == FULL) { + fd = fd_create (dirloc->inode, crawl_data->pid); + if (!fd) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to create fd for %s", dirloc->path); + ret = -1; + goto out; + } + + ret = syncop_opendir (crawl_data->readdir_xl, dirloc, fd); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "opendir failed on %s", dirloc->path); + goto out; + } + } else { + fd = fd_anonymous (dirloc->inode); } + ret = 0; +out: + if (!ret) + *dirfd = fd; + return ret; +} - if (!loc->parent) { - ret = syncop_lookup (this, loc, NULL, - &iatt, NULL, &parent); +xlator_t* +afr_crawl_readdir_xl_get (xlator_t *this, afr_crawl_data_t *crawl_data) +{ + afr_private_t *priv = this->private; + + if (crawl_data->crawl == FULL) { + return this; + } else { + return priv->children[crawl_data->child]; } + return NULL; +} - ret = syncop_opendir (this, loc, fd); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "opendir failed on %s", loc->path); - goto out; +int +afr_crawl_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, + gf_dirent_t *entry, afr_crawl_data_t *crawl_data) +{ + int ret = -1; + afr_private_t *priv = NULL; + + priv = this->private; + if (crawl_data->crawl == FULL) { + ret = afr_build_child_loc (this, child, parent, entry->d_name); + } else if (crawl_data->crawl == INDEX_TO_BE_HEALED) { + ret = _build_index_loc (this, child, entry->d_name, parent); + if (ret) + goto out; + child->inode = inode_new (priv->root_inode->table); + if (!child->inode) { + ret = -1; + goto out; + } + child->path = NULL; + } else { + child->inode = inode_new (priv->root_inode->table); + if (!child->inode) + goto out; + uuid_parse (entry->d_name, child->gfid); + ret = _loc_assign_gfid_path (child); } +out: + return ret; +} - while (syncop_readdirp (this, fd, 131072, offset, &entries)) { - 
ret = 0; - free_entries = _gf_true; - if (afr_up_children_count (priv->child_up, - priv->child_count) < 2) { - gf_log (this->name, GF_LOG_ERROR, "Stopping crawl as " - "< 2 children are up"); +static int +_process_entries (xlator_t *this, loc_t *parentloc, gf_dirent_t *entries, + off_t *offset, afr_crawl_data_t *crawl_data) +{ + gf_dirent_t *entry = NULL; + gf_dirent_t *tmp = NULL; + int ret = 0; + loc_t entry_loc = {0}; + fd_t *fd = NULL; + struct iatt iattr = {0}; + + list_for_each_entry_safe (entry, tmp, &entries->list, list) { + if (!_crawl_proceed (this, crawl_data->child, + crawl_data->crawl_flags, NULL)) { ret = -1; goto out; } + *offset = entry->d_off; + if (IS_ENTRY_CWD (entry->d_name) || + IS_ENTRY_PARENT (entry->d_name)) + continue; + if ((crawl_data->crawl == FULL) && + uuid_is_null (entry->d_stat.ia_gfid)) { + gf_log (this->name, GF_LOG_WARNING, "%s/%s: No " + "gfid present skipping", + parentloc->path, entry->d_name); + continue; + } - if (list_empty (&entries.list)) + loc_wipe (&entry_loc); + ret = afr_crawl_build_child_loc (this, &entry_loc, parentloc, + entry, crawl_data); + if (ret) goto out; - list_for_each_entry_safe (entry, tmp, &entries.list, list) { - offset = entry->d_off; - if (IS_ENTRY_CWD (entry->d_name) || - IS_ENTRY_PARENT (entry->d_name)) - continue; + ret = crawl_data->process_entry (this, crawl_data, entry, + &entry_loc, parentloc, &iattr); - file_path = afr_build_file_path (loc, entry); - if (!file_path) { - ret = -1; - goto out; - } + if (crawl_data->crawl == INDEX_TO_BE_HEALED && ret) { + goto out; + } else if (ret) { + continue; + } - loc_wipe (&entry_loc); - afr_build_child_loc (loc, &entry_loc, - file_path, entry->d_name); + if ((crawl_data->crawl == INDEX) || + (crawl_data->crawl == INDEX_TO_BE_HEALED)) + continue; - gf_log (this->name, GF_LOG_DEBUG, - "found readdir entry=%s", entry->d_name); + if (!IA_ISDIR (iattr.ia_type)) + continue; + fd = NULL; + ret = afr_crawl_opendir (this, crawl_data, &fd, &entry_loc); + if (ret) + 
continue; + ret = _crawl_directory (fd, &entry_loc, crawl_data); + if (fd) + fd_unref (fd); + } + ret = 0; +out: + if ((crawl_data->crawl == INDEX_TO_BE_HEALED) && ret) { + gf_log (this->name, GF_LOG_ERROR,"Failed to get the hardlink " + "count"); + } + loc_wipe (&entry_loc); + return ret; +} - ret = syncop_lookup (this, &entry_loc, NULL, - &iatt, NULL, &parent); +static int +_crawl_directory (fd_t *fd, loc_t *loc, afr_crawl_data_t *crawl_data) +{ + xlator_t *this = NULL; + off_t offset = 0; + gf_dirent_t entries; + int ret = 0; + gf_boolean_t free_entries = _gf_false; + xlator_t *readdir_xl = crawl_data->readdir_xl; - //Don't fail the crawl if lookup fails as it - //could be because of split-brain - if (ret || (!IA_ISDIR (iatt.ia_type))) - continue; - ret = _crawl_directory (&entry_loc, pid); + INIT_LIST_HEAD (&entries.list); + this = THIS; + + GF_ASSERT (loc->inode); + + if (crawl_data->crawl == FULL) + gf_log (this->name, GF_LOG_DEBUG, "crawling %s", loc->path); + else + gf_log (this->name, GF_LOG_DEBUG, "crawling INDEX %s", + uuid_utoa (loc->gfid)); + + while (1) { + if (crawl_data->crawl == FULL) + ret = syncop_readdirp (readdir_xl, fd, 131072, offset, + NULL, &entries); + else + ret = syncop_readdir (readdir_xl, fd, 131072, offset, + &entries); + if (ret <= 0) + break; + ret = 0; + free_entries = _gf_true; + + if (!_crawl_proceed (this, crawl_data->child, + crawl_data->crawl_flags, NULL)) { + ret = -1; + goto out; } + if (list_empty (&entries.list)) + goto out; + ret = _process_entries (this, loc, &entries, &offset, + crawl_data); + if ((ret < 0) && (crawl_data->crawl == INDEX_TO_BE_HEALED)) { + goto out; + } gf_dirent_free (&entries); free_entries = _gf_false; } ret = 0; out: - if (entry_loc.path) - loc_wipe (&entry_loc); if (free_entries) gf_dirent_free (&entries); return ret; } +static char* +position_str_get (afr_child_pos_t pos) +{ + switch (pos) { + case AFR_POS_UNKNOWN: + return "unknown"; + case AFR_POS_LOCAL: + return "local"; + case AFR_POS_REMOTE: 
+ return "remote"; + } + return NULL; +} + int -afr_find_child_position (xlator_t *this, int child) +afr_find_child_position (xlator_t *this, int child, afr_child_pos_t *pos) { afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; dict_t *xattr_rsp = NULL; loc_t loc = {0}; int ret = 0; - gf_boolean_t local = _gf_false; - char *pathinfo = NULL; - afr_child_pos_t *pos = NULL; - inode_table_t *itable = NULL; + char *node_uuid = NULL; priv = this->private; - pos = &priv->shd.pos[child]; + shd = &priv->shd; - if (*pos != AFR_POS_UNKNOWN) { - goto out; - } - - //TODO: Hack to make the root_loc hack work - LOCK (&priv->lock); - { - if (!priv->root_inode) { - itable = inode_table_new (0, this); - if (!itable) - goto unlock; - priv->root_inode = inode_new (itable); - if (!priv->root_inode) - goto unlock; - } - } -unlock: - UNLOCK (&priv->lock); - - if (!priv->root_inode) { - ret = -1; - goto out; - } - afr_build_root_loc (priv->root_inode, &loc); + afr_build_root_loc (this, &loc); ret = syncop_getxattr (priv->children[child], &loc, &xattr_rsp, - GF_XATTR_PATHINFO_KEY); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "getxattr failed on child " - "%d", child); + GF_XATTR_NODE_UUID_KEY); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "getxattr failed on %s - " + "(%s)", priv->children[child]->name, strerror (errno)); goto out; } - ret = dict_get_str (xattr_rsp, GF_XATTR_PATHINFO_KEY, &pathinfo); + ret = dict_get_str (xattr_rsp, GF_XATTR_NODE_UUID_KEY, &node_uuid); if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Pathinfo key not found on " - "child %d", child); + gf_log (this->name, GF_LOG_ERROR, "node-uuid key not found on " + "child %s", priv->children[child]->name); goto out; } - ret = afr_local_pathinfo (pathinfo, &local); - if (ret) - goto out; - if (local) + if (!strcmp (node_uuid, shd->node_uuid)) *pos = AFR_POS_LOCAL; else *pos = AFR_POS_REMOTE; - gf_log (this->name, GF_LOG_INFO, "child %d is %d", child, *pos); + gf_log (this->name, GF_LOG_DEBUG, "child %s 
is %s", + priv->children[child]->name, position_str_get (*pos)); out: + if (ret) + *pos = AFR_POS_UNKNOWN; + loc_wipe (&loc); return ret; } -static int -afr_crawl_done (int ret, call_frame_t *sync_frame, void *data) +int +afr_syncop_find_child_position (void *data) { - GF_FREE (data); - STACK_DESTROY (sync_frame->root); - return 0; + shd_pos_t *pos_data = data; + int ret = 0; + + ret = afr_find_child_position (pos_data->this, pos_data->child, + &pos_data->pos); + return ret; } static int -afr_find_all_children_postions (xlator_t *this) +afr_dir_crawl (void *data) { - int ret = -1; - int i = 0; - gf_boolean_t succeeded = _gf_false; - afr_private_t *priv = NULL; + xlator_t *this = NULL; + int ret = -1; + xlator_t *readdir_xl = NULL; + fd_t *fd = NULL; + loc_t dirloc = {0}; + afr_crawl_data_t *crawl_data = data; - priv = this->private; - for (i = 0; i < priv->child_count; i++) { - if (priv->child_up[i] != 1) - continue; - ret = afr_find_child_position (this, i); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "Failed to determine if the " - "child %s is local.", - priv->children[i]->name); - continue; + this = THIS; + + if (!_crawl_proceed (this, crawl_data->child, crawl_data->crawl_flags, + NULL)) + goto out; + + readdir_xl = afr_crawl_readdir_xl_get (this, crawl_data); + if (!readdir_xl) + goto out; + crawl_data->readdir_xl = readdir_xl; + + ret = afr_crawl_build_start_loc (this, crawl_data, &dirloc); + if (ret) + goto out; + + ret = afr_crawl_opendir (this, crawl_data, &fd, &dirloc); + if (ret) { + if (crawl_data->crawl == INDEX_TO_BE_HEALED) { + gf_log (this->name, GF_LOG_ERROR, "Failed to open base_" + "indices_holder"); } - succeeded = _gf_true; + goto out; } - if (succeeded) - ret = 0; + + ret = _crawl_directory (fd, &dirloc, crawl_data); + if (ret) + gf_log (this->name, GF_LOG_ERROR, "Crawl failed on %s", + readdir_xl->name); + else + gf_log (this->name, GF_LOG_DEBUG, "Crawl completed " + "on %s", readdir_xl->name); + if (crawl_data->crawl == INDEX) + 
dirloc.path = NULL; +out: + if (fd) + fd_unref (fd); + if ((crawl_data->crawl == INDEX) || + (crawl_data->crawl == INDEX_TO_BE_HEALED )) + dirloc.path = NULL; + loc_wipe (&dirloc); return ret; } -static gf_boolean_t -afr_local_child_exists (afr_child_pos_t *pos, unsigned int child_count) +char * +get_crawl_type_in_string (afr_crawl_type_t crawl) { - int i = 0; - gf_boolean_t local = _gf_false; - - for (i = 0; i < child_count; i++, pos++) { - if (*pos == AFR_POS_LOCAL) { - local = _gf_true; - break; - } + char *index = "INDEX"; + char *full = "FULL"; + char *crawl_type = NULL; + + if (crawl == INDEX){ + crawl_type = index; + } else if (crawl == FULL) { + crawl_type = full; } - return local; + + return crawl_type; } -int -afr_init_child_position (xlator_t *this, int child) +static int +afr_allocate_crawl_event (xlator_t *this, int child, afr_crawl_type_t crawl) { - int ret = 0; + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + int ret = 0; + shd_crawl_event_t *crawl_event = NULL; + time_t get_time = 0; - if (child == AFR_ALL_CHILDREN) { - ret = afr_find_all_children_postions (this); - } else { - ret = afr_find_child_position (this, child); + priv = this->private; + shd = &priv->shd; + + crawl_event = GF_CALLOC (sizeof (shd_crawl_event_t), 1, + gf_afr_mt_shd_crawl_event_t); + if (!crawl_event) { + ret = -1; + goto out; } + + get_time = time(NULL); + if (get_time == ((time_t)-1)) { + ret = -1; + goto out; + } + + crawl_event->start_time_str = gf_strdup (ctime(&get_time)); + + crawl_event->crawl_type = get_crawl_type_in_string (crawl); + if (!crawl_event->crawl_type) { + ret = -1; + goto out; + } + LOCK (&priv->lock); + { + shd->crawl_events[child] = crawl_event; + } + UNLOCK (&priv->lock); + ret = 0; +out: return ret; + } -int -afr_is_local_child (afr_self_heald_t *shd, int child, unsigned int child_count) +static int +afr_put_crawl_event_in_eh (xlator_t *this, int child) { - gf_boolean_t local = _gf_false; + afr_private_t *priv = NULL; + afr_self_heald_t 
*shd = NULL; + int ret = 0; + time_t get_time = 0; + shd_crawl_event_t **crawl_event = NULL; - if (child == AFR_ALL_CHILDREN) - local = afr_local_child_exists (shd->pos, child_count); - else - local = (shd->pos[child] == AFR_POS_LOCAL); + priv = this->private; + shd = &priv->shd; - return local; + get_time = time(NULL); + if (get_time == ((time_t)-1)) { + ret = -1; + goto out; + } + crawl_event = (shd_crawl_event_t**)shd->crawl_events; + LOCK (&priv->lock); + { + crawl_event[child]->end_time_str = gf_strdup (ctime(&get_time)); + ret = eh_save_history (shd->statistics[child], + crawl_event[child]); + crawl_event[child] = NULL; + } + UNLOCK (&priv->lock); +out: + return ret; } static int -afr_crawl_directory (xlator_t *this, pid_t pid) +afr_dir_exclusive_crawl (void *data) { afr_private_t *priv = NULL; afr_self_heald_t *shd = NULL; - loc_t loc = {0}; gf_boolean_t crawl = _gf_false; - int ret = 0; + int ret = 0; + int child = -1; + xlator_t *this = NULL; + afr_crawl_data_t *crawl_data = data; + this = THIS; priv = this->private; shd = &priv->shd; - + child = crawl_data->child; LOCK (&priv->lock); { - if (shd->inprogress) { - shd->pending = _gf_true; + if (shd->inprogress[child]) { + if (shd->pending[child] != FULL) + shd->pending[child] = crawl_data->crawl; } else { - shd->inprogress = _gf_true; + shd->inprogress[child] = _gf_true; crawl = _gf_true; } } UNLOCK (&priv->lock); - if (!priv->root_inode) { - ret = -1; + if (!crawl) { + gf_log (this->name, GF_LOG_INFO, "Another crawl is in progress " + "for %s", priv->children[child]->name); goto out; } - if (!crawl) - goto out; - - afr_build_root_loc (priv->root_inode, &loc); - while (crawl) { - ret = _crawl_directory (&loc, pid); + do { + ret = afr_allocate_crawl_event (this, child, crawl_data->crawl); if (ret) - gf_log (this->name, GF_LOG_ERROR, "Crawl failed"); - else - gf_log (this->name, GF_LOG_INFO, "Crawl completed"); + goto out; + afr_dir_crawl (data); + + ret = afr_put_crawl_event_in_eh (this, child); + if (ret < 
0) + goto out; + LOCK (&priv->lock); { - if (shd->pending) { - shd->pending = _gf_false; + if (shd->pending[child] != NONE) { + crawl_data->crawl = shd->pending[child]; + shd->pending[child] = NONE; } else { - shd->inprogress = _gf_false; + shd->inprogress[child] = _gf_false; crawl = _gf_false; } } UNLOCK (&priv->lock); - } -out: - return ret; -} - -static int -afr_crawl (void *data) -{ - xlator_t *this = NULL; - afr_private_t *priv = NULL; - afr_self_heald_t *shd = NULL; - int ret = -1; - afr_crawl_data_t *crawl_data = data; - - this = THIS; - priv = this->private; - shd = &priv->shd; - - ret = afr_init_child_position (this, crawl_data->child); - if (ret) - goto out; - - if (!afr_is_local_child (shd, crawl_data->child, priv->child_count)) - goto out; - - ret = afr_crawl_directory (this, crawl_data->pid); + } while (crawl); out: return ret; } void -afr_proactive_self_heal (xlator_t *this, int idx) +afr_start_crawl (xlator_t *this, int idx, afr_crawl_type_t crawl, + process_entry_cbk_t process_entry, void *op_data, + gf_boolean_t exclusive, int crawl_flags, + afr_crawl_done_cbk_t crawl_done) { afr_private_t *priv = NULL; - afr_self_heald_t *shd = NULL; call_frame_t *frame = NULL; afr_crawl_data_t *crawl_data = NULL; int ret = 0; + int (*crawler) (void*) = NULL; priv = this->private; - shd = &priv->shd; - if (!shd->enabled) - goto out; - - if ((idx != AFR_ALL_CHILDREN) && - (shd->pos[idx] == AFR_POS_REMOTE)) - goto out; frame = create_frame (this, this->ctx->pool); if (!frame) goto out; - afr_set_lk_owner (frame, this); + afr_set_lk_owner (frame, this, frame->root); afr_set_low_priority (frame); crawl_data = GF_CALLOC (1, sizeof (*crawl_data), - gf_afr_mt_afr_crawl_data_t); + gf_afr_mt_crawl_data_t); if (!crawl_data) goto out; + crawl_data->process_entry = process_entry; crawl_data->child = idx; crawl_data->pid = frame->root->pid; - gf_log (this->name, GF_LOG_INFO, "starting crawl for %d", idx); - ret = synctask_new (this->ctx->env, afr_crawl, - afr_crawl_done, 
frame, crawl_data); + crawl_data->crawl = crawl; + crawl_data->op_data = op_data; + crawl_data->crawl_flags = crawl_flags; + gf_log (this->name, GF_LOG_DEBUG, "starting crawl %d for %s", + crawl_data->crawl, priv->children[idx]->name); + + if (exclusive) + crawler = afr_dir_exclusive_crawl; + else + crawler = afr_dir_crawl; + ret = synctask_new (this->ctx->env, crawler, + crawl_done, frame, crawl_data); if (ret) - gf_log (this->name, GF_LOG_ERROR, "Could not create the " - "task for %d ret %d", idx, ret); + gf_log (this->name, GF_LOG_ERROR, "afr crawl failed for child" + " %d with ret %d", idx, ret); out: return; } -//TODO: This is a hack void -afr_build_root_loc (inode_t *inode, loc_t *loc) +afr_build_root_loc (xlator_t *this, loc_t *loc) { - loc->path = "/"; - loc->name = ""; - loc->inode = inode; - loc->ino = 1; - loc->inode->ino = 1; - loc->inode->ia_type = IA_IFDIR; - memset (loc->inode->gfid, 0, 16); - loc->inode->gfid[15] = 1; + afr_private_t *priv = NULL; + priv = this->private; + loc->path = gf_strdup ("/"); + loc->name = ""; + loc->inode = inode_ref (priv->root_inode); + uuid_copy (loc->gfid, loc->inode->gfid); } int @@ -470,43 +1785,3 @@ afr_set_root_gfid (dict_t *dict) return ret; } - -char * -afr_build_file_path (loc_t *loc, gf_dirent_t *entry) -{ - xlator_t *this = NULL; - char *file_path = NULL; - int pathlen = 0; - size_t total_size = 0; - char *fmt = NULL; - - this = THIS; - - pathlen = STRLEN_0 (loc->path); - - if (IS_ROOT_PATH (loc->path)) { - total_size = pathlen + entry->d_len; - fmt = "%s%s"; - } else { - total_size = pathlen + entry->d_len + 1; /* for the extra '/' in the path */ - fmt = "%s/%s"; - } - - file_path = GF_CALLOC (1, total_size + 1, gf_afr_mt_char); - if (!file_path) - goto out; - - snprintf(file_path, total_size, fmt, loc->path, entry->d_name); -out: - return file_path; -} - -void -afr_build_child_loc (loc_t *parent, loc_t *child, char *path, char *name) -{ - child->path = path; - child->name = name; - - child->parent = 
inode_ref (parent->inode); - child->inode = inode_new (parent->inode->table); -} diff --git a/xlators/cluster/afr/src/afr-self-heald.h b/xlators/cluster/afr/src/afr-self-heald.h index c85c97b25..e0c083754 100644 --- a/xlators/cluster/afr/src/afr-self-heald.h +++ b/xlators/cluster/afr/src/afr-self-heald.h @@ -1,20 +1,11 @@ /* - Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ #ifndef __AFR_SELF_HEALD_H__ @@ -27,18 +18,48 @@ #define AFR_ALL_CHILDREN -1 typedef struct afr_crawl_data_ { - int child; - pid_t pid; + int child; + pid_t pid; + afr_crawl_type_t crawl; + xlator_t *readdir_xl; + void *op_data; + int crawl_flags; + int (*process_entry) (xlator_t *this, struct afr_crawl_data_ *crawl_data, + gf_dirent_t *entry, loc_t *child, loc_t *parent, + struct iatt *iattr); } afr_crawl_data_t; -void afr_proactive_self_heal (xlator_t *this, int idx); +typedef struct crawl_event_stats_ { + uint64_t healed_count; + uint64_t split_brain_count; + uint64_t heal_failed_count; + char *start_time_str; + char *end_time_str; + char *crawl_type; + gf_boolean_t crawl_inprogress; +} shd_crawl_event_t; -void afr_build_root_loc (inode_t *inode, loc_t *loc); +void _destroy_crawl_event_data (void *data); +void _destroy_shd_event_data (void *data); -int afr_set_root_gfid (dict_t *dict); +typedef int (*process_entry_cbk_t) (xlator_t *this, afr_crawl_data_t *crawl_data, + gf_dirent_t *entry, loc_t *child, loc_t *parent, + struct iatt *iattr); -char * afr_build_file_path (loc_t *loc, gf_dirent_t *entry); +void afr_build_root_loc (xlator_t *this, loc_t *loc); + +int afr_set_root_gfid (dict_t *dict); void -afr_build_child_loc (loc_t *parent, loc_t *child, char *path, char *name); +afr_proactive_self_heal (void *data); + +int +afr_xl_op (xlator_t *this, dict_t *input, dict_t *output); + +/* + * In addition to its self-heal use, this is used to find a local default + * read_child. + */ +int +afr_local_pathinfo (char *pathinfo, gf_boolean_t *local); #endif /* __AFR_SELF_HEALD_H__ */ diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c index 795fec255..20306e469 100644 --- a/xlators/cluster/afr/src/afr-transaction.c +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -1,25 +1,17 @@ /* - Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. 
<http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #include "dict.h" #include "byte-order.h" #include "common-utils.h" +#include "timer.h" #include "afr.h" #include "afr-transaction.h" @@ -32,48 +24,75 @@ of RENAME */ #define LOCKED_LOWER 0x2 /* for lower_path of RENAME */ - afr_fd_ctx_t * -afr_fd_ctx_get (fd_t *fd, xlator_t *this) +__afr_fd_ctx_get (fd_t *fd, xlator_t *this) { uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; int ret = 0; + afr_fd_ctx_t *fd_ctx = NULL; + int i = 0; + afr_private_t *priv = NULL; - ret = fd_ctx_get (fd, this, &ctx); + priv = this->private; - if (ret < 0) - goto out; + ret = __fd_ctx_get (fd, this, &ctx); - fd_ctx = (afr_fd_ctx_t *)(long) ctx; + if (ret < 0 && fd_is_anonymous (fd)) { + ret = __afr_fd_ctx_set (this, fd); + if (ret < 0) + goto out; + + ret = __fd_ctx_get (fd, this, &ctx); + if (ret < 0) + goto out; + + fd_ctx = (afr_fd_ctx_t *)(long) ctx; + for (i = 0; i < priv->child_count; i++) + fd_ctx->opened_on[i] = AFR_FD_OPENED; + } + fd_ctx = (afr_fd_ctx_t *)(long) ctx; out: return fd_ctx; } +afr_fd_ctx_t * +afr_fd_ctx_get (fd_t *fd, xlator_t *this) 
+{ + afr_fd_ctx_t *fd_ctx = NULL; + + LOCK(&fd->lock); + { + fd_ctx = __afr_fd_ctx_get (fd, this); + } + UNLOCK(&fd->lock); + + return fd_ctx; +} + + static void -afr_pid_save (call_frame_t *frame) +afr_save_lk_owner (call_frame_t *frame) { afr_local_t * local = NULL; local = frame->local; - local->saved_pid = frame->root->pid; + local->saved_lk_owner = frame->root->lk_owner; } static void -afr_pid_restore (call_frame_t *frame) +afr_restore_lk_owner (call_frame_t *frame) { afr_local_t * local = NULL; local = frame->local; - frame->root->pid = local->saved_pid; + frame->root->lk_owner = local->saved_lk_owner; } - static void __mark_all_pending (int32_t *pending[], int child_count, afr_transaction_type type) @@ -126,51 +145,23 @@ out: return; } - static void -__mark_pre_op_undone_on_fd (call_frame_t *frame, xlator_t *this, int child_index) -{ - afr_local_t *local = NULL; - afr_fd_ctx_t *fd_ctx = NULL; - - local = frame->local; - - if (!local->fd) - return; - - fd_ctx = afr_fd_ctx_get (local->fd, this); - - if (!fd_ctx) - goto out; - - LOCK (&local->fd->lock); - { - if (local->transaction.type == AFR_DATA_TRANSACTION) - fd_ctx->pre_op_done[child_index]--; - } - UNLOCK (&local->fd->lock); -out: - return; -} - - -static void -__mark_down_children (int32_t *pending[], int child_count, - unsigned char *child_up, afr_transaction_type type) +__mark_non_participant_children (int32_t *pending[], int child_count, + unsigned char *participants, + afr_transaction_type type) { int i = 0; int j = 0; + j = afr_index_for_transaction_type (type); for (i = 0; i < child_count; i++) { - j = afr_index_for_transaction_type (type); - - if (!child_up[i]) + if (!participants[i]) pending[i][j] = 0; } } -static void +void __mark_all_success (int32_t *pending[], int child_count, afr_transaction_type type) { @@ -183,6 +174,54 @@ __mark_all_success (int32_t *pending[], int child_count, } } +void +_set_all_child_errno (int *child_errno, unsigned int child_count) +{ + int i = 0; + + for (i = 0; i < 
child_count; i++) + if (child_errno[i] == 0) + child_errno[i] = ENOTCONN; +} + +void +afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + fd_t *fd = NULL; + + local = frame->local; + priv = this->private; + fd = local->fd; + + __mark_all_success (local->pending, priv->child_count, + local->transaction.type); + + _set_all_child_errno (local->child_errno, priv->child_count); + + /* Perform fops with the lk-owner from top xlator. + * Eg: lk-owner of posix-lk and flush should be same, + * flush cant clear the posix-lks without that lk-owner. + */ + afr_save_lk_owner (frame); + frame->root->lk_owner = + local->transaction.main_frame->root->lk_owner; + + + /* The wake up needs to happen independent of + what type of fop arrives here. If it was + a write, then it has already inherited the + lock and changelog. If it was not a write, + then the presumption of the optimization (of + optimizing for successive write operations) + fails. 
+ */ + if (fd) + afr_delayed_changelog_wake_up (this, fd); + local->transaction.fop (frame, this); +} + static int __changelog_enabled (afr_private_t *priv, afr_transaction_type type) @@ -215,39 +254,7 @@ __changelog_enabled (afr_private_t *priv, afr_transaction_type type) static int -__changelog_needed_pre_op (call_frame_t *frame, xlator_t *this) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - - int op_ret = 0; - - priv = this->private; - local = frame->local; - - if (__changelog_enabled (priv, local->transaction.type)) { - switch (local->op) { - - case GF_FOP_WRITE: - case GF_FOP_FTRUNCATE: - op_ret = 1; - break; - - case GF_FOP_FLUSH: - op_ret = 0; - break; - - default: - op_ret = 1; - } - } - - return op_ret; -} - - -static int -__changelog_needed_post_op (call_frame_t *frame, xlator_t *this) +__fop_changelog_needed (call_frame_t *frame, xlator_t *this) { afr_private_t * priv = NULL; afr_local_t * local = NULL; @@ -278,64 +285,42 @@ __changelog_needed_post_op (call_frame_t *frame, xlator_t *this) return op_ret; } - -static int -afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending) +int +afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending, + int child, afr_xattrop_type_t op) { int i = 0; int ret = 0; + if (op == LOCAL_FIRST) { + ret = dict_set_static_bin (xattr, priv->pending_key[child], + pending[child], + AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); + if (ret) + goto out; + } for (i = 0; i < priv->child_count; i++) { + if (i == child) + continue; ret = dict_set_static_bin (xattr, priv->pending_key[i], - pending[i], 3 * sizeof (int32_t)); + pending[i], + AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); /* 3 = data+metadata+entry */ if (ret < 0) goto out; } - -out: - return ret; -} - - -static int -afr_set_piggyback_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending, - afr_transaction_type type) -{ - int i = 0; - int ret = 0; - int *arr = NULL; - int index = 0; - size_t pending_xattr_size = 3 * sizeof 
(int32_t); - /* 3 = data+metadata+entry */ - - index = afr_index_for_transaction_type (type); - - for (i = 0; i < priv->child_count; i++) { - arr = GF_CALLOC (1, pending_xattr_size, - gf_afr_mt_char); - if (!arr) { - ret = -1; - goto out; - } - - memcpy (arr, pending[i], pending_xattr_size); - - arr[index] = hton32 (ntoh32(arr[index]) + 1); - - ret = dict_set_bin (xattr, priv->pending_key[i], - arr, pending_xattr_size); - - if (ret < 0) + if (op == LOCAL_LAST) { + ret = dict_set_static_bin (xattr, priv->pending_key[child], + pending[child], + AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); + if (ret) goto out; } - out: return ret; } - int afr_lock_server_count (afr_private_t *priv, afr_transaction_type type) { @@ -363,20 +348,18 @@ afr_lock_server_count (afr_private_t *priv, afr_transaction_type type) int32_t afr_changelog_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xattr) + int32_t op_ret, int32_t op_errno, dict_t *xattr, + dict_t *xdata) { afr_internal_lock_t *int_lock = NULL; afr_private_t *priv = NULL; afr_local_t *local = NULL; - int child_index = 0; int call_count = -1; priv = this->private; local = frame->local; int_lock = &local->internal_lock; - child_index = (long) cookie; - LOCK (&frame->lock); { call_count = --local->call_count; @@ -384,6 +367,11 @@ afr_changelog_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, UNLOCK (&frame->lock); if (call_count == 0) { + if (local->transaction.resume_stub) { + call_resume (local->transaction.resume_stub); + local->transaction.resume_stub = NULL; + } + if (afr_lock_server_count (priv, local->transaction.type) == 0) { local->transaction.done (frame, this); } else { @@ -417,83 +405,246 @@ afr_transaction_rm_stale_children (call_frame_t *frame, xlator_t *this, local = frame->local; pending = local->pending; - stale_children = afr_children_create (priv->child_count); - if (!stale_children) + if (local->op_ret < 0) goto out; - fresh_children = 
local->fresh_children; read_child = afr_inode_get_read_ctx (this, inode, fresh_children); - - GF_ASSERT (read_child >= 0); - - if (pending[read_child][idx] == 0) - read_child = -1; + if (read_child < 0) { + gf_log (this->name, GF_LOG_DEBUG, "Possible split-brain " + "for %s", uuid_utoa (inode->gfid)); + goto out; + } for (i = 0; i < priv->child_count; i++) { if (!afr_is_child_present (fresh_children, priv->child_count, i)) continue; - if (pending[i][idx] == 0) { - /* child is down or op failed on it */ - rm_stale_children = _gf_true; - afr_children_rm_child (fresh_children, i, - priv->child_count); - stale_children[count++] = i; - } - } + if (pending[i][idx]) + continue; + /* child is down or op failed on it */ + if (!stale_children) + stale_children = afr_children_create (priv->child_count); + if (!stale_children) + goto out; - if (!rm_stale_children) { - GF_ASSERT (read_child >= 0); - goto out; + rm_stale_children = _gf_true; + stale_children[count++] = i; + gf_log (this->name, GF_LOG_DEBUG, "Removing stale child " + "%d for %s", i, uuid_utoa (inode->gfid)); } - if (fresh_children[0] == -1) { - //All children failed. 
leave as-is + if (!rm_stale_children) goto out; - } - if (read_child == -1) - read_child = fresh_children[0]; - afr_inode_rm_stale_children (this, inode, read_child, stale_children); + afr_inode_rm_stale_children (this, inode, stale_children); out: - if (stale_children) - GF_FREE (stale_children); + GF_FREE (stale_children); return; } -int -afr_fxattrop_call_count (afr_transaction_type type, afr_internal_lock_t *int_lock, - unsigned int child_count) +afr_inodelk_t* +afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom) { - int call_count = 0; + afr_inodelk_t *inodelk = NULL; + int i = 0; + for (i = 0; int_lock->inodelk[i].domain; i++) { + inodelk = &int_lock->inodelk[i]; + if (strcmp (dom, inodelk->domain) == 0) + return inodelk; + } + return NULL; +} + +unsigned char* +afr_locked_nodes_get (afr_transaction_type type, afr_internal_lock_t *int_lock) +{ + unsigned char *locked_nodes = NULL; + afr_inodelk_t *inodelk = NULL; switch (type) { case AFR_DATA_TRANSACTION: case AFR_METADATA_TRANSACTION: - call_count = afr_locked_children_count (int_lock->inode_locked_nodes, - child_count); + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + locked_nodes = inodelk->locked_nodes; break; case AFR_ENTRY_TRANSACTION: case AFR_ENTRY_RENAME_TRANSACTION: - call_count = afr_locked_children_count (int_lock->entry_locked_nodes, - child_count); + /*Because same set of subvols participate in all lockee + * entities*/ + locked_nodes = int_lock->lockee[0].locked_nodes; break; } + return locked_nodes; +} - if (type == AFR_ENTRY_RENAME_TRANSACTION) { +int +afr_changelog_pre_op_call_count (afr_transaction_type type, + afr_internal_lock_t *int_lock, + unsigned int child_count) +{ + int call_count = 0; + unsigned char *locked_nodes = NULL; + + locked_nodes = afr_locked_nodes_get (type, int_lock); + GF_ASSERT (locked_nodes); + + call_count = afr_locked_children_count (locked_nodes, child_count); + if (type == AFR_ENTRY_RENAME_TRANSACTION) call_count *= 2; - } + return call_count; } 
+int +afr_changelog_post_op_call_count (afr_transaction_type type, + unsigned char *pre_op, + unsigned int child_count) +{ + int call_count = 0; + + call_count = afr_pre_op_done_children_count (pre_op, child_count); + if (type == AFR_ENTRY_RENAME_TRANSACTION) + call_count *= 2; + + return call_count; +} + +void +afr_compute_txn_changelog (afr_local_t *local, afr_private_t *priv) +{ + int i = 0; + int index = 0; + int32_t postop = 0; + int32_t preop = 1; + int32_t **txn_changelog = NULL; + + txn_changelog = local->transaction.txn_changelog; + index = afr_index_for_transaction_type (local->transaction.type); + for (i = 0; i < priv->child_count; i++) { + postop = ntoh32 (local->pending[i][index]); + txn_changelog[i][index] = hton32 (postop + preop); + } +} + +afr_xattrop_type_t +afr_get_postop_xattrop_type (int32_t **pending, int optimized, int child, + afr_transaction_type type) +{ + int index = 0; + afr_xattrop_type_t op = LOCAL_LAST; + + index = afr_index_for_transaction_type (type); + if (optimized && !pending[child][index]) + op = LOCAL_FIRST; + return op; +} + +void +afr_set_postop_dict (afr_local_t *local, xlator_t *this, dict_t *xattr, + int optimized, int child) +{ + int32_t **txn_changelog = NULL; + int32_t **changelog = NULL; + afr_private_t *priv = NULL; + int ret = 0; + afr_xattrop_type_t op = LOCAL_LAST; + + priv = this->private; + txn_changelog = local->transaction.txn_changelog; + op = afr_get_postop_xattrop_type (local->pending, optimized, child, + local->transaction.type); + if (optimized) + changelog = txn_changelog; + else + changelog = local->pending; + ret = afr_set_pending_dict (priv, xattr, changelog, child, op); + if (ret < 0) + gf_log (this->name, GF_LOG_INFO, + "failed to set pending entry"); +} + + +gf_boolean_t +afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int index = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + index = 
afr_index_for_transaction_type (local->transaction.type); + + for (i = 0; i < priv->child_count; i++) { + if (local->pending[i][index] == 0) + return _gf_false; + } + + return _gf_true; +} + +static void +afr_dir_fop_handle_all_fop_failures (call_frame_t *frame) +{ + xlator_t *this = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + this = frame->this; + local = frame->local; + priv = this->private; + + if ((local->transaction.type != AFR_ENTRY_TRANSACTION) && + (local->transaction.type != AFR_ENTRY_RENAME_TRANSACTION)) + return; + + if (local->op_ret >= 0) + goto out; + + __mark_all_success (local->pending, priv->child_count, + local->transaction.type); +out: + return; +} + +static void +afr_data_handle_quota_errors (call_frame_t *frame, xlator_t *this) +{ + int i = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + gf_boolean_t all_quota_failures = _gf_false; + + local = frame->local; + priv = this->private; + if (local->transaction.type != AFR_DATA_TRANSACTION) + return; + /* + * Idea is to not leave the file in FOOL-FOOL scenario in case on + * all the bricks data transaction failed with EDQUOT to avoid + * increasing un-necessary load of self-heals in the system. 
+ */ + all_quota_failures = _gf_true; + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i] && + (local->child_errno[i] != EDQUOT)) { + all_quota_failures = _gf_false; + break; + } + } + if (all_quota_failures) + __mark_all_success (local->pending, priv->child_count, + local->transaction.type); +} int -afr_changelog_post_op (call_frame_t *frame, xlator_t *this) +afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this) { afr_private_t * priv = this->private; afr_internal_lock_t *int_lock = NULL; - int ret = 0; int i = 0; int call_count = 0; @@ -501,14 +652,17 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this) afr_fd_ctx_t *fdctx = NULL; dict_t **xattr = NULL; int piggyback = 0; - int index = 0; int nothing_failed = 1; local = frame->local; int_lock = &local->internal_lock; - __mark_down_children (local->pending, priv->child_count, - local->child_up, local->transaction.type); + __mark_non_participant_children (local->pending, priv->child_count, + local->transaction.pre_op, + local->transaction.type); + + afr_data_handle_quota_errors (frame, this); + afr_dir_fop_handle_all_fop_failures (frame); if (local->fd) afr_transaction_rm_stale_children (frame, this, @@ -518,12 +672,12 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this) xattr = alloca (priv->child_count * sizeof (*xattr)); memset (xattr, 0, (priv->child_count * sizeof (*xattr))); for (i = 0; i < priv->child_count; i++) { - xattr[i] = get_new_dict (); - dict_ref (xattr[i]); + xattr[i] = dict_new (); } - call_count = afr_fxattrop_call_count (local->transaction.type, int_lock, - priv->child_count); + call_count = afr_changelog_post_op_call_count (local->transaction.type, + local->transaction.pre_op, + priv->child_count); local->call_count = call_count; if (local->fd) @@ -531,94 +685,70 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this) if (call_count == 0) { /* no child is up */ - for (i = 0; i < priv->child_count; i++) { - dict_unref (xattr[i]); - 
} - int_lock->lock_cbk = local->transaction.done; afr_unlock (frame, this); - return 0; + goto out; } - /* check if something has failed, to handle piggybacking */ - nothing_failed = 1; - index = afr_index_for_transaction_type (local->transaction.type); - for (i = 0; i < priv->child_count; i++) { - if (local->pending[i][index] == 0) { - nothing_failed = 0; - break; - } - } + nothing_failed = afr_txn_nothing_failed (frame, this); - index = afr_index_for_transaction_type (local->transaction.type); - if (local->optimistic_change_log && - local->transaction.type != AFR_DATA_TRANSACTION) { - /* if nothing_failed, then local->pending[..] == {0 .. 0} */ - for (i = 0; i < priv->child_count; i++) - local->pending[i][index]++; - } + afr_compute_txn_changelog (local , priv); for (i = 0; i < priv->child_count; i++) { - if (!local->child_up[i]) - continue; - if (local->fd && !local->fd_open_on[i]) + if (!local->transaction.pre_op[i]) continue; - ret = afr_set_pending_dict (priv, xattr[i], - local->pending); - - if (ret < 0) - gf_log (this->name, GF_LOG_INFO, - "failed to set pending entry"); - - + if (local->transaction.type != AFR_DATA_TRANSACTION) + afr_set_postop_dict (local, this, xattr[i], + local->optimistic_change_log, i); switch (local->transaction.type) { case AFR_DATA_TRANSACTION: { if (!fdctx) { + afr_set_postop_dict (local, this, xattr[i], + 0, i); STACK_WIND (frame, afr_changelog_post_op_cbk, priv->children[i], priv->children[i]->fops->xattrop, &local->loc, - GF_XATTROP_ADD_ARRAY, xattr[i]); + GF_XATTROP_ADD_ARRAY, xattr[i], + NULL); break; } - LOCK (&local->fd->lock); - { - piggyback = 0; - if (fdctx->pre_op_piggyback[i]) { - fdctx->pre_op_piggyback[i]--; - piggyback = 1; - } - } - UNLOCK (&local->fd->lock); + /* local->transaction.postop_piggybacked[] was + precomputed in is_piggyback_postop() when called from + afr_changelog_post_op_safe() + */ - if (piggyback && !nothing_failed) - ret = afr_set_piggyback_dict (priv, xattr[i], - local->pending, - 
local->transaction.type); + piggyback = 0; + if (local->transaction.postop_piggybacked[i]) + piggyback = 1; + + afr_set_postop_dict (local, this, xattr[i], + piggyback, i); if (nothing_failed && piggyback) { afr_changelog_post_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i]); + this, 1, 0, xattr[i], NULL); } else { - __mark_pre_op_undone_on_fd (frame, this, i); STACK_WIND_COOKIE (frame, afr_changelog_post_op_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->fxattrop, local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i]); + GF_XATTROP_ADD_ARRAY, xattr[i], + NULL); } } break; case AFR_METADATA_TRANSACTION: { - if (nothing_failed) { + if (nothing_failed && local->optimistic_change_log) { afr_changelog_post_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i]); + this, 1, 0, xattr[i], + NULL); break; } @@ -627,28 +757,32 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this) priv->children[i], priv->children[i]->fops->fxattrop, local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i]); + GF_XATTROP_ADD_ARRAY, xattr[i], + NULL); else STACK_WIND (frame, afr_changelog_post_op_cbk, priv->children[i], priv->children[i]->fops->xattrop, &local->loc, - GF_XATTROP_ADD_ARRAY, xattr[i]); + GF_XATTROP_ADD_ARRAY, xattr[i], + NULL); } break; case AFR_ENTRY_RENAME_TRANSACTION: { - if (nothing_failed) { + if (nothing_failed && local->optimistic_change_log) { afr_changelog_post_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i]); + this, 1, 0, xattr[i], + NULL); } else { STACK_WIND_COOKIE (frame, afr_changelog_post_op_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->xattrop, &local->transaction.new_parent_loc, - GF_XATTROP_ADD_ARRAY, xattr[i]); + GF_XATTROP_ADD_ARRAY, xattr[i], + NULL); } call_count--; } @@ -661,20 +795,17 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this) value */ - ret = afr_set_pending_dict (priv, xattr[i], - local->pending); - - if (ret < 0) - gf_log (this->name, GF_LOG_INFO, - "failed to set pending entry"); + 
afr_set_postop_dict (local, this, xattr[i], + local->optimistic_change_log, i); /* fall through */ case AFR_ENTRY_TRANSACTION: { - if (nothing_failed) { + if (nothing_failed && local->optimistic_change_log) { afr_changelog_post_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i]); + this, 1, 0, xattr[i], + NULL); break; } @@ -683,13 +814,15 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this) priv->children[i], priv->children[i]->fops->fxattrop, local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i]); + GF_XATTROP_ADD_ARRAY, xattr[i], + NULL); else STACK_WIND (frame, afr_changelog_post_op_cbk, priv->children[i], priv->children[i]->fops->xattrop, &local->transaction.parent_loc, - GF_XATTROP_ADD_ARRAY, xattr[i]); + GF_XATTROP_ADD_ARRAY, xattr[i], + NULL); } break; } @@ -698,6 +831,7 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this) break; } +out: for (i = 0; i < priv->child_count; i++) { dict_unref (xattr[i]); } @@ -708,7 +842,8 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this) int32_t afr_changelog_pre_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xattr) + int32_t op_ret, int32_t op_errno, dict_t *xattr, + dict_t *xdata) { afr_local_t * local = NULL; afr_private_t * priv = this->private; @@ -719,17 +854,15 @@ afr_changelog_pre_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, LOCK (&frame->lock); { - if (op_ret == 1) { - /* special op_ret for piggyback */ - } - - if (op_ret == 0) { + switch (op_ret) { + case 0: __mark_pre_op_done_on_fd (frame, this, child_index); - } - - if (op_ret == -1) { - local->child_up[child_index] = 0; - + //fallthrough we need to mark the pre_op + case 1: + local->transaction.pre_op[child_index] = 1; + /* special op_ret for piggyback */ + break; + case -1: if (op_errno == ENOTSUP) { gf_log (this->name, GF_LOG_ERROR, "xattrop not supported by %s", @@ -743,6 +876,7 @@ afr_changelog_pre_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, 
strerror (op_errno)); } local->op_errno = op_errno; + break; } call_count = --local->call_count; @@ -754,12 +888,7 @@ afr_changelog_pre_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, (local->op_errno == ENOTSUP)) { local->transaction.resume (frame, this); } else { - __mark_all_success (local->pending, priv->child_count, - local->transaction.type); - - afr_pid_restore (frame); - - local->transaction.fop (frame, this); + afr_transaction_perform_fop (frame, this); } } @@ -778,6 +907,7 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) afr_local_t *local = NULL; int piggyback = 0; afr_internal_lock_t *int_lock = NULL; + unsigned char *locked_nodes = NULL; local = frame->local; int_lock = &local->internal_lock; @@ -786,22 +916,17 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) memset (xattr, 0, (priv->child_count * sizeof (*xattr))); for (i = 0; i < priv->child_count; i++) { - xattr[i] = get_new_dict (); - dict_ref (xattr[i]); + xattr[i] = dict_new (); } - call_count = afr_fxattrop_call_count (local->transaction.type, int_lock, - priv->child_count); + call_count = afr_changelog_pre_op_call_count (local->transaction.type, + int_lock, + priv->child_count); if (call_count == 0) { - /* no child is up */ - for (i = 0; i < priv->child_count; i++) { - dict_unref (xattr[i]); - } - local->internal_lock.lock_cbk = local->transaction.done; afr_unlock (frame, this); - return 0; + goto out; } local->call_count = call_count; @@ -812,14 +937,12 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) if (local->fd) fdctx = afr_fd_ctx_get (local->fd, this); + locked_nodes = afr_locked_nodes_get (local->transaction.type, int_lock); for (i = 0; i < priv->child_count; i++) { - if (!local->child_up[i]) - continue; - if (local->fd && !local->fd_open_on[i]) + if (!locked_nodes[i]) continue; - - ret = afr_set_pending_dict (priv, xattr[i], - local->pending); + ret = afr_set_pending_dict (priv, xattr[i], local->pending, + i, LOCAL_FIRST); if (ret < 0) 
gf_log (this->name, GF_LOG_INFO, @@ -836,7 +959,8 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) priv->children[i], priv->children[i]->fops->xattrop, &(local->loc), - GF_XATTROP_ADD_ARRAY, xattr[i]); + GF_XATTROP_ADD_ARRAY, xattr[i], + NULL); break; } @@ -853,9 +977,12 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) } UNLOCK (&local->fd->lock); + afr_set_delayed_post_op (frame, this); + if (piggyback) afr_changelog_pre_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i]); + this, 1, 0, xattr[i], + NULL); else STACK_WIND_COOKIE (frame, afr_changelog_pre_op_cbk, @@ -863,14 +990,16 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) priv->children[i], priv->children[i]->fops->fxattrop, local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i]); + GF_XATTROP_ADD_ARRAY, xattr[i], + NULL); } break; case AFR_METADATA_TRANSACTION: { if (local->optimistic_change_log) { afr_changelog_pre_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i]); + this, 1, 0, xattr[i], + NULL); break; } @@ -881,7 +1010,8 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) priv->children[i], priv->children[i]->fops->fxattrop, local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i]); + GF_XATTROP_ADD_ARRAY, xattr[i], + NULL); else STACK_WIND_COOKIE (frame, afr_changelog_pre_op_cbk, @@ -889,7 +1019,8 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) priv->children[i], priv->children[i]->fops->xattrop, &(local->loc), - GF_XATTROP_ADD_ARRAY, xattr[i]); + GF_XATTROP_ADD_ARRAY, xattr[i], + NULL); } break; @@ -897,7 +1028,8 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) { if (local->optimistic_change_log) { afr_changelog_pre_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i]); + this, 1, 0, xattr[i], + NULL); } else { STACK_WIND_COOKIE (frame, afr_changelog_pre_op_cbk, @@ -905,7 +1037,8 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) priv->children[i], priv->children[i]->fops->xattrop, &local->transaction.new_parent_loc, - 
GF_XATTROP_ADD_ARRAY, xattr[i]); + GF_XATTROP_ADD_ARRAY, xattr[i], + NULL); } call_count--; @@ -920,8 +1053,8 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) value */ - ret = afr_set_pending_dict (priv, xattr[i], - local->pending); + ret = afr_set_pending_dict (priv, xattr[i], local->pending, + i, LOCAL_FIRST); if (ret < 0) gf_log (this->name, GF_LOG_INFO, @@ -933,7 +1066,8 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) { if (local->optimistic_change_log) { afr_changelog_pre_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i]); + this, 1, 0, xattr[i], + NULL); break; } @@ -944,7 +1078,8 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) priv->children[i], priv->children[i]->fops->fxattrop, local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i]); + GF_XATTROP_ADD_ARRAY, xattr[i], + NULL); else STACK_WIND_COOKIE (frame, afr_changelog_pre_op_cbk, @@ -952,7 +1087,8 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) priv->children[i], priv->children[i]->fops->xattrop, &local->transaction.parent_loc, - GF_XATTROP_ADD_ARRAY, xattr[i]); + GF_XATTROP_ADD_ARRAY, xattr[i], + NULL); } break; } @@ -960,7 +1096,7 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) if (!--call_count) break; } - +out: for (i = 0; i < priv->child_count; i++) { dict_unref (xattr[i]); } @@ -1114,12 +1250,14 @@ int afr_set_transaction_flock (afr_local_t *local) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; int_lock = &local->internal_lock; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); - int_lock->lk_flock.l_len = local->transaction.len; - int_lock->lk_flock.l_start = local->transaction.start; - int_lock->lk_flock.l_type = F_WRLCK; + inodelk->flock.l_len = local->transaction.len; + inodelk->flock.l_start = local->transaction.start; + inodelk->flock.l_type = F_WRLCK; return 0; } @@ -1134,6 +1272,7 @@ afr_lock_rec (call_frame_t *frame, xlator_t *this) int_lock = &local->internal_lock; int_lock->transaction_lk_type 
= AFR_TRANSACTION_LK; + int_lock->domain = this->name; switch (local->transaction.type) { case AFR_DATA_TRANSACTION: @@ -1147,8 +1286,8 @@ afr_lock_rec (call_frame_t *frame, xlator_t *this) case AFR_ENTRY_RENAME_TRANSACTION: - int_lock->lock_cbk = afr_post_blocking_rename_cbk; - afr_blocking_lock (frame, this); + int_lock->lock_cbk = afr_post_nonblocking_entrylk_cbk; + afr_nonblocking_entrylk (frame, this); break; case AFR_ENTRY_TRANSACTION: @@ -1170,12 +1309,6 @@ afr_lock_rec (call_frame_t *frame, xlator_t *this) int afr_lock (call_frame_t *frame, xlator_t *this) { - afr_pid_save (frame); - - frame->root->pid = (long) frame->root; - - afr_set_lk_owner (frame, this); - afr_set_lock_number (frame, this); return afr_lock_rec (frame, this); @@ -1187,28 +1320,463 @@ afr_lock (call_frame_t *frame, xlator_t *this) int afr_internal_lock_finish (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + if (__fop_changelog_needed (frame, this)) { + afr_changelog_pre_op (frame, this); + } else { + afr_transaction_perform_fop (frame, this); + } + + return 0; +} + + +void +afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + /* call this function from any of the related optimizations + which benefit from delaying post op are enabled, namely: + + - changelog piggybacking + - eager locking + */ + + priv = this->private; + if (!priv) + return; + + if (!priv->post_op_delay_secs) + return; + + local = frame->local; + if (!local->transaction.eager_lock_on) + return; + + if (!local) + return; + + if (!local->fd) + return; + + if (local->op == GF_FOP_WRITE) + local->delayed_post_op = _gf_true; +} + +gf_boolean_t +afr_are_multiple_fds_opened (inode_t *inode, xlator_t *this) +{ + afr_inode_ctx_t *ictx = NULL; + + if (!inode) { + /* If false is returned, it may keep on taking eager-lock + * which may lead to starvation, so return true to avoid that. 
+ */ + gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid inode"); + return _gf_true; + } + /* Lets say mount1 has eager-lock(full-lock) and after the eager-lock + * is taken mount2 opened the same file, it won't be able to + * perform any data operations until mount1 releases eager-lock. + * To avoid such scenario do not enable eager-lock for this transaction + * if open-fd-count is > 1 + */ + + ictx = afr_inode_ctx_get (inode, this); + if (!ictx) + return _gf_true; + + if (ictx->open_fd_count > 1) + return _gf_true; + + return _gf_false; +} + +gf_boolean_t +afr_any_fops_failed (afr_local_t *local, afr_private_t *priv) +{ + if (local->success_count != priv->child_count) + return _gf_true; + return _gf_false; +} + +gf_boolean_t +is_afr_delayed_changelog_post_op_needed (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + gf_boolean_t res = _gf_false; + afr_private_t *priv = NULL; priv = this->private; + local = frame->local; + if (!local) + goto out; + + if (!local->delayed_post_op) + goto out; + + //Mark pending changelog ASAP + if (afr_any_fops_failed (local, priv)) + goto out; + + if (local->fd && afr_are_multiple_fds_opened (local->fd->inode, this)) + goto out; + + res = _gf_true; +out: + return res; +} - if (__changelog_needed_pre_op (frame, this)) { - afr_changelog_pre_op (frame, this); - } else { - __mark_all_success (local->pending, priv->child_count, - local->transaction.type); - afr_pid_restore (frame); +void +afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd, + call_stub_t *stub); + +void +afr_delayed_changelog_wake_up_cbk (void *data) +{ + fd_t *fd = NULL; - local->transaction.fop (frame, this); + fd = data; + + afr_delayed_changelog_wake_up (THIS, fd); +} + + +/* + Check if the frame is destined to get optimized away + with changelog piggybacking +*/ +static gf_boolean_t +is_piggyback_post_op (call_frame_t *frame, fd_t *fd) +{ + afr_fd_ctx_t *fdctx = NULL; + afr_local_t *local = NULL; + gf_boolean_t 
piggyback = _gf_true; + afr_private_t *priv = NULL; + int i = 0; + + priv = frame->this->private; + local = frame->local; + fdctx = afr_fd_ctx_get (fd, frame->this); + + LOCK(&fd->lock); + { + piggyback = _gf_true; + + for (i = 0; i < priv->child_count; i++) { + if (!local->transaction.pre_op[i]) + continue; + if (fdctx->pre_op_piggyback[i]) { + fdctx->pre_op_piggyback[i]--; + local->transaction.postop_piggybacked[i] = 1; + } else { + /* For at least _one_ subvolume we cannot + piggyback on the changelog, and have to + perform a hard POST-OP and therefore fsync + if necesssary + */ + piggyback = _gf_false; + GF_ASSERT (fdctx->pre_op_done[i]); + fdctx->pre_op_done[i]--; + } + } + } + UNLOCK(&fd->lock); + + if (!afr_txn_nothing_failed (frame, frame->this)) { + /* something failed in this transaction, + we will be performing a hard post-op + */ + return _gf_false; } + return piggyback; +} + + +/* SET operation */ +int +afr_fd_report_unstable_write (xlator_t *this, fd_t *fd) +{ + afr_fd_ctx_t *fdctx = NULL; + + fdctx = afr_fd_ctx_get (fd, this); + + LOCK(&fd->lock); + { + fdctx->witnessed_unstable_write = _gf_true; + } + UNLOCK(&fd->lock); + return 0; } +/* TEST and CLEAR operation */ +gf_boolean_t +afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd) +{ + afr_fd_ctx_t *fdctx = NULL; + gf_boolean_t witness = _gf_false; + + fdctx = afr_fd_ctx_get (fd, this); + if (!fdctx) + return _gf_true; + + LOCK(&fd->lock); + { + if (fdctx->witnessed_unstable_write) { + witness = _gf_true; + fdctx->witnessed_unstable_write = _gf_false; + } + } + UNLOCK (&fd->lock); + + return witness; +} + int +afr_changelog_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata) +{ + afr_private_t *priv = NULL; + int child_index = (long) cookie; + int call_count = -1; + afr_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + if (afr_fop_failed (op_ret, op_errno)) { + /* Failure 
of fsync() is as good as failure of previous + write(). So treat it like one. + */ + gf_log (this->name, GF_LOG_WARNING, + "fsync(%s) failed on subvolume %s. Transaction was %s", + uuid_utoa (local->fd->inode->gfid), + priv->children[child_index]->name, + gf_fop_list[local->op]); + + afr_transaction_fop_failed (frame, this, child_index); + } + + call_count = afr_frame_return (frame); + + if (call_count == 0) + afr_changelog_post_op_now (frame, this); + + return 0; +} + + +int +afr_changelog_fsync (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + int i = 0; + int call_count = 0; + afr_private_t *priv = NULL; + dict_t *xdata = NULL; + GF_UNUSED int ret = -1; + + local = frame->local; + priv = this->private; + + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); + + if (!call_count) { + /* will go straight to unlock */ + afr_changelog_post_op_now (frame, this); + return 0; + } + + local->call_count = call_count; + + xdata = dict_new(); + if (xdata) + ret = dict_set_int32 (xdata, "batch-fsync", 1); + + for (i = 0; i < priv->child_count; i++) { + if (!local->transaction.pre_op[i]) + continue; + + STACK_WIND_COOKIE (frame, afr_changelog_fsync_cbk, + (void *) (long) i, priv->children[i], + priv->children[i]->fops->fsync, local->fd, + 1, xdata); + if (!--call_count) + break; + } + + if (xdata) + dict_unref (xdata); + + return 0; +} + + + int +afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + if (!local->fd || local->transaction.type != AFR_DATA_TRANSACTION) { + afr_changelog_post_op_now (frame, this); + return 0; + } + + if (is_piggyback_post_op (frame, local->fd)) { + /* just detected that this post-op is about to + be optimized away as a new write() has + already piggybacked on this frame's changelog. 
+ */ + afr_changelog_post_op_now (frame, this); + return 0; + } + + /* Calling afr_changelog_post_op_now() now will result in + issuing ->[f]xattrop(). + + Performing a hard POST-OP (->[f]xattrop() FOP) is a more + responsible operation that what it might appear on the surface. + + The changelog of a file (in the xattr of the file on the server) + stores information (pending count) about the state of the file + on the OTHER server. This changelog is blindly trusted, and must + therefore be updated in such a way it remains trustworthy. This + implies that decrementing the pending count (essentially "clearing + the dirty flag") must be done STRICTLY after we are sure that the + operation on the other server has reached stable storage. + + While the backend filesystem on that server will eventually flush + it to stable storage, we (being in userspace) have no mechanism + to get notified when the write became "stable". + + This means we need take matter into our own hands and issue an + fsync() EVEN IF THE APPLICATION WAS PERFORMING UNSTABLE WRITES, + and get an acknowledgement for it. And we need to wait for the + fsync() acknowledgement before initiating the hard POST-OP. + + However if the FD itself was opened in O_SYNC or O_DSYNC then + we are already guaranteed that the writes were made stable as + part of the FOP itself. The same holds true for NFS stable + writes which happen on an anonymous FD with O_DSYNC or O_SYNC + flag set in the writev() @flags param. For all other write types, + mark a flag in the fdctx whenever an unstable write is witnessed. + */ + + if (!afr_fd_has_witnessed_unstable_write (this, local->fd)) { + afr_changelog_post_op_now (frame, this); + return 0; + } + + /* Check whether users want durability and perform fsync/post-op + * accordingly. 
+ */ + if (priv->ensure_durability) { + /* Time to fsync() */ + afr_changelog_fsync (frame, this); + } else { + afr_changelog_post_op_now (frame, this); + } + + return 0; +} + + +void +afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd, + call_stub_t *stub) +{ + afr_fd_ctx_t *fd_ctx = NULL; + call_frame_t *prev_frame = NULL; + struct timespec delta = {0, }; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + priv = this->private; + + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) + goto out; + + delta.tv_sec = priv->post_op_delay_secs; + delta.tv_nsec = 0; + + pthread_mutex_lock (&fd_ctx->delay_lock); + { + prev_frame = fd_ctx->delay_frame; + fd_ctx->delay_frame = NULL; + if (fd_ctx->delay_timer) + gf_timer_call_cancel (this->ctx, fd_ctx->delay_timer); + fd_ctx->delay_timer = NULL; + if (!frame) + goto unlock; + fd_ctx->delay_timer = gf_timer_call_after (this->ctx, delta, + afr_delayed_changelog_wake_up_cbk, + fd); + fd_ctx->delay_frame = frame; + } +unlock: + pthread_mutex_unlock (&fd_ctx->delay_lock); + +out: + if (prev_frame) { + local = prev_frame->local; + local->transaction.resume_stub = stub; + afr_changelog_post_op_safe (prev_frame, this); + } else if (stub) { + call_resume (stub); + } +} + + +void +afr_changelog_post_op (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + if (is_afr_delayed_changelog_post_op_needed (frame, this)) + afr_delayed_changelog_post_op (this, frame, local->fd, NULL); + else + afr_changelog_post_op_safe (frame, this); +} + + + +/* Wake up the sleeping/delayed post-op, and also register + a stub to have it resumed after this transaction + completely finishes. 
+ + The @stub gets saved in @local and gets resumed in + afr_local_cleanup() + */ +void +afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub) +{ + afr_delayed_changelog_post_op (this, NULL, fd, stub); +} + + +void +afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd) +{ + afr_delayed_changelog_post_op (this, NULL, fd, NULL); +} + + + int afr_transaction_resume (call_frame_t *frame, xlator_t *this) { afr_internal_lock_t *int_lock = NULL; @@ -1219,7 +1787,20 @@ afr_transaction_resume (call_frame_t *frame, xlator_t *this) int_lock = &local->internal_lock; priv = this->private; - if (__changelog_needed_post_op (frame, this)) { + if (local->transaction.eager_lock_on) { + /* We don't need to retain "local" in the + fd list anymore, writes to all subvols + are finished by now */ + LOCK (&local->fd->lock); + { + list_del_init (&local->transaction.eager_locked); + } + UNLOCK (&local->fd->lock); + } + + afr_restore_lk_owner (frame); + + if (__fop_changelog_needed (frame, this)) { afr_changelog_post_op (frame, this); } else { if (afr_lock_server_count (priv, local->transaction.type) == 0) { @@ -1239,7 +1820,8 @@ afr_transaction_resume (call_frame_t *frame, xlator_t *this) */ void -afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, int child_index) +afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, + int child_index) { afr_local_t * local = NULL; afr_private_t * priv = NULL; @@ -1248,7 +1830,89 @@ afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, int child_index priv = this->private; __mark_child_dead (local->pending, priv->child_count, - child_index, local->transaction.type); + child_index, local->transaction.type); +} + + + + static gf_boolean_t +afr_locals_overlap (afr_local_t *local1, afr_local_t *local2) +{ + uint64_t start1 = local1->transaction.start; + uint64_t start2 = local2->transaction.start; + uint64_t end1 = 0; + uint64_t end2 = 0; + + if (local1->transaction.len) + end1 = start1 + 
local1->transaction.len - 1; + else + end1 = ULLONG_MAX; + + if (local2->transaction.len) + end2 = start2 + local2->transaction.len - 1; + else + end2 = ULLONG_MAX; + + return ((end1 >= start2) && (end2 >= start1)); +} + +void +afr_transaction_eager_lock_init (afr_local_t *local, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_fd_ctx_t *fdctx = NULL; + afr_local_t *each = NULL; + + priv = this->private; + + if (!local->fd) + return; + + if (local->transaction.type != AFR_DATA_TRANSACTION) + return; + + if (!priv->eager_lock) + return; + + fdctx = afr_fd_ctx_get (local->fd, this); + if (!fdctx) + return; + + if (afr_are_multiple_fds_opened (local->fd->inode, this)) + return; + /* + * Once full file lock is acquired in eager-lock phase, overlapping + * writes do not compete for inode-locks, instead are transferred to the + * next writes. Because of this overlapping writes are not ordered. + * This can cause inconsistencies in replication. + * Example: + * Two overlapping writes w1, w2 are sent in parallel on same fd + * in two threads t1, t2. + * Both threads can execute afr_writev_wind in the following manner. + * t1 winds w1 on brick-0 + * t2 winds w2 on brick-0 + * t2 winds w2 on brick-1 + * t1 winds w1 on brick-1 + * + * This check makes sure the locks are not transferred for + * overlapping writes. 
+ */ + LOCK (&local->fd->lock); + { + list_for_each_entry (each, &fdctx->eager_locked, + transaction.eager_locked) { + if (afr_locals_overlap (each, local)) { + local->transaction.eager_lock_on = _gf_false; + goto unlock; + } + } + + local->transaction.eager_lock_on = _gf_true; + list_add_tail (&local->transaction.eager_locked, + &fdctx->eager_locked); + } +unlock: + UNLOCK (&local->fd->lock); } @@ -1257,20 +1921,43 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type) { afr_local_t * local = NULL; afr_private_t * priv = NULL; + fd_t *fd = NULL; + int ret = -1; local = frame->local; priv = this->private; - afr_transaction_local_init (local, this); - local->transaction.resume = afr_transaction_resume; local->transaction.type = type; + ret = afr_transaction_local_init (local, this); + if (ret < 0) + goto out; + + afr_transaction_eager_lock_init (local, this); + + if (local->fd && local->transaction.eager_lock_on) + afr_set_lk_owner (frame, this, local->fd); + else + afr_set_lk_owner (frame, this, frame->root); + + if (!local->transaction.eager_lock_on && local->loc.inode) { + fd = fd_lookup (local->loc.inode, frame->root->pid); + if (fd == NULL) + fd = fd_lookup_anonymous (local->loc.inode); + + if (fd) { + afr_delayed_changelog_wake_up (this, fd); + fd_unref (fd); + } + } + if (afr_lock_server_count (priv, local->transaction.type) == 0) { afr_internal_lock_finish (frame, this); } else { afr_lock (frame, this); } - - return 0; + ret = 0; +out: + return ret; } diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h index 10f274fec..fa626fd0d 100644 --- a/xlators/cluster/afr/src/afr-transaction.h +++ b/xlators/cluster/afr/src/afr-transaction.h @@ -1,25 +1,21 @@ /* - Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. 
- - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef __TRANSACTION_H__ #define __TRANSACTION_H__ +typedef enum { + LOCAL_FIRST = 1, + LOCAL_LAST = 2 +} afr_xattrop_type_t; + void afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, int child_index); @@ -27,9 +23,29 @@ afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, int afr_lock_server_count (afr_private_t *priv, afr_transaction_type type); +afr_inodelk_t* +afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom); + int32_t afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type); afr_fd_ctx_t * afr_fd_ctx_get (fd_t *fd, xlator_t *this); +int +afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending, + int child, afr_xattrop_type_t op); +void +afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this); + +void +afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd); + +void +__mark_all_success (int32_t *pending[], int child_count, + afr_transaction_type type); +gf_boolean_t +afr_any_fops_failed (afr_local_t 
*local, afr_private_t *priv); + +gf_boolean_t +afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this); #endif /* __TRANSACTION_H__ */ diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index 8bb94e205..c724eb2ae 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #include <libgen.h> @@ -30,6 +21,11 @@ #endif #include "afr-common.c" +#define SHD_INODE_LRU_LIMIT 2048 +#define AFR_EH_HEALED_LIMIT 1024 +#define AFR_EH_HEAL_FAIL_LIMIT 1024 +#define AFR_EH_SPLIT_BRAIN_LIMIT 1024 + struct volume_options options[]; int32_t @@ -37,8 +33,13 @@ notify (xlator_t *this, int32_t event, void *data, ...) 
{ int ret = -1; + va_list ap; + void *data2 = NULL; - ret = afr_notify (this, event, data); + va_start (ap, data); + data2 = va_arg (ap, dict_t*); + va_end (ap); + ret = afr_notify (this, event, data, data2); return ret; } @@ -85,26 +86,31 @@ xlator_subvolume_index (xlator_t *this, xlator_t *subvol) return index; } - -int -xlator_subvolume_count (xlator_t *this) +void +fix_quorum_options (xlator_t *this, afr_private_t *priv, char *qtype) { - int i = 0; - xlator_list_t *list = NULL; - - for (list = this->children; list; list = list->next) - i++; - return i; + if (priv->quorum_count && strcmp(qtype,"fixed")) { + gf_log(this->name,GF_LOG_WARNING, + "quorum-type %s overriding quorum-count %u", + qtype, priv->quorum_count); + } + if (!strcmp(qtype,"none")) { + priv->quorum_count = 0; + } + else if (!strcmp(qtype,"auto")) { + priv->quorum_count = AFR_QUORUM_AUTO; + } } - int reconfigure (xlator_t *this, dict_t *options) { - afr_private_t * priv = NULL; - xlator_t * read_subvol = NULL; - int ret = -1; - int index = -1; + afr_private_t *priv = NULL; + xlator_t *read_subvol = NULL; + int read_subvol_index = -1; + int ret = -1; + int index = -1; + char *qtype = NULL; priv = this->private; @@ -144,6 +150,9 @@ reconfigure (xlator_t *this, dict_t *options) GF_OPTION_RECONF ("read-subvolume", read_subvol, options, xlator, out); + GF_OPTION_RECONF ("read-hash-mode", priv->hash_mode, + options, uint32, out); + if (read_subvol) { index = xlator_subvolume_index (this, read_subvol); if (index == -1) { @@ -154,6 +163,38 @@ reconfigure (xlator_t *this, dict_t *options) priv->read_child = index; } + GF_OPTION_RECONF ("read-subvolume-index",read_subvol_index, options,int32,out); + + if (read_subvol_index >-1) { + index=read_subvol_index; + if (index >= priv->child_count) { + gf_log (this->name, GF_LOG_ERROR, "%d not a subvolume-index", + index); + goto out; + } + priv->read_child = index; + } + + GF_OPTION_RECONF ("eager-lock", priv->eager_lock, options, bool, out); + GF_OPTION_RECONF 
("quorum-type", qtype, options, str, out); + GF_OPTION_RECONF ("quorum-count", priv->quorum_count, options, + uint32, out); + fix_quorum_options(this,priv,qtype); + GF_OPTION_RECONF ("heal-timeout", priv->shd.timeout, options, + int32, out); + + GF_OPTION_RECONF ("post-op-delay-secs", priv->post_op_delay_secs, options, + uint32, out); + + GF_OPTION_RECONF (AFR_SH_READDIR_SIZE_KEY, priv->sh_readdir_size, + options, size, out); + /* Reset this so we re-discover in case the topology changed. */ + GF_OPTION_RECONF ("readdir-failover", priv->readdir_failover, options, + bool, out); + GF_OPTION_RECONF ("ensure-durability", priv->ensure_durability, options, + bool, out); + priv->did_discovery = _gf_false; + ret = 0; out: return ret; @@ -173,15 +214,16 @@ static const char *favorite_child_warning_str = "You have specified subvolume '% int32_t init (xlator_t *this) { - afr_private_t * priv = NULL; - int child_count = 0; - xlator_list_t * trav = NULL; - int i = 0; - int ret = -1; - int op_errno = 0; - xlator_t * read_subvol = NULL; - xlator_t * fav_child = NULL; - + afr_private_t *priv = NULL; + int child_count = 0; + xlator_list_t *trav = NULL; + int i = 0; + int ret = -1; + GF_UNUSED int op_errno = 0; + xlator_t *read_subvol = NULL; + int read_subvol_index = -1; + xlator_t *fav_child = NULL; + char *qtype = NULL; if (!this->children) { gf_log (this->name, GF_LOG_ERROR, @@ -195,9 +237,21 @@ init (xlator_t *this) "Volume is dangling."); } - ALLOC_OR_GOTO (this->private, afr_private_t, out); + this->private = GF_CALLOC (1, sizeof (afr_private_t), + gf_afr_mt_afr_private_t); + if (!this->private) + goto out; priv = this->private; + LOCK_INIT (&priv->lock); + LOCK_INIT (&priv->read_child_lock); + //lock recovery is not done in afr + pthread_mutex_init (&priv->mutex, NULL); + INIT_LIST_HEAD (&priv->saved_fds); + + child_count = xlator_subvolume_count (this); + + priv->child_count = child_count; priv->read_child = -1; @@ -210,6 +264,18 @@ init (xlator_t *this) goto out; } } + 
GF_OPTION_INIT ("read-subvolume-index",read_subvol_index,int32,out); + if (read_subvol_index > -1) { + if (read_subvol_index >= priv->child_count) { + gf_log (this->name, GF_LOG_ERROR, "%d not a subvolume-index", + read_subvol_index); + goto out; + } + priv->read_child = read_subvol_index; + } + GF_OPTION_INIT ("choose-local", priv->choose_local, bool, out); + + GF_OPTION_INIT ("read-hash-mode", priv->hash_mode, uint32, out); priv->favorite_child = -1; GF_OPTION_INIT ("favorite-child", fav_child, xlator, out); @@ -244,6 +310,8 @@ init (xlator_t *this) GF_OPTION_INIT ("self-heal-daemon", priv->shd.enabled, bool, out); + GF_OPTION_INIT ("iam-self-heal-daemon", priv->shd.iamshd, bool, out); + GF_OPTION_INIT ("data-change-log", priv->data_change_log, bool, out); GF_OPTION_INIT ("metadata-change-log", priv->metadata_change_log, bool, @@ -260,14 +328,19 @@ init (xlator_t *this) GF_OPTION_INIT ("strict-readdir", priv->strict_readdir, bool, out); - priv->wait_count = 1; - - child_count = xlator_subvolume_count (this); + GF_OPTION_INIT ("eager-lock", priv->eager_lock, bool, out); + GF_OPTION_INIT ("quorum-type", qtype, str, out); + GF_OPTION_INIT ("quorum-count", priv->quorum_count, uint32, out); + GF_OPTION_INIT (AFR_SH_READDIR_SIZE_KEY, priv->sh_readdir_size, size, + out); + fix_quorum_options(this,priv,qtype); - priv->child_count = child_count; + GF_OPTION_INIT ("post-op-delay-secs", priv->post_op_delay_secs, uint32, out); + GF_OPTION_INIT ("readdir-failover", priv->readdir_failover, bool, out); + GF_OPTION_INIT ("ensure-durability", priv->ensure_durability, bool, + out); - LOCK_INIT (&priv->lock); - LOCK_INIT (&priv->read_child_lock); + priv->wait_count = 1; priv->child_up = GF_CALLOC (sizeof (unsigned char), child_count, gf_afr_mt_char); @@ -307,8 +380,6 @@ init (xlator_t *this) AFR_XATTR_PREFIX, trav->xlator->name); if (-1 == ret) { - gf_log (this->name, GF_LOG_ERROR, - "asprintf failed to set pending key"); ret = -ENOMEM; goto out; } @@ -317,6 +388,13 @@ init 
(xlator_t *this) i++; } + ret = gf_asprintf (&priv->sh_domain, AFR_SH_DATA_DOMAIN_FMT, + this->name); + if (-1 == ret) { + ret = -ENOMEM; + goto out; + } + priv->last_event = GF_CALLOC (child_count, sizeof (*priv->last_event), gf_afr_mt_int32_t); if (!priv->last_event) { @@ -324,20 +402,67 @@ init (xlator_t *this) goto out; } - priv->shd.pos = GF_CALLOC (sizeof (*priv->shd.pos), child_count, - gf_afr_mt_afr_brick_pos_t); - if (!priv->shd.pos) { - ret = -ENOMEM; + /* keep more local here as we may need them for self-heal etc */ + this->local_pool = mem_pool_new (afr_local_t, 512); + if (!this->local_pool) { + ret = -1; + gf_log (this->name, GF_LOG_ERROR, + "failed to create local_t's memory pool"); goto out; } - LOCK_INIT (&priv->root_inode_lk); priv->first_lookup = 1; priv->root_inode = NULL; - pthread_mutex_init (&priv->mutex, NULL); - INIT_LIST_HEAD (&priv->saved_fds); + if (!priv->shd.iamshd) { + ret = 0; + goto out; + } + + ret = -ENOMEM; + priv->shd.pos = GF_CALLOC (sizeof (*priv->shd.pos), child_count, + gf_afr_mt_brick_pos_t); + if (!priv->shd.pos) + goto out; + + priv->shd.pending = GF_CALLOC (sizeof (*priv->shd.pending), child_count, + gf_afr_mt_int32_t); + if (!priv->shd.pending) + goto out; + + priv->shd.inprogress = GF_CALLOC (sizeof (*priv->shd.inprogress), + child_count, gf_afr_mt_shd_bool_t); + if (!priv->shd.inprogress) + goto out; + priv->shd.timer = GF_CALLOC (sizeof (*priv->shd.timer), child_count, + gf_afr_mt_shd_timer_t); + if (!priv->shd.timer) + goto out; + + priv->shd.healed = eh_new (AFR_EH_HEALED_LIMIT, _gf_false, + _destroy_shd_event_data); + if (!priv->shd.healed) + goto out; + + priv->shd.heal_failed = eh_new (AFR_EH_HEAL_FAIL_LIMIT, _gf_false, + _destroy_shd_event_data); + if (!priv->shd.heal_failed) + goto out; + priv->shd.split_brain = eh_new (AFR_EH_SPLIT_BRAIN_LIMIT, _gf_false, + _destroy_shd_event_data); + if (!priv->shd.split_brain) + goto out; + + this->itable = inode_table_new (SHD_INODE_LRU_LIMIT, this); + if (!this->itable) + 
goto out; + priv->root_inode = inode_ref (this->itable->root); + GF_OPTION_INIT ("node-uuid", priv->shd.node_uuid, str, out); + GF_OPTION_INIT ("heal-timeout", priv->shd.timeout, int32, out); + ret = afr_initialise_statistics (this); + if (ret) + goto out; ret = 0; out: return ret; @@ -347,6 +472,13 @@ out: int fini (xlator_t *this) { + afr_private_t *priv = NULL; + + priv = this->private; + this->private = NULL; + afr_priv_destroy (priv); + if (this->itable);//I dont see any destroy func + return 0; } @@ -365,6 +497,9 @@ struct xlator_fops fops = { .finodelk = afr_finodelk, .entrylk = afr_entrylk, .fentrylk = afr_fentrylk, + .fallocate = afr_fallocate, + .discard = afr_discard, + .zerofill = afr_zerofill, /* inode read */ .access = afr_access, @@ -372,6 +507,7 @@ struct xlator_fops fops = { .fstat = afr_fstat, .readlink = afr_readlink, .getxattr = afr_getxattr, + .fgetxattr = afr_fgetxattr, .readv = afr_readv, /* inode write */ @@ -379,9 +515,11 @@ struct xlator_fops fops = { .truncate = afr_truncate, .ftruncate = afr_ftruncate, .setxattr = afr_setxattr, + .fsetxattr = afr_fsetxattr, .setattr = afr_setattr, .fsetattr = afr_fsetattr, .removexattr = afr_removexattr, + .fremovexattr = afr_fremovexattr, /* dir read */ .opendir = afr_opendir, @@ -414,33 +552,79 @@ struct xlator_cbks cbks = { struct volume_options options[] = { { .key = {"read-subvolume" }, - .type = GF_OPTION_TYPE_XLATOR + .type = GF_OPTION_TYPE_XLATOR, + .description = "inode-read fops happen only on one of the bricks in " + "replicate. Afr will prefer the one specified using " + "this option if it is not stale. Option value must be " + "one of the xlator names of the children. " + "Ex: <volname>-client-0 till " + "<volname>-client-<number-of-bricks - 1>" + }, + { .key = {"read-subvolume-index" }, + .type = GF_OPTION_TYPE_INT, + .default_value = "-1", + .description = "inode-read fops happen only on one of the bricks in " + "replicate. 
AFR will prefer the one specified using " + "this option if it is not stale. allowed options" + " include -1 till replica-count - 1" + }, + { .key = {"read-hash-mode" }, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .max = 2, + .default_value = "0", + .description = "inode-read fops happen only on one of the bricks in " + "replicate. AFR will prefer the one computed using " + "the method specified using this option" + "0 = first responder, " + "1 = hash by GFID of file (all clients use " + "same subvolume), " + "2 = hash by GFID of file and client PID", + }, + { .key = {"choose-local" }, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "true", + .description = "Choose a local subvolume(i.e. Brick) to read from if " + "read-subvolume is not explicitly set.", }, { .key = {"favorite-child"}, - .type = GF_OPTION_TYPE_XLATOR + .type = GF_OPTION_TYPE_XLATOR, + .description = "If a split-brain happens choose subvol/brick set by " + "this option as source." }, { .key = {"background-self-heal-count"}, .type = GF_OPTION_TYPE_INT, .min = 0, .default_value = "16", + .validate = GF_OPT_VALIDATE_MIN, + .description = "This specifies the number of self-heals that can be " + " performed in background without blocking the fop" }, { .key = {"data-self-heal"}, .type = GF_OPTION_TYPE_STR, - .default_value = "", .value = {"1", "on", "yes", "true", "enable", "0", "off", "no", "false", "disable", "open"}, .default_value = "on", + .description = "Using this option we can enable/disable data " + "self-heal on the file. \"open\" means data " + "self-heal action will only be triggered by file " + "open operations." }, { .key = {"data-self-heal-algorithm"}, .type = GF_OPTION_TYPE_STR, - .default_value = "", .description = "Select between \"full\", \"diff\". The " "\"full\" algorithm copies the entire file from " "source to sink. 
The \"diff\" algorithm copies to " "sink only those blocks whose checksums don't match " - "with those of source.", - .value = { "diff", "full", "" } + "with those of source. If no option is configured " + "the option is chosen dynamically as follows: " + "If the file does not exist on one of the sinks " + "or empty file exists or if the source file size is " + "about the same as page size the entire file will " + "be read and written i.e \"full\" algo, " + "otherwise \"diff\" algo is chosen.", + .value = { "diff", "full"} }, { .key = {"data-self-heal-window-size"}, .type = GF_OPTION_TYPE_INT, @@ -453,26 +637,43 @@ struct volume_options options[] = { { .key = {"metadata-self-heal"}, .type = GF_OPTION_TYPE_BOOL, .default_value = "on", + .description = "Using this option we can enable/disable metadata " + "i.e. Permissions, ownerships, xattrs self-heal on " + "the file/directory." }, { .key = {"entry-self-heal"}, .type = GF_OPTION_TYPE_BOOL, .default_value = "on", + .description = "Using this option we can enable/disable entry " + "self-heal on the directory." 
}, { .key = {"data-change-log"}, .type = GF_OPTION_TYPE_BOOL, .default_value = "on", + .description = "Data fops like write/truncate will not perform " + "pre/post fop changelog operations in afr transaction " + "if this option is disabled" }, { .key = {"metadata-change-log"}, .type = GF_OPTION_TYPE_BOOL, .default_value = "on", + .description = "Metadata fops like setattr/setxattr will not perform " + "pre/post fop changelog operations in afr transaction " + "if this option is disabled" }, { .key = {"entry-change-log"}, .type = GF_OPTION_TYPE_BOOL, .default_value = "on", + .description = "Entry fops like create/unlink will not perform " + "pre/post fop changelog operations in afr transaction " + "if this option is disabled" }, { .key = {"optimistic-change-log"}, .type = GF_OPTION_TYPE_BOOL, .default_value = "on", + .description = "Entry/Metadata fops will not perform " + "pre fop changelog operations in afr transaction " + "if this option is enabled." }, { .key = {"strict-readdir"}, .type = GF_OPTION_TYPE_BOOL, @@ -481,14 +682,112 @@ struct volume_options options[] = { { .key = {"inodelk-trace"}, .type = GF_OPTION_TYPE_BOOL, .default_value = "off", + .description = "Enabling this option logs inode lock/unlocks" }, { .key = {"entrylk-trace"}, .type = GF_OPTION_TYPE_BOOL, .default_value = "off", + .description = "Enabling this option logs entry lock/unlocks" + }, + { .key = {"eager-lock"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = "Lock phase of a transaction has two sub-phases. " + "First is an attempt to acquire locks in parallel by " + "broadcasting non-blocking lock requests. If lock " + "aquistion fails on any server, then the held locks " + "are unlocked and revert to a blocking locked mode " + "sequentially on one server after another. If this " + "option is enabled the initial broadcasting lock " + "request attempt to acquire lock on the entire file. 
" + "If this fails, we revert back to the sequential " + "\"regional\" blocking lock as before. In the case " + "where such an \"eager\" lock is granted in the " + "non-blocking phase, it gives rise to an opportunity " + "for optimization. i.e, if the next write transaction " + "on the same FD arrives before the unlock phase of " + "the first transaction, it \"takes over\" the full " + "file lock. Similarly if yet another data transaction " + "arrives before the unlock phase of the \"optimized\" " + "transaction, that in turn \"takes over\" the lock as " + "well. The actual unlock now happens at the end of " + "the last \"optimzed\" transaction." + }, { .key = {"self-heal-daemon"}, .type = GF_OPTION_TYPE_BOOL, .default_value = "off", + .description = "This option applies to only self-heal-daemon. " + "Index directory crawl and automatic healing of files" + "will not be performed if this option is turned off." + }, + { .key = {"iam-self-heal-daemon"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "This option differentiates if the replicate " + "translator is running as part of self-heal-daemon " + "or not." + }, + { .key = {"quorum-type"}, + .type = GF_OPTION_TYPE_STR, + .value = { "none", "auto", "fixed"}, + .default_value = "none", + .description = "If value is \"fixed\" only allow writes if " + "quorum-count bricks are present. If value is " + "\"auto\" only allow writes if more than half of " + "bricks, or exactly half including the first, are " + "present.", + }, + { .key = {"quorum-count"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = INT_MAX, + .default_value = 0, + .description = "If quorum-type is \"fixed\" only allow writes if " + "this many bricks or present. 
Other quorum types " + "will OVERWRITE this value.", + }, + { .key = {"node-uuid"}, + .type = GF_OPTION_TYPE_STR, + .description = "Local glusterd uuid string, used in starting " + "self-heal-daemon so that it can crawl only on " + "local index directories.", + }, + { .key = {"heal-timeout"}, + .type = GF_OPTION_TYPE_INT, + .min = 60, + .max = INT_MAX, + .default_value = "600", + .description = "time interval for checking the need to self-heal " + "in self-heal-daemon" + }, + { .key = {"post-op-delay-secs"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .max = INT_MAX, + .default_value = "1", + .description = "Time interval induced artificially before " + "post-operation phase of the transaction to " + "enhance overlap of adjacent write operations.", + }, + { .key = {AFR_SH_READDIR_SIZE_KEY}, + .type = GF_OPTION_TYPE_SIZET, + .description = "readdirp size for performing entry self-heal", + .min = 1024, + .max = 131072, + .default_value = "1KB", + }, + { .key = {"readdir-failover"}, + .type = GF_OPTION_TYPE_BOOL, + .description = "readdir(p) will not failover if this option is off", + .default_value = "on", + }, + { .key = {"ensure-durability"}, + .type = GF_OPTION_TYPE_BOOL, + .description = "Afr performs fsyncs for transactions if this " + "option is on to make sure the changelogs/data is " + "written to the disk", + .default_value = "on", }, { .key = {NULL} }, }; diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 92ccf607f..21064db58 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. 
- - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ @@ -32,9 +23,15 @@ #include "afr-self-heal-algorithm.h" #include "libxlator.h" +#include "timer.h" #define AFR_XATTR_PREFIX "trusted.afr" #define AFR_PATHINFO_HEADER "REPLICATE:" +#define AFR_SH_READDIR_SIZE_KEY "self-heal-readdir-size" +#define AFR_SH_DATA_DOMAIN_FMT "%s:self-heal" + +#define AFR_LOCKEE_COUNT_MAX 3 +#define AFR_DOM_COUNT_MAX 3 struct _pump_private; @@ -43,11 +40,12 @@ typedef int (*afr_expunge_done_cbk_t) (call_frame_t *frame, xlator_t *this, int32_t op_errno); typedef int (*afr_impunge_done_cbk_t) (call_frame_t *frame, xlator_t *this, - int child, int32_t op_error, - int32_t op_errno); + int32_t op_error, int32_t op_errno); typedef int (*afr_post_remove_call_t) (call_frame_t *frame, xlator_t *this); typedef int (*afr_lock_cbk_t) (call_frame_t *frame, xlator_t *this); +typedef void (*afr_lookup_done_cbk_t) (call_frame_t *frame, xlator_t *this, + int32_t op_ret, int32_t op_errno); typedef enum { AFR_POS_UNKNOWN, @@ -56,13 +54,16 @@ typedef enum { } afr_child_pos_t; typedef enum { + SPLIT_BRAIN = 1, + ALL_FOOLS = 2 +} afr_subvol_status_t; + +typedef enum { AFR_INODE_SET_READ_CTX = 1, AFR_INODE_RM_STALE_CHILDREN, AFR_INODE_SET_OPENDIR_DONE, - AFR_INODE_SET_SPLIT_BRAIN, AFR_INODE_GET_READ_CTX, AFR_INODE_GET_OPENDIR_DONE, - AFR_INODE_GET_SPLIT_BRAIN, } afr_inode_op_t; typedef struct afr_inode_params_ { @@ -76,16 +77,41 
@@ typedef struct afr_inode_params_ { } u; } afr_inode_params_t; +typedef enum afr_spb_state { + DONT_KNOW, + SPB, + NO_SPB +} afr_spb_state_t; + typedef struct afr_inode_ctx_ { uint64_t masks; int32_t *fresh_children;//increasing order of latency + afr_spb_state_t mdata_spb; + afr_spb_state_t data_spb; + uint32_t open_fd_count; } afr_inode_ctx_t; +typedef enum { + NONE, + INDEX, + INDEX_TO_BE_HEALED, + FULL, +} afr_crawl_type_t; + typedef struct afr_self_heald_ { - gf_boolean_t enabled; - gf_boolean_t pending; - gf_boolean_t inprogress; - afr_child_pos_t *pos; + gf_boolean_t enabled; + gf_boolean_t iamshd; + afr_crawl_type_t *pending; + gf_boolean_t *inprogress; + afr_child_pos_t *pos; + gf_timer_t **timer; + eh_t *healed; + eh_t *heal_failed; + eh_t *split_brain; + eh_t **statistics; + void **crawl_events; + char *node_uuid; + int timeout; } afr_self_heald_t; typedef struct _afr_private { @@ -97,7 +123,6 @@ typedef struct _afr_private { xlator_t **children; - gf_lock_t root_inode_lk; int first_lookup; inode_t *root_inode; @@ -120,13 +145,10 @@ typedef struct _afr_private { gf_boolean_t entry_change_log; /* on/off */ int read_child; /* read-subvolume */ + unsigned int hash_mode; /* for when read_child is not set */ int favorite_child; /* subvolume to be preferred in resolving split-brain cases */ - unsigned int data_lock_server_count; - unsigned int metadata_lock_server_count; - unsigned int entry_lock_server_count; - gf_boolean_t inodelk_trace; gf_boolean_t entrylk_trace; @@ -142,15 +164,50 @@ typedef struct _afr_private { pthread_mutex_t mutex; struct list_head saved_fds; /* list of fds on which locks have succeeded */ - gf_boolean_t optimistic_change_log; - gf_boolean_t eager_lock; + gf_boolean_t optimistic_change_log; + gf_boolean_t eager_lock; + uint32_t post_op_delay_secs; + unsigned int quorum_count; char vol_uuid[UUID_SIZE + 1]; int32_t *last_event; afr_self_heald_t shd; + gf_boolean_t choose_local; + gf_boolean_t did_discovery; + gf_boolean_t 
readdir_failover; + uint64_t sh_readdir_size; + gf_boolean_t ensure_durability; + char *sh_domain; } afr_private_t; +typedef enum { + AFR_SELF_HEAL_NOT_ATTEMPTED, + AFR_SELF_HEAL_STARTED, + AFR_SELF_HEAL_FAILED, + AFR_SELF_HEAL_SYNC_BEGIN, +} afr_self_heal_status; + typedef struct { + afr_self_heal_status gfid_or_missing_entry_self_heal; + afr_self_heal_status metadata_self_heal; + afr_self_heal_status data_self_heal; + afr_self_heal_status entry_self_heal; +} afr_sh_status_for_all_type; + +typedef enum { + AFR_SELF_HEAL_ENTRY, + AFR_SELF_HEAL_METADATA, + AFR_SELF_HEAL_DATA, + AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY, + AFR_SELF_HEAL_INVALID = -1, +} afr_self_heal_type; + +typedef enum { + AFR_CHECK_ALL, + AFR_CHECK_SPECIFIC, +} afr_sh_fail_check_type; + +struct afr_self_heal_ { /* External interface: These are variables (some optional) that are set by whoever has triggered self-heal */ @@ -159,6 +216,8 @@ typedef struct { gf_boolean_t do_entry_self_heal; gf_boolean_t do_gfid_self_heal; gf_boolean_t do_missing_entry_self_heal; + gf_boolean_t force_confirm_spb; /* Check for split-brains even when + self-heal is turned off */ gf_boolean_t forced_merge; /* Is this a self-heal triggered to forcibly merge the directories? */ @@ -176,7 +235,7 @@ typedef struct { background, this function will be called as soon as possible. 
*/ int (*unwind) (call_frame_t *frame, xlator_t *this, int32_t op_ret, - int32_t op_errno); + int32_t op_errno, int32_t sh_failed); /* End of external interface members */ @@ -189,7 +248,6 @@ typedef struct { afr_expunge_done_cbk_t expunge_done; afr_impunge_done_cbk_t impunge_done; - int32_t impunge_ret_child; /* array of xattr's, one for each child */ dict_t **xattr; @@ -204,6 +262,10 @@ typedef struct { int32_t *fresh_parent_dirs; /* array of errno's, one for each child */ int *child_errno; + /*loc used for lookup*/ + loc_t lookup_loc; + int32_t lookup_flags; + afr_lookup_done_cbk_t lookup_done; int32_t **pending_matrix; int32_t **delta_matrix; @@ -219,12 +281,13 @@ typedef struct { unsigned char *locked_nodes; int lock_count; - mode_t impunging_entry_mode; const char *linkname; + gf_boolean_t entries_skipped; - int op_failed; - + gf_boolean_t actual_sh_started; + gf_boolean_t sync_done; gf_boolean_t data_lock_held; + gf_boolean_t sh_dom_lock_held; gf_boolean_t eof_reached; fd_t *healing_fd; int file_has_holes; @@ -235,25 +298,32 @@ typedef struct { uint8_t *checksum; afr_post_remove_call_t post_remove_call; - loc_t parent_loc; + char *data_sh_info; + char *metadata_sh_info; + loc_t parent_loc; call_frame_t *orig_frame; call_frame_t *old_loop_frame; gf_boolean_t unwound; afr_sh_algo_private_t *private; + afr_sh_status_for_all_type afr_all_sh_status; + afr_self_heal_type sh_type_in_action; + struct afr_sh_algorithm *algo; afr_lock_cbk_t data_lock_success_handler; afr_lock_cbk_t data_lock_failure_handler; + gf_boolean_t data_lock_block; int (*completion_cbk) (call_frame_t *frame, xlator_t *this); int (*sh_data_algo_start) (call_frame_t *frame, xlator_t *this); int (*algo_completion_cbk) (call_frame_t *frame, xlator_t *this); - afr_lock_cbk_t loop_completion_cbk; int (*algo_abort_cbk) (call_frame_t *frame, xlator_t *this); void (*gfid_sh_success_cbk) (call_frame_t *sh_frame, xlator_t *this); call_frame_t *sh_frame; -} afr_self_heal_t; +}; + +typedef struct 
afr_self_heal_ afr_self_heal_t; typedef enum { AFR_DATA_TRANSACTION, /* truncate, write, ... */ @@ -315,11 +385,31 @@ afr_index_for_transaction_type (afr_transaction_type type) return -1; /* make gcc happy */ } +typedef struct { + loc_t loc; + char *basename; + unsigned char *locked_nodes; + int locked_count; + +} afr_entry_lockee_t; + +int +afr_entry_lockee_cmp (const void *l1, const void *l2); + +typedef struct { + char *domain; /* Domain on which inodelk is taken */ + struct gf_flock flock; + unsigned char *locked_nodes; + int32_t lock_count; +} afr_inodelk_t; typedef struct { loc_t *lk_loc; - struct gf_flock lk_flock; + int lockee_count; + afr_entry_lockee_t lockee[AFR_LOCKEE_COUNT_MAX]; + + afr_inodelk_t inodelk[AFR_DOM_COUNT_MAX]; const char *lk_basename; const char *lower_basename; const char *higher_basename; @@ -328,23 +418,22 @@ typedef struct { unsigned char *locked_nodes; unsigned char *lower_locked_nodes; - unsigned char *inode_locked_nodes; - unsigned char *entry_locked_nodes; selfheal_lk_type_t selfheal_lk_type; transaction_lk_type_t transaction_lk_type; int32_t lock_count; - int32_t inodelk_lock_count; int32_t entrylk_lock_count; uint64_t lock_number; int32_t lk_call_count; int32_t lk_expected_count; + int32_t lk_attempted_count; int32_t lock_op_ret; int32_t lock_op_errno; afr_lock_cbk_t lock_cbk; + char *domain; /* Domain on which inode/entry lock/unlock in progress.*/ } afr_internal_lock_t; typedef struct _afr_locked_fd { @@ -352,21 +441,29 @@ typedef struct _afr_locked_fd { struct list_head list; } afr_locked_fd_t; +struct afr_reply { + int valid; + int32_t op_ret; + int32_t op_errno; +}; + typedef struct _afr_local { int uid; int gid; unsigned int call_count; unsigned int success_count; unsigned int enoent_count; + uint32_t open_fd_count; + gf_boolean_t update_open_fd_count; - unsigned int govinda_gOvinda; + unsigned int unhealable; unsigned int read_child_index; unsigned char read_child_returned; unsigned int first_up_child; - pid_t saved_pid; 
+ gf_lkowner_t saved_lk_owner; int32_t op_ret; int32_t op_errno; @@ -377,7 +474,6 @@ typedef struct _afr_local { loc_t newloc; fd_t *fd; - int32_t *fd_open_on; glusterfs_fop_t fop; @@ -399,13 +495,25 @@ typedef struct _afr_local { dict_t *dict; int optimistic_change_log; + gf_boolean_t delayed_post_op; + + + /* Is the current writev() going to perform a stable write? + i.e, is fd->flags or @flags writev param have O_SYNC or + O_DSYNC? + */ + gf_boolean_t stable_write; + + /* This write appended to the file. Nnot necessarily O_APPEND, + just means the offset of write was at the end of file. + */ + gf_boolean_t append_write; + + int allow_sh_for_running_transaction; - gf_boolean_t fop_paused; - int (*fop_call_continue) (call_frame_t *frame, xlator_t *this); - /* - This struct contains the arguments for the "continuation" - (scheme-like) of fops + /* This struct contains the arguments for the "continuation" + (scheme-like) of fops */ int op; @@ -416,13 +524,11 @@ typedef struct _afr_local { } statfs; struct { + uint32_t parent_entrylk; uuid_t gfid_req; inode_t *inode; struct iatt buf; struct iatt postparent; - ino_t ino; - uint64_t gen; - ino_t parent_ino; dict_t **xattrs; dict_t *xattr; struct iatt *postparents; @@ -430,11 +536,13 @@ typedef struct _afr_local { int32_t read_child; int32_t *sources; int32_t *success_children; + int32_t **pending_matrix; + gf_boolean_t fresh_lookup; + gf_boolean_t possible_spb; } lookup; struct { int32_t flags; - int32_t wbflags; } open; struct { @@ -453,31 +561,28 @@ typedef struct _afr_local { struct { int last_index; - ino_t ino; } stat; struct { int last_index; - ino_t ino; } fstat; struct { size_t size; int last_index; - ino_t ino; } readlink; struct { char *name; int last_index; - long pathinfo_len; + long xattr_len; } getxattr; struct { - ino_t ino; size_t size; off_t offset; int last_index; + uint32_t flags; } readv; /* dir read */ @@ -495,59 +600,43 @@ typedef struct _afr_local { int32_t op_errno; size_t size; off_t offset; - 
+ dict_t *dict; gf_boolean_t failed; int last_index; } readdir; /* inode write */ struct { - ino_t ino; struct iatt prebuf; struct iatt postbuf; + } inode_wfop; //common structure for all inode-write-fops + struct { int32_t op_ret; struct iovec *vector; struct iobref *iobref; int32_t count; off_t offset; + uint32_t flags; } writev; struct { - ino_t ino; - struct iatt prebuf; - struct iatt postbuf; - } fsync; - - struct { - ino_t ino; off_t offset; - struct iatt prebuf; - struct iatt postbuf; } truncate; struct { - ino_t ino; off_t offset; - struct iatt prebuf; - struct iatt postbuf; } ftruncate; struct { - ino_t ino; struct iatt in_buf; int32_t valid; - struct iatt preop_buf; - struct iatt postop_buf; } setattr; struct { - ino_t ino; struct iatt in_buf; int32_t valid; - struct iatt preop_buf; - struct iatt postop_buf; } fsetattr; struct { @@ -556,116 +645,85 @@ typedef struct _afr_local { } setxattr; struct { + dict_t *dict; + int32_t flags; + } fsetxattr; + + struct { char *name; } removexattr; + struct { + dict_t *xattr; + } xattrop; + + struct { + dict_t *xattr; + } fxattrop; + /* dir write */ struct { - ino_t ino; - uint64_t gen; - ino_t parent_ino; - fd_t *fd; - dict_t *params; - int32_t flags; - mode_t mode; inode_t *inode; struct iatt buf; struct iatt preparent; struct iatt postparent; - struct iatt read_child_buf; + struct iatt prenewparent; + struct iatt postnewparent; + } dir_fop; //common structure for all dir fops + + struct { + fd_t *fd; + dict_t *params; + int32_t flags; + mode_t mode; } create; struct { - ino_t ino; - uint64_t gen; - ino_t parent_ino; dev_t dev; mode_t mode; dict_t *params; - inode_t *inode; - struct iatt buf; - struct iatt preparent; - struct iatt postparent; - struct iatt read_child_buf; } mknod; struct { - ino_t ino; - uint64_t gen; - ino_t parent_ino; int32_t mode; dict_t *params; - inode_t *inode; - struct iatt buf; - struct iatt read_child_buf; - struct iatt preparent; - struct iatt postparent; } mkdir; struct { - ino_t 
parent_ino; - int32_t op_ret; - int32_t op_errno; - struct iatt preparent; - struct iatt postparent; - } unlink; - - struct { - int flags; - ino_t parent_ino; - int32_t op_ret; - int32_t op_errno; - struct iatt preparent; - struct iatt postparent; + int flags; } rmdir; struct { - ino_t oldparent_ino; - ino_t newparent_ino; - ino_t ino; - struct iatt buf; - struct iatt read_child_buf; - struct iatt preoldparent; - struct iatt prenewparent; - struct iatt postoldparent; - struct iatt postnewparent; - } rename; - - struct { - ino_t ino; - uint64_t gen; - ino_t parent_ino; - inode_t *inode; - struct iatt buf; - struct iatt read_child_buf; - struct iatt preparent; - struct iatt postparent; - } link; - - struct { - ino_t ino; - uint64_t gen; - ino_t parent_ino; - inode_t *inode; dict_t *params; - struct iatt buf; - struct iatt read_child_buf; char *linkpath; - struct iatt preparent; - struct iatt postparent; } symlink; + struct { + int32_t mode; + off_t offset; + size_t len; + } fallocate; + + struct { + off_t offset; + size_t len; + } discard; + struct { - int32_t flags; - dir_entry_t *entries; - int32_t count; - } setdents; + off_t offset; + size_t len; + struct iatt prebuf; + struct iatt postbuf; + } zerofill; + + } cont; struct { off_t start, len; + gf_boolean_t eager_lock_on; int *eager_lock; char *basename; @@ -676,12 +734,19 @@ typedef struct _afr_local { afr_transaction_type type; - int success_count; - int erase_pending; - int failure_count; + /* pre-compute the post piggyback status before + entering POST-OP phase + */ + int *postop_piggybacked; + + /* stub to resume on destruction + of the transaction frame */ + call_stub_t *resume_stub; - int last_tried; - int32_t *child_errno; + struct list_head eager_locked; + + int32_t **txn_changelog;//changelog after pre+post ops + unsigned char *pre_op; call_frame_t *main_frame; @@ -699,6 +764,15 @@ typedef struct _afr_local { afr_self_heal_t self_heal; struct marker_str marker; + + /* extra data for fops */ + dict_t 
*xdata_req; + dict_t *xdata_rsp; + + mode_t umask; + int xflag; + gf_boolean_t do_discovery; + struct afr_reply *replies; } afr_local_t; typedef enum { @@ -708,11 +782,6 @@ typedef enum { } afr_fd_open_status_t; typedef struct { - struct list_head call_list; - call_frame_t *frame; -} afr_fd_paused_call_t; - -typedef struct { unsigned int *pre_op_done; afr_fd_open_status_t *opened_on; /* which subvolumes the fd is open on */ unsigned int *pre_op_piggyback; @@ -721,7 +790,6 @@ typedef struct { unsigned int *lock_acquired; int flags; - int32_t wbflags; uint64_t up_count; /* number of CHILD_UPs this fd has seen */ uint64_t down_count; /* number of CHILD_DOWNs this fd has seen */ @@ -732,20 +800,32 @@ typedef struct { struct list_head entries; /* needed for readdir failover */ unsigned char *locked_on; /* which subvolumes locks have been successful */ - struct list_head paused_calls; /* queued calls while fix_open happens */ + + /* used for delayed-post-op optimization */ + pthread_mutex_t delay_lock; + gf_timer_t *delay_timer; + call_frame_t *delay_frame; + int call_child; + + /* set if any write on this fd was a non stable write + (i.e, without O_SYNC or O_DSYNC) + */ + gf_boolean_t witnessed_unstable_write; + + /* list of frames currently in progress */ + struct list_head eager_locked; } afr_fd_ctx_t; /* try alloc and if it fails, goto label */ -#define ALLOC_OR_GOTO(var, type, label) do { \ - var = GF_CALLOC (sizeof (type), 1, \ - gf_afr_mt_##type); \ - if (!var) { \ - gf_log (this->name, GF_LOG_ERROR, \ - "out of memory :("); \ - op_errno = ENOMEM; \ - goto label; \ - } \ +#define AFR_LOCAL_ALLOC_OR_GOTO(var, label) do { \ + var = mem_get0 (THIS->local_pool); \ + if (!var) { \ + gf_log (this->name, GF_LOG_ERROR, \ + "out of memory :("); \ + op_errno = ENOMEM; \ + goto label; \ + } \ } while (0); @@ -766,8 +846,14 @@ int pump_command_reply (call_frame_t *frame, xlator_t *this); int32_t -afr_notify (xlator_t *this, int32_t event, - void *data, ...); +afr_notify 
(xlator_t *this, int32_t event, void *data, void *data2); + +int +afr_init_entry_lockee (afr_entry_lockee_t *lockee, afr_local_t *local, + loc_t *loc, char *basename, int child_count); + +void +afr_entry_lockee_cleanup (afr_internal_lock_t *int_lock); int afr_attempt_lock_recovery (xlator_t *this, int32_t child_index); @@ -780,7 +866,7 @@ afr_mark_locked_nodes (xlator_t *this, fd_t *fd, unsigned char *locked_nodes); void -afr_set_lk_owner (call_frame_t *frame, xlator_t *this); +afr_set_lk_owner (call_frame_t *frame, xlator_t *this, void *lk_owner); int afr_set_lock_number (call_frame_t *frame, xlator_t *this); @@ -804,10 +890,16 @@ afr_blocking_lock (call_frame_t *frame, xlator_t *this); int afr_internal_lock_finish (call_frame_t *frame, xlator_t *this); +int +afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom, + unsigned int child_count); int pump_start (call_frame_t *frame, xlator_t *this); int +__afr_fd_ctx_set (xlator_t *this, fd_t *fd); + +int afr_fd_ctx_set (xlator_t *this, fd_t *fd); int32_t @@ -817,8 +909,8 @@ void afr_inode_set_read_ctx (xlator_t *this, inode_t *inode, int32_t read_child, int32_t *fresh_children); -void -afr_build_parent_loc (loc_t *parent, loc_t *child); +int +afr_build_parent_loc (loc_t *parent, loc_t *child, int32_t *op_errno); unsigned int afr_up_children_count (unsigned char *child_up, unsigned int child_count); @@ -826,6 +918,10 @@ afr_up_children_count (unsigned char *child_up, unsigned int child_count); unsigned int afr_locked_children_count (unsigned char *children, unsigned int child_count); +unsigned int +afr_pre_op_done_children_count (unsigned char *pre_op, + unsigned int child_count); + gf_boolean_t afr_is_fresh_lookup (loc_t *loc, xlator_t *this); @@ -841,15 +937,16 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this); int afr_frame_return (call_frame_t *frame); -uint64_t +gf_boolean_t afr_is_split_brain (xlator_t *this, inode_t *inode); void -afr_set_split_brain (xlator_t *this, inode_t *inode, 
gf_boolean_t set); +afr_set_split_brain (xlator_t *this, inode_t *inode, afr_spb_state_t mdata_spb, + afr_spb_state_t data_spb); int afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - fd_t *fd, int32_t wbflags); + fd_t *fd, dict_t *xdata); void afr_set_opendir_done (xlator_t *this, inode_t *inode); @@ -876,22 +973,27 @@ afr_launch_openfd_self_heal (call_frame_t *frame, xlator_t *this, fd_t *fd); frame->local = NULL; \ } \ STACK_UNWIND_STRICT (fop, frame, params); \ - afr_local_cleanup (__local, __this); \ - GF_FREE (__local); \ - } while (0); + if (__local) { \ + afr_local_cleanup (__local, __this); \ + mem_put (__local); \ + } \ + } while (0) -#define AFR_STACK_DESTROY(frame) \ - do { \ - afr_local_t *__local = NULL; \ - xlator_t *__this = NULL; \ - __local = frame->local; \ - __this = frame->this; \ - frame->local = NULL; \ - STACK_DESTROY (frame->root); \ - afr_local_cleanup (__local, __this); \ - GF_FREE (__local); \ +#define AFR_STACK_DESTROY(frame) \ + do { \ + afr_local_t *__local = NULL; \ + xlator_t *__this = NULL; \ + __local = frame->local; \ + __this = frame->this; \ + frame->local = NULL; \ + STACK_DESTROY (frame->root); \ + if (__local) { \ + afr_local_cleanup (__local, __this); \ + mem_put (__local); \ + } \ } while (0); +#define AFR_NUM_CHANGE_LOGS 3 /*data + metadata + entry*/ /* allocate and return a string that is the basename of argument */ static inline char * AFR_BASENAME (const char *str) @@ -915,7 +1017,7 @@ int32_t * afr_children_create (int32_t child_count); int -AFR_LOCAL_INIT (afr_local_t *local, afr_private_t *priv); +afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno); int afr_internal_lock_init (afr_internal_lock_t *lk, size_t child_count, @@ -927,12 +1029,13 @@ afr_first_up_child (unsigned char *child_up, size_t child_count); int afr_select_read_child_from_policy (int32_t *fresh_children, int32_t child_count, int32_t prev_read_child, - int32_t config_read_child, int32_t *sources); + 
int32_t config_read_child, int32_t *sources, + unsigned int hmode, uuid_t gfid); void afr_set_read_ctx_from_policy (xlator_t *this, inode_t *inode, int32_t *fresh_children, int32_t prev_read_child, - int32_t config_read_child); + int32_t config_read_child, uuid_t gfid); int32_t afr_get_call_child (xlator_t *this, unsigned char *child_up, int32_t read_child, @@ -954,8 +1057,9 @@ afr_children_rm_child (int32_t *children, int32_t child, int32_t child_count); void afr_reset_children (int32_t *children, int32_t child_count); -gf_boolean_t -afr_error_more_important (int32_t old_errno, int32_t new_errno); +int32_t +afr_most_important_error(int32_t old_errno, int32_t new_errno, + gf_boolean_t eio); int afr_errno_count (int32_t *children, int *child_errno, unsigned int child_count, int32_t op_errno); @@ -974,7 +1078,7 @@ gf_boolean_t afr_conflicting_iattrs (struct iatt *bufs, int32_t *success_children, unsigned int child_count, const char *path, const char *xlator_name); -int +unsigned int afr_gfid_missing_count (const char *xlator_name, int32_t *children, struct iatt *bufs, unsigned int child_count, const char *path); @@ -988,7 +1092,7 @@ int32_t afr_resultant_errno_get (int32_t *children, int *child_errno, unsigned int child_count); void -afr_inode_rm_stale_children (xlator_t *this, inode_t *inode, int32_t read_child, +afr_inode_rm_stale_children (xlator_t *this, inode_t *inode, int32_t *stale_children); void afr_launch_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode, @@ -996,12 +1100,13 @@ afr_launch_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode, void (*gfid_sh_success_cbk) (call_frame_t *sh_frame, xlator_t *this), int (*unwind) (call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno)); -int -afr_fix_open (call_frame_t *frame, xlator_t *this, afr_fd_ctx_t *fd_ctx, - int need_open_count, int *need_open); -int -afr_open_fd_fix (call_frame_t *frame, xlator_t *this, gf_boolean_t pause_fop); + int32_t op_ret, int32_t op_errno, 
+ int32_t sh_failed)); +void +afr_fix_open (xlator_t *this, fd_t *fd, size_t need_open_count, int *need_open); + +void +afr_open_fd_fix (fd_t *fd, xlator_t *this); int afr_set_elem_count_get (unsigned char *elems, int child_count); @@ -1016,4 +1121,92 @@ afr_data_self_heal_enabled (char *data_self_heal); void afr_set_low_priority (call_frame_t *frame); +int +afr_child_fd_ctx_set (xlator_t *this, fd_t *fd, int32_t child, + int flags); + +gf_boolean_t +afr_have_quorum (char *logname, afr_private_t *priv); + +void +afr_matrix_cleanup (int32_t **pending, unsigned int m); + +int32_t** +afr_matrix_create (unsigned int m, unsigned int n); + +gf_boolean_t +afr_is_errno_set (int *child_errno, int child); + +gf_boolean_t +afr_is_errno_unset (int *child_errno, int child); + +gf_boolean_t +afr_is_fd_fixable (fd_t *fd); + +void +afr_prepare_new_entry_pending_matrix (int32_t **pending, + gf_boolean_t (*is_pending) (int *, int), + int *ctx, struct iatt *buf, + unsigned int child_count); +void +afr_xattr_array_destroy (dict_t **xattr, unsigned int child_count); +/* + * Special value indicating we should use the "auto" quorum method instead of + * a fixed value (including zero to turn off quorum enforcement). + */ +#define AFR_QUORUM_AUTO INT_MAX + +/* + * Having this as a macro will make debugging a bit weirder, but does reduce + * the probability of functions handling this check inconsistently. + */ +#define QUORUM_CHECK(_func,_label) do { \ + if (priv->quorum_count && !afr_have_quorum(this->name,priv)) { \ + gf_log(this->name,GF_LOG_WARNING, \ + "failing "#_func" due to lack of quorum"); \ + op_errno = EROFS; \ + goto _label; \ + } \ +} while (0); + + +#define AFR_SBRAIN_MSG "Failed on %s as split-brain is seen. Returning EIO." 
+ +#define AFR_SBRAIN_CHECK_FD(fd, label) do { \ + if (fd->inode && afr_is_split_brain (this, fd->inode)) { \ + op_errno = EIO; \ + gf_log (this->name, GF_LOG_WARNING, \ + AFR_SBRAIN_MSG ,uuid_utoa (fd->inode->gfid)); \ + goto label; \ + } \ +} while (0) + +#define AFR_SBRAIN_CHECK_LOC(loc, label) do { \ + if (loc->inode && afr_is_split_brain (this, loc->inode)) { \ + op_errno = EIO; \ + loc_path (loc, NULL); \ + gf_log (this->name, GF_LOG_WARNING, \ + AFR_SBRAIN_MSG , loc->path); \ + goto label; \ + } \ +} while (0) + +int +afr_fd_report_unstable_write (xlator_t *this, fd_t *fd); + +gf_boolean_t +afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd); + +void +afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub); + +int +afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count); + +void +afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this); + +afr_inode_ctx_t* +afr_inode_ctx_get (inode_t *inode, xlator_t *this); + #endif /* __AFR_H__ */ diff --git a/xlators/cluster/afr/src/pump.c b/xlators/cluster/afr/src/pump.c index 0623b817a..a7f72fb30 100644 --- a/xlators/cluster/afr/src/pump.c +++ b/xlators/cluster/afr/src/pump.c @@ -1,25 +1,17 @@ /* - Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. 
<http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #include <unistd.h> #include <sys/time.h> #include <stdlib.h> +#include <fnmatch.h> #ifndef _CONFIG_H #define _CONFIG_H @@ -28,8 +20,16 @@ #include "afr-common.c" #include "defaults.c" +#include "glusterfs.h" static uint64_t pump_pid = 0; +static inline void +pump_fill_loc_info (loc_t *loc, struct iatt *iatt, struct iatt *parent) +{ + afr_update_loc_gfids (loc, iatt, parent); + uuid_copy (loc->inode->gfid, iatt->ia_gfid); +} + static int pump_mark_start_pending (xlator_t *this) { @@ -140,9 +140,7 @@ pump_set_resume_path (xlator_t *this, const char *path) LOCK (&pump_priv->resume_path_lock); { - pump_priv->resume_path = strdup (path); - if (!pump_priv->resume_path) - ret = -1; + strncpy (pump_priv->resume_path, path, strlen (path) + 1); } UNLOCK (&pump_priv->resume_path_lock); @@ -167,25 +165,27 @@ pump_save_path (xlator_t *this, const char *path) GF_ASSERT (priv->root_inode); - afr_build_root_loc (priv->root_inode, &loc); + afr_build_root_loc (this, &loc); dict = dict_new (); dict_ret = dict_set_str (dict, PUMP_PATH, (char *)path); + if (dict_ret) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to set the key %s", path, PUMP_PATH); ret = syncop_setxattr (PUMP_SOURCE_CHILD (this), &loc, dict, 0); if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, + gf_log (this->name, GF_LOG_INFO, "setxattr failed - could not save path=%s", path); } else { gf_log (this->name, GF_LOG_DEBUG, "setxattr succeeded - saved path=%s", path); - gf_log (this->name, GF_LOG_DEBUG, - "Saving path for status info"); } dict_unref (dict); + loc_wipe (&loc); return 0; } @@ -248,15 +248,9 @@ pump_get_resume_path (xlator_t *this) static int pump_update_resume_state (xlator_t 
*this, const char *path) { - afr_private_t *priv = NULL; - pump_private_t *pump_priv = NULL; - pump_state_t state; const char *resume_path = NULL; - priv = this->private; - pump_priv = priv->pump_private; - state = pump_get_state (); if (state == PUMP_STATE_RESUME) { @@ -284,14 +278,10 @@ pump_update_resume_state (xlator_t *this, const char *path) static gf_boolean_t is_pump_traversal_allowed (xlator_t *this, const char *path) { - afr_private_t *priv = NULL; - pump_state_t state; const char *resume_path = NULL; gf_boolean_t ret = _gf_true; - priv = this->private; - state = pump_get_state (); if (state == PUMP_STATE_RESUME) { @@ -334,21 +324,19 @@ pump_save_file_stats (xlator_t *this, const char *path) static int gf_pump_traverse_directory (loc_t *loc) { - xlator_t *this = NULL; - fd_t *fd = NULL; - - off_t offset = 0; - loc_t entry_loc; - gf_dirent_t *entry = NULL; - gf_dirent_t *tmp = NULL; - gf_dirent_t entries; - - struct iatt iatt, parent; - dict_t *xattr_rsp; - - char *file_path = NULL; - int ret = 0; - gf_boolean_t is_directory_empty = _gf_true; + xlator_t *this = NULL; + fd_t *fd = NULL; + off_t offset = 0; + loc_t entry_loc = {0}; + gf_dirent_t *entry = NULL; + gf_dirent_t *tmp = NULL; + gf_dirent_t entries; + struct iatt iatt = {0}; + struct iatt parent = {0}; + dict_t *xattr_rsp = NULL; + int ret = 0; + gf_boolean_t is_directory_empty = _gf_true; + gf_boolean_t free_entries = _gf_false; INIT_LIST_HEAD (&entries.list); this = THIS; @@ -373,7 +361,8 @@ gf_pump_traverse_directory (loc_t *loc) "pump opendir on %s returned=%d", loc->path, ret); - while (syncop_readdirp (this, fd, 131072, offset, &entries)) { + while (syncop_readdirp (this, fd, 131072, offset, NULL, &entries)) { + free_entries = _gf_true; if (list_empty (&entries.list)) { gf_log (this->name, GF_LOG_TRACE, @@ -385,25 +374,23 @@ gf_pump_traverse_directory (loc_t *loc) gf_log (this->name, GF_LOG_DEBUG, "found readdir entry=%s", entry->d_name); - file_path = afr_build_file_path (loc, entry); - if 
(!file_path) { - gf_log (this->name, GF_LOG_DEBUG, - "file path construction failed"); - goto out; + offset = entry->d_off; + if (uuid_is_null (entry->d_stat.ia_gfid)) { + gf_log (this->name, GF_LOG_WARNING, "%s/%s: No " + "gfid present skipping", + loc->path, entry->d_name); + continue; } - - afr_build_child_loc (loc, &entry_loc, file_path, - entry->d_name); + loc_wipe (&entry_loc); + ret = afr_build_child_loc (this, &entry_loc, loc, + entry->d_name); + if (ret) + goto out; if (!IS_ENTRY_CWD (entry->d_name) && - !IS_ENTRY_PARENT (entry->d_name)) { + !IS_ENTRY_PARENT (entry->d_name)) { is_directory_empty = _gf_false; - ret = syncop_lookup (this, &entry_loc, NULL, - &iatt, &xattr_rsp, &parent); - - memcpy (entry_loc.inode->gfid, iatt.ia_gfid, 16); - gf_log (this->name, GF_LOG_DEBUG, "lookup %s => %"PRId64, entry_loc.path, @@ -412,12 +399,14 @@ gf_pump_traverse_directory (loc_t *loc) ret = syncop_lookup (this, &entry_loc, NULL, &iatt, &xattr_rsp, &parent); - - gf_log (this->name, GF_LOG_DEBUG, - "second lookup ret=%d: %s => %"PRId64, - ret, - entry_loc.path, - iatt.ia_ino); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "%s: lookup failed", + entry_loc.path); + continue; + } + pump_fill_loc_info (&entry_loc, &iatt, + &parent); pump_update_resume_state (this, entry_loc.path); @@ -431,10 +420,6 @@ gf_pump_traverse_directory (loc_t *loc) goto out; } - gf_log (this->name, GF_LOG_TRACE, - "type of file=%d, IFDIR=%d", - iatt.ia_type, IA_IFDIR); - if (IA_ISDIR (iatt.ia_type)) { if (is_pump_traversal_allowed (this, entry_loc.path)) { gf_log (this->name, GF_LOG_TRACE, @@ -443,18 +428,21 @@ gf_pump_traverse_directory (loc_t *loc) gf_pump_traverse_directory (&entry_loc); } } - } - offset = entry->d_off; - loc_wipe (&entry_loc); + } } gf_dirent_free (&entries); + free_entries = _gf_false; gf_log (this->name, GF_LOG_TRACE, "offset incremented to %d", (int32_t ) offset); } + ret = syncop_close (fd); + if (ret < 0) + gf_log (this->name, GF_LOG_DEBUG, "closing the fd failed"); + 
if (is_directory_empty && IS_ROOT_PATH (loc->path)) { pump_change_state (this, PUMP_STATE_RUNNING); gf_log (this->name, GF_LOG_INFO, "Empty source brick. " @@ -462,19 +450,18 @@ gf_pump_traverse_directory (loc_t *loc) } out: + if (entry_loc.path) + loc_wipe (&entry_loc); + if (free_entries) + gf_dirent_free (&entries); return 0; - } static int pump_update_resume_path (xlator_t *this) { - afr_private_t *priv = NULL; - const char *resume_path = NULL; - priv = this->private; - resume_path = pump_get_resume_path (this); if (resume_path) { @@ -495,7 +482,7 @@ pump_update_resume_path (xlator_t *this) static int32_t pump_xattr_cleaner (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_private_t *priv = NULL; loc_t loc = {0}; @@ -506,7 +493,7 @@ pump_xattr_cleaner (call_frame_t *frame, void *cookie, xlator_t *this, priv = this->private; - afr_build_root_loc (priv->root_inode, &loc); + afr_build_root_loc (this, &loc); ret = syncop_removexattr (priv->children[source], &loc, PUMP_PATH); @@ -522,6 +509,7 @@ pump_xattr_cleaner (call_frame_t *frame, void *cookie, xlator_t *this, "failed with %s", strerror (errno)); } + loc_wipe (&loc); return pump_command_reply (frame, this); } @@ -541,7 +529,7 @@ pump_complete_migration (xlator_t *this) GF_ASSERT (priv->root_inode); - afr_build_root_loc (priv->root_inode, &loc); + afr_build_root_loc (this, &loc); dict = dict_new (); @@ -553,6 +541,10 @@ pump_complete_migration (xlator_t *this) pump_priv->pump_finished = _gf_true; dict_ret = dict_set_str (dict, PUMP_SOURCE_COMPLETE, "jargon"); + if (dict_ret) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to set the key %s", + loc.path, PUMP_SOURCE_COMPLETE); ret = syncop_setxattr (PUMP_SOURCE_CHILD (this), &loc, dict, 0); if (ret < 0) { @@ -560,6 +552,10 @@ pump_complete_migration (xlator_t *this) "setxattr failed - while notifying source complete"); } dict_ret = dict_set_str (dict, 
PUMP_SINK_COMPLETE, "jargon"); + if (dict_ret) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to set the key %s", + loc.path, PUMP_SINK_COMPLETE); ret = syncop_setxattr (PUMP_SINK_CHILD (this), &loc, dict, 0); if (ret < 0) { @@ -575,6 +571,7 @@ pump_complete_migration (xlator_t *this) call_resume (pump_priv->cleaner); } + loc_wipe (&loc); return 0; } @@ -630,7 +627,7 @@ pump_task (void *data) GF_ASSERT (priv->root_inode); - afr_build_root_loc (priv->root_inode, &loc); + afr_build_root_loc (this, &loc); xattr_req = dict_new (); if (!xattr_req) { gf_log (this->name, GF_LOG_DEBUG, @@ -644,9 +641,8 @@ pump_task (void *data) &iatt, &xattr_rsp, &parent); gf_log (this->name, GF_LOG_TRACE, - "lookup: ino=%"PRId64", path=%s", - loc.ino, - loc.path); + "lookup: path=%s gfid=%s", + loc.path, uuid_utoa (loc.inode->gfid)); ret = pump_check_and_update_status (this); if (ret < 0) { @@ -669,6 +665,7 @@ out: if (xattr_req) dict_unref (xattr_req); + loc_wipe (&loc); return 0; } @@ -702,7 +699,7 @@ pump_start (call_frame_t *pump_frame, xlator_t *this) priv = this->private; pump_priv = priv->pump_private; - pump_frame->root->lk_owner = (uint64_t) (unsigned long)pump_frame->root; + afr_set_lk_owner (pump_frame, this, pump_frame->root); pump_pid = (uint64_t) (unsigned long)pump_frame->root; ret = synctask_new (pump_priv->env, pump_task, @@ -716,8 +713,8 @@ pump_start (call_frame_t *pump_frame, xlator_t *this) } gf_log (this->name, GF_LOG_DEBUG, - "setting pump as started lk_owner: %"PRIu64" %"PRIu64, - pump_frame->root->lk_owner, pump_pid); + "setting pump as started lk_owner: %s %"PRIu64, + lkowner_utoa (&pump_frame->root->lk_owner), pump_pid); priv->use_afr_in_pump = 1; out: @@ -751,7 +748,7 @@ pump_cmd_start_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno) + int32_t op_errno, dict_t *xdata) { call_frame_t *prev = NULL; @@ -803,9 +800,9 @@ pump_initiate_sink_connect (call_frame_t *frame, xlator_t *this) GF_ASSERT 
(priv->root_inode); - afr_build_root_loc (priv->root_inode, &loc); + afr_build_root_loc (this, &loc); - data = data_ref (dict_get (local->dict, PUMP_CMD_START)); + data = data_ref (dict_get (local->dict, RB_PUMP_CMD_START)); if (!data) { ret = -1; gf_log (this->name, GF_LOG_ERROR, @@ -844,7 +841,7 @@ pump_initiate_sink_connect (call_frame_t *frame, xlator_t *this) PUMP_SINK_CHILD(this)->fops->setxattr, &loc, dict, - 0); + 0, NULL); ret = 0; @@ -858,6 +855,7 @@ out: if (ret && clnt_cmd) GF_FREE (clnt_cmd); + loc_wipe (&loc); return ret; } @@ -877,7 +875,7 @@ pump_cmd_start_getxattr_cbk (call_frame_t *frame, xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *dict) + dict_t *dict, dict_t *xdata) { afr_local_t *local = NULL; char *path = NULL; @@ -944,6 +942,7 @@ pump_execute_status (call_frame_t *frame, xlator_t *this) uint64_t number_files = 0; char filename[PATH_MAX]; + char summary[PATH_MAX+256]; char *dict_str = NULL; int32_t op_ret = 0; @@ -972,16 +971,19 @@ pump_execute_status (call_frame_t *frame, xlator_t *this) } if (pump_priv->pump_finished) { - snprintf (dict_str, PATH_MAX + 256, "Number of files migrated = %"PRIu64" Migration complete ", - number_files); + snprintf (summary, PATH_MAX+256, + "no_of_files=%"PRIu64, number_files); } else { - snprintf (dict_str, PATH_MAX + 256, "Number of files migrated = %"PRIu64" Current file= %s ", - number_files, filename); + snprintf (summary, PATH_MAX+256, + "no_of_files=%"PRIu64":current_file=%s", + number_files, filename); } + snprintf (dict_str, PATH_MAX+256, "status=%d:%s", + (pump_priv->pump_finished)?1:0, summary); dict = dict_new (); - ret = dict_set_dynstr (dict, PUMP_CMD_STATUS, dict_str); + ret = dict_set_dynstr (dict, RB_PUMP_CMD_STATUS, dict_str); if (ret < 0) { gf_log (this->name, GF_LOG_DEBUG, "dict_set_dynstr returned negative value"); @@ -993,13 +995,12 @@ pump_execute_status (call_frame_t *frame, xlator_t *this) out: - AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict); + 
AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, NULL); if (dict) dict_unref (dict); - if (dict_str) - GF_FREE (dict_str); + GF_FREE (dict_str); return 0; } @@ -1041,14 +1042,14 @@ pump_execute_start (call_frame_t *frame, xlator_t *this) GF_ASSERT (priv->root_inode); - afr_build_root_loc (priv->root_inode, &loc); + afr_build_root_loc (this, &loc); STACK_WIND (frame, pump_cmd_start_getxattr_cbk, PUMP_SOURCE_CHILD(this), PUMP_SOURCE_CHILD(this)->fops->getxattr, &loc, - PUMP_PATH); + PUMP_PATH, NULL); ret = 0; @@ -1058,6 +1059,7 @@ out: pump_command_reply (frame, this); } + loc_wipe (&loc); return 0; } @@ -1065,7 +1067,7 @@ static int pump_cleanup_helper (void *data) { call_frame_t *frame = data; - pump_xattr_cleaner (frame, 0, frame->this, 0, 0); + pump_xattr_cleaner (frame, 0, frame->this, 0, 0, NULL); return 0; } @@ -1091,14 +1093,6 @@ pump_execute_commit (call_frame_t *frame, xlator_t *this) pump_priv = priv->pump_private; local = frame->local; - - LOCK (&pump_priv->resume_path_lock); - { - pump_priv->number_files_pumped = 0; - pump_priv->current_file[0] = '\0'; - } - UNLOCK (&pump_priv->resume_path_lock); - local->op_ret = 0; if (pump_priv->pump_finished) { pump_change_state (this, PUMP_STATE_COMMIT); @@ -1155,7 +1149,7 @@ pump_execute_abort (call_frame_t *frame, xlator_t *this) } else { pump_priv->cleaner = fop_setxattr_cbk_stub (frame, pump_xattr_cleaner, - 0, 0); + 0, 0, NULL); } return 0; @@ -1168,7 +1162,7 @@ pump_command_status (xlator_t *this, dict_t *dict) int dict_ret = -1; int ret = _gf_true; - dict_ret = dict_get_str (dict, PUMP_CMD_STATUS, &cmd); + dict_ret = dict_get_str (dict, RB_PUMP_CMD_STATUS, &cmd); if (dict_ret < 0) { gf_log (this->name, GF_LOG_DEBUG, "Not a pump status command"); @@ -1192,7 +1186,7 @@ pump_command_pause (xlator_t *this, dict_t *dict) int dict_ret = -1; int ret = _gf_true; - dict_ret = dict_get_str (dict, PUMP_CMD_PAUSE, &cmd); + dict_ret = dict_get_str (dict, RB_PUMP_CMD_PAUSE, &cmd); if (dict_ret < 0) { gf_log 
(this->name, GF_LOG_DEBUG, "Not a pump pause command"); @@ -1216,7 +1210,7 @@ pump_command_commit (xlator_t *this, dict_t *dict) int dict_ret = -1; int ret = _gf_true; - dict_ret = dict_get_str (dict, PUMP_CMD_COMMIT, &cmd); + dict_ret = dict_get_str (dict, RB_PUMP_CMD_COMMIT, &cmd); if (dict_ret < 0) { gf_log (this->name, GF_LOG_DEBUG, "Not a pump commit command"); @@ -1240,7 +1234,7 @@ pump_command_abort (xlator_t *this, dict_t *dict) int dict_ret = -1; int ret = _gf_true; - dict_ret = dict_get_str (dict, PUMP_CMD_ABORT, &cmd); + dict_ret = dict_get_str (dict, RB_PUMP_CMD_ABORT, &cmd); if (dict_ret < 0) { gf_log (this->name, GF_LOG_DEBUG, "Not a pump abort command"); @@ -1264,7 +1258,7 @@ pump_command_start (xlator_t *this, dict_t *dict) int dict_ret = -1; int ret = _gf_true; - dict_ret = dict_get_str (dict, PUMP_CMD_START, &cmd); + dict_ret = dict_get_str (dict, RB_PUMP_CMD_START, &cmd); if (dict_ret < 0) { gf_log (this->name, GF_LOG_DEBUG, "Not a pump start command"); @@ -1286,7 +1280,7 @@ struct _xattr_key { struct list_head list; }; -static void +static int __gather_xattr_keys (dict_t *dict, char *key, data_t *value, void *data) { @@ -1298,13 +1292,14 @@ __gather_xattr_keys (dict_t *dict, char *key, data_t *value, xkey = GF_CALLOC (1, sizeof (*xkey), gf_afr_mt_xattr_key); if (!xkey) - return; + return -1; xkey->key = key; INIT_LIST_HEAD (&xkey->list); list_add_tail (&xkey->list, list); } + return 0; } static void @@ -1332,7 +1327,7 @@ __filter_xattrs (dict_t *dict) int32_t pump_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *dict) + dict_t *dict, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; @@ -1367,7 +1362,7 @@ pump_getxattr_cbk (call_frame_t *frame, void *cookie, children[next_call_child], children[next_call_child]->fops->getxattr, &local->loc, - local->cont.getxattr.name); + local->cont.getxattr.name, NULL); } out: @@ -1375,7 +1370,7 @@ out: if (op_ret >= 0 && dict) 
__filter_xattrs (dict); - AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict); + AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, NULL); } return 0; @@ -1383,13 +1378,13 @@ out: int32_t pump_getxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name) + loc_t *loc, const char *name, dict_t *xdata) { afr_private_t * priv = NULL; xlator_t ** children = NULL; int call_child = 0; afr_local_t *local = NULL; - int32_t op_ret = -1; + int32_t ret = -1; int32_t op_errno = 0; uint64_t read_child = 0; @@ -1402,15 +1397,21 @@ pump_getxattr (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (priv->children, out); children = priv->children; + if (!priv->use_afr_in_pump) { + STACK_WIND (frame, default_getxattr_cbk, + FIRST_CHILD (this), + (FIRST_CHILD (this))->fops->getxattr, + loc, name, xdata); + return 0; + } - ALLOC_OR_GOTO (local, afr_local_t, out); - frame->local = local; - op_ret = AFR_LOCAL_INIT (local, priv); - if (op_ret < 0) { - op_errno = -op_ret; + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; + + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } if (name) { if (!strncmp (name, AFR_XATTR_PREFIX, @@ -1420,39 +1421,31 @@ pump_getxattr (call_frame_t *frame, xlator_t *this, goto out; } - if (!strcmp (name, PUMP_CMD_STATUS)) { + if (!strcmp (name, RB_PUMP_CMD_STATUS)) { gf_log (this->name, GF_LOG_DEBUG, "Hit pump command - status"); pump_execute_status (frame, this); - op_ret = 0; + ret = 0; goto out; } } - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, default_getxattr_cbk, - FIRST_CHILD (this), - (FIRST_CHILD (this))->fops->getxattr, - loc, name); - return 0; - } - local->fresh_children = GF_CALLOC (priv->child_count, sizeof (*local->fresh_children), gf_afr_mt_int32_t); - if (local->fresh_children) { + if (!local->fresh_children) { + ret = -1; op_errno = ENOMEM; goto out; } read_child = afr_inode_get_read_ctx (this, loc->inode, local->fresh_children); - op_ret = 
afr_get_call_child (this, local->child_up, read_child, + ret = afr_get_call_child (this, local->child_up, read_child, local->fresh_children, &call_child, &local->cont.getxattr.last_index); - if (op_ret < 0) { - op_errno = -op_ret; - op_ret = -1; + if (ret < 0) { + op_errno = -ret; goto out; } loc_copy (&local->loc, loc); @@ -1462,13 +1455,12 @@ pump_getxattr (call_frame_t *frame, xlator_t *this, STACK_WIND_COOKIE (frame, pump_getxattr_cbk, (void *) (long) call_child, children[call_child], children[call_child]->fops->getxattr, - loc, name); + loc, name, xdata); - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, NULL); - } + if (ret < 0) + AFR_STACK_UNWIND (getxattr, frame, -1, op_errno, NULL, NULL); return 0; } @@ -1490,14 +1482,14 @@ afr_setxattr_unwind (call_frame_t *frame, xlator_t *this) if (main_frame) { AFR_STACK_UNWIND (setxattr, main_frame, - local->op_ret, local->op_errno) + local->op_ret, local->op_errno, NULL); } return 0; } static int afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t * local = NULL; afr_private_t * priv = NULL; @@ -1566,7 +1558,7 @@ afr_setxattr_wind (call_frame_t *frame, xlator_t *this) priv->children[i]->fops->setxattr, &local->loc, local->cont.setxattr.dict, - local->cont.setxattr.flags); + local->cont.setxattr.flags, NULL); if (!--call_count) break; @@ -1594,11 +1586,9 @@ pump_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno) + int32_t op_errno, dict_t *xdata) { - STACK_UNWIND (frame, - op_ret, - op_errno); + AFR_STACK_UNWIND (setxattr, frame, op_ret, op_errno, xdata); return 0; } @@ -1616,12 +1606,10 @@ pump_command_reply (call_frame_t *frame, xlator_t *this) gf_log (this->name, GF_LOG_INFO, "Command succeeded"); - dict_unref (local->dict); - AFR_STACK_UNWIND (setxattr, frame, local->op_ret, - 
local->op_errno); + local->op_errno, NULL); return 0; } @@ -1658,50 +1646,53 @@ pump_parse_command (call_frame_t *frame, xlator_t *this, int pump_setxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *dict, int32_t flags) + loc_t *loc, dict_t *dict, int32_t flags, dict_t *xdata) { afr_private_t * priv = NULL; afr_local_t * local = NULL; call_frame_t *transaction_frame = NULL; - int ret = -1; - - int op_ret = -1; int op_errno = 0; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); VALIDATE_OR_GOTO (this->private, out); + GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.pump*", dict, + op_errno, out); + priv = this->private; + if (!priv->use_afr_in_pump) { + STACK_WIND (frame, default_setxattr_cbk, + FIRST_CHILD (this), + (FIRST_CHILD (this))->fops->setxattr, + loc, dict, flags, xdata); + return 0; + } + - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (local, out); - ret = AFR_LOCAL_INIT (local, priv); + ret = afr_local_init (local, priv, &op_errno); if (ret < 0) { - op_errno = -ret; + afr_local_cleanup (local, this); goto out; - } + } ret = pump_parse_command (frame, this, local, dict); if (ret >= 0) { - op_ret = 0; + ret = 0; goto out; } - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, default_setxattr_cbk, - FIRST_CHILD (this), - (FIRST_CHILD (this))->fops->setxattr, - loc, dict, flags); - return 0; - } - transaction_frame = copy_frame (frame); if (!transaction_frame) { gf_log (this->name, GF_LOG_ERROR, "Out of memory."); + op_errno = ENOMEM; + ret = -1; + afr_local_cleanup (local, this); goto out; } @@ -1724,12 +1715,12 @@ pump_setxattr (call_frame_t *frame, xlator_t *this, afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { + if (ret < 0) { if (transaction_frame) AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (setxattr, frame, op_ret, op_errno); + AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); } return 0; @@ -1763,7 +1754,7 @@ 
static int32_t pump_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, - off_t offset) + off_t offset, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -1773,11 +1764,11 @@ pump_truncate (call_frame_t *frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->truncate, loc, - offset); + offset, xdata); return 0; } - afr_truncate (frame, this, loc, offset); + afr_truncate (frame, this, loc, offset, xdata); return 0; } @@ -1786,7 +1777,7 @@ static int32_t pump_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, - off_t offset) + off_t offset, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -1796,11 +1787,11 @@ pump_ftruncate (call_frame_t *frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->ftruncate, fd, - offset); + offset, xdata); return 0; } - afr_ftruncate (frame, this, fd, offset); + afr_ftruncate (frame, this, fd, offset, xdata); return 0; } @@ -1809,7 +1800,7 @@ pump_ftruncate (call_frame_t *frame, int pump_mknod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dev_t rdev, dict_t *parms) + loc_t *loc, mode_t mode, dev_t rdev, mode_t umask, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -1817,10 +1808,10 @@ pump_mknod (call_frame_t *frame, xlator_t *this, STACK_WIND (frame, default_mknod_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->mknod, - loc, mode, rdev, parms); + loc, mode, rdev, umask, xdata); return 0; } - afr_mknod (frame, this, loc, mode, rdev, parms); + afr_mknod (frame, this, loc, mode, rdev, umask, xdata); return 0; } @@ -1829,7 +1820,7 @@ pump_mknod (call_frame_t *frame, xlator_t *this, int pump_mkdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dict_t *params) + loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -1837,10 +1828,10 @@ pump_mkdir (call_frame_t *frame, xlator_t *this, STACK_WIND (frame, default_mkdir_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->mkdir, - loc, mode, 
params); + loc, mode, umask, xdata); return 0; } - afr_mkdir (frame, this, loc, mode, params); + afr_mkdir (frame, this, loc, mode, umask, xdata); return 0; } @@ -1849,7 +1840,7 @@ pump_mkdir (call_frame_t *frame, xlator_t *this, static int32_t pump_unlink (call_frame_t *frame, xlator_t *this, - loc_t *loc) + loc_t *loc, int xflag, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -1858,10 +1849,10 @@ pump_unlink (call_frame_t *frame, default_unlink_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->unlink, - loc); + loc, xflag, xdata); return 0; } - afr_unlink (frame, this, loc); + afr_unlink (frame, this, loc, xflag, xdata); return 0; } @@ -1869,7 +1860,7 @@ pump_unlink (call_frame_t *frame, static int pump_rmdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, int flags) + loc_t *loc, int flags, dict_t *xdata) { afr_private_t *priv = NULL; @@ -1879,11 +1870,11 @@ pump_rmdir (call_frame_t *frame, xlator_t *this, STACK_WIND (frame, default_rmdir_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->rmdir, - loc, flags); + loc, flags, xdata); return 0; } - afr_rmdir (frame, this, loc, flags); + afr_rmdir (frame, this, loc, flags, xdata); return 0; } @@ -1892,7 +1883,7 @@ pump_rmdir (call_frame_t *frame, xlator_t *this, int pump_symlink (call_frame_t *frame, xlator_t *this, - const char *linkpath, loc_t *loc, dict_t *params) + const char *linkpath, loc_t *loc, mode_t umask, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -1900,10 +1891,10 @@ pump_symlink (call_frame_t *frame, xlator_t *this, STACK_WIND (frame, default_symlink_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->symlink, - linkpath, loc, params); + linkpath, loc, umask, xdata); return 0; } - afr_symlink (frame, this, linkpath, loc, params); + afr_symlink (frame, this, linkpath, loc, umask, xdata); return 0; } @@ -1913,7 +1904,7 @@ static int32_t pump_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, - loc_t *newloc) + loc_t *newloc, dict_t *xdata) { 
afr_private_t *priv = NULL; priv = this->private; @@ -1922,10 +1913,10 @@ pump_rename (call_frame_t *frame, default_rename_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->rename, - oldloc, newloc); + oldloc, newloc, xdata); return 0; } - afr_rename (frame, this, oldloc, newloc); + afr_rename (frame, this, oldloc, newloc, xdata); return 0; } @@ -1935,7 +1926,7 @@ static int32_t pump_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, - loc_t *newloc) + loc_t *newloc, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -1944,10 +1935,10 @@ pump_link (call_frame_t *frame, default_link_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->link, - oldloc, newloc); + oldloc, newloc, xdata); return 0; } - afr_link (frame, this, oldloc, newloc); + afr_link (frame, this, oldloc, newloc, xdata); return 0; } @@ -1956,7 +1947,7 @@ pump_link (call_frame_t *frame, static int32_t pump_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, mode_t mode, - fd_t *fd, dict_t *params) + mode_t umask, fd_t *fd, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -1964,10 +1955,10 @@ pump_create (call_frame_t *frame, xlator_t *this, STACK_WIND (frame, default_create_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->create, - loc, flags, mode, fd, params); + loc, flags, mode, umask, fd, xdata); return 0; } - afr_create (frame, this, loc, flags, mode, fd, params); + afr_create (frame, this, loc, flags, mode, umask, fd, xdata); return 0; } @@ -1977,8 +1968,7 @@ static int32_t pump_open (call_frame_t *frame, xlator_t *this, loc_t *loc, - int32_t flags, fd_t *fd, - int32_t wbflags) + int32_t flags, fd_t *fd, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -1987,10 +1977,10 @@ pump_open (call_frame_t *frame, default_open_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->open, - loc, flags, fd, wbflags); + loc, flags, fd, xdata); return 0; } - afr_open (frame, this, loc, flags, fd, wbflags); + afr_open (frame, this, loc, 
flags, fd, xdata); return 0; } @@ -2002,8 +1992,8 @@ pump_writev (call_frame_t *frame, fd_t *fd, struct iovec *vector, int32_t count, - off_t off, - struct iobref *iobref) + off_t off, uint32_t flags, + struct iobref *iobref, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -2015,20 +2005,20 @@ pump_writev (call_frame_t *frame, fd, vector, count, - off, - iobref); + off, flags, + iobref, xdata); return 0; } - afr_writev (frame, this, fd, vector, count, off, iobref); - return 0; + afr_writev (frame, this, fd, vector, count, off, flags, iobref, xdata); + return 0; } static int32_t pump_flush (call_frame_t *frame, xlator_t *this, - fd_t *fd) + fd_t *fd, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -2037,10 +2027,10 @@ pump_flush (call_frame_t *frame, default_flush_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->flush, - fd); + fd, xdata); return 0; } - afr_flush (frame, this, fd); + afr_flush (frame, this, fd, xdata); return 0; } @@ -2050,7 +2040,7 @@ static int32_t pump_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t flags) + int32_t flags, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -2060,10 +2050,10 @@ pump_fsync (call_frame_t *frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsync, fd, - flags); + flags, xdata); return 0; } - afr_fsync (frame, this, fd, flags); + afr_fsync (frame, this, fd, flags, xdata); return 0; } @@ -2072,7 +2062,7 @@ pump_fsync (call_frame_t *frame, static int32_t pump_opendir (call_frame_t *frame, xlator_t *this, - loc_t *loc, fd_t *fd) + loc_t *loc, fd_t *fd, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -2081,10 +2071,10 @@ pump_opendir (call_frame_t *frame, default_opendir_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->opendir, - loc, fd); + loc, fd, xdata); return 0; } - afr_opendir (frame, this, loc, fd); + afr_opendir (frame, this, loc, fd, xdata); return 0; } @@ -2094,7 +2084,7 @@ static int32_t pump_fsyncdir 
(call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t flags) + int32_t flags, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -2104,10 +2094,10 @@ pump_fsyncdir (call_frame_t *frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsyncdir, fd, - flags); + flags, xdata); return 0; } - afr_fsyncdir (frame, this, fd, flags); + afr_fsyncdir (frame, this, fd, flags, xdata); return 0; } @@ -2118,7 +2108,7 @@ pump_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, gf_xattrop_flags_t flags, - dict_t *dict) + dict_t *dict, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -2129,10 +2119,10 @@ pump_xattrop (call_frame_t *frame, FIRST_CHILD(this)->fops->xattrop, loc, flags, - dict); + dict, xdata); return 0; } - afr_xattrop (frame, this, loc, flags, dict); + afr_xattrop (frame, this, loc, flags, dict, xdata); return 0; } @@ -2142,7 +2132,7 @@ pump_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, gf_xattrop_flags_t flags, - dict_t *dict) + dict_t *dict, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -2153,10 +2143,10 @@ pump_fxattrop (call_frame_t *frame, FIRST_CHILD(this)->fops->fxattrop, fd, flags, - dict); + dict, xdata); return 0; } - afr_fxattrop (frame, this, fd, flags, dict); + afr_fxattrop (frame, this, fd, flags, dict, xdata); return 0; } @@ -2166,9 +2156,17 @@ static int32_t pump_removexattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - const char *name) + const char *name, dict_t *xdata) { - afr_private_t *priv = NULL; + afr_private_t *priv = NULL; + int op_errno = -1; + + VALIDATE_OR_GOTO (this, out); + + GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.pump*", + name, op_errno, out); + + op_errno = 0; priv = this->private; if (!priv->use_afr_in_pump) { STACK_WIND (frame, @@ -2176,10 +2174,14 @@ pump_removexattr (call_frame_t *frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->removexattr, loc, - name); + name, xdata); return 0; } - afr_removexattr (frame, this, loc, name); + 
afr_removexattr (frame, this, loc, name, xdata); + + out: + if (op_errno) + AFR_STACK_UNWIND (removexattr, frame, -1, op_errno, NULL); return 0; } @@ -2191,7 +2193,7 @@ pump_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t off) + off_t off, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -2200,21 +2202,18 @@ pump_readdir (call_frame_t *frame, default_readdir_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->readdir, - fd, size, off); + fd, size, off, xdata); return 0; } - afr_readdir (frame, this, fd, size, off); + afr_readdir (frame, this, fd, size, off, xdata); return 0; } static int32_t -pump_readdirp (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - size_t size, - off_t off) +pump_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, + size_t size, off_t off, dict_t *dict) { afr_private_t *priv = NULL; priv = this->private; @@ -2223,10 +2222,10 @@ pump_readdirp (call_frame_t *frame, default_readdirp_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->readdirp, - fd, size, off); + fd, size, off, dict); return 0; } - afr_readdirp (frame, this, fd, size, off); + afr_readdirp (frame, this, fd, size, off, dict); return 0; } @@ -2257,13 +2256,24 @@ pump_release (xlator_t *this, } +static int32_t +pump_forget (xlator_t *this, inode_t *inode) +{ + afr_private_t *priv = NULL; + + priv = this->private; + if (priv->use_afr_in_pump) + afr_forget (this, inode); + + return 0; +} static int32_t pump_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf, - int32_t valid) + int32_t valid, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -2272,10 +2282,10 @@ pump_setattr (call_frame_t *frame, default_setattr_cbk, FIRST_CHILD (this), FIRST_CHILD (this)->fops->setattr, - loc, stbuf, valid); + loc, stbuf, valid, xdata); return 0; } - afr_setattr (frame, this, loc, stbuf, valid); + afr_setattr (frame, this, loc, stbuf, valid, xdata); return 0; } @@ -2286,7 +2296,7 @@ pump_fsetattr 
(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf, - int32_t valid) + int32_t valid, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -2295,10 +2305,10 @@ pump_fsetattr (call_frame_t *frame, default_fsetattr_cbk, FIRST_CHILD (this), FIRST_CHILD (this)->fops->fsetattr, - fd, stbuf, valid); + fd, stbuf, valid, xdata); return 0; } - afr_fsetattr (frame, this, fd, stbuf, valid); + afr_fsetattr (frame, this, fd, stbuf, valid, xdata); return 0; } @@ -2347,7 +2357,7 @@ notify (xlator_t *this, int32_t event, child_xl = (xlator_t *) data; - ret = afr_notify (this, event, data); + ret = afr_notify (this, event, data, NULL); switch (event) { case GF_EVENT_CHILD_DOWN: @@ -2382,7 +2392,7 @@ init (xlator_t *this) xlator_list_t * trav = NULL; int i = 0; int ret = -1; - int op_errno = 0; + GF_UNUSED int op_errno = 0; int source_child = 0; @@ -2398,9 +2408,26 @@ init (xlator_t *this) "Volume is dangling."); } - ALLOC_OR_GOTO (this->private, afr_private_t, out); + this->private = GF_CALLOC (1, sizeof (afr_private_t), + gf_afr_mt_afr_private_t); + if (!this->private) + goto out; priv = this->private; + LOCK_INIT (&priv->lock); + LOCK_INIT (&priv->read_child_lock); + //lock recovery is not done in afr + pthread_mutex_init (&priv->mutex, NULL); + INIT_LIST_HEAD (&priv->saved_fds); + + child_count = xlator_subvolume_count (this); + if (child_count != 2) { + gf_log (this->name, GF_LOG_ERROR, + "There should be exactly 2 children - one source " + "and one sink"); + return -1; + } + priv->child_count = child_count; priv->read_child = source_child; priv->favorite_child = source_child; @@ -2410,14 +2437,13 @@ init (xlator_t *this) priv->metadata_self_heal = 1; priv->entry_self_heal = 1; - priv->data_self_heal_algorithm = ""; - priv->data_self_heal_window_size = 16; priv->data_change_log = 1; priv->metadata_change_log = 1; priv->entry_change_log = 1; priv->use_afr_in_pump = 1; + priv->sh_readdir_size = 65536; /* Locking options */ @@ -2426,31 +2452,9 @@ 
init (xlator_t *this) and the sink. */ - priv->data_lock_server_count = 2; - priv->metadata_lock_server_count = 2; - priv->entry_lock_server_count = 2; - priv->strict_readdir = _gf_false; - trav = this->children; - while (trav) { - child_count++; - trav = trav->next; - } - priv->wait_count = 1; - - if (child_count != 2) { - gf_log (this->name, GF_LOG_ERROR, - "There should be exactly 2 children - one source " - "and one sink"); - return -1; - } - priv->child_count = child_count; - - LOCK_INIT (&priv->lock); - LOCK_INIT (&priv->read_child_lock); - priv->child_up = GF_CALLOC (sizeof (unsigned char), child_count, gf_afr_mt_char); if (!priv->child_up) { @@ -2497,6 +2501,12 @@ init (xlator_t *this) i++; } + ret = gf_asprintf (&priv->sh_domain, "%s-self-heal", this->name); + if (-1 == ret) { + op_errno = ENOMEM; + goto out; + } + priv->first_lookup = 1; priv->root_inode = NULL; @@ -2528,7 +2538,7 @@ init (xlator_t *this) goto out; } - pump_priv->env = syncenv_new (0); + pump_priv->env = this->ctx->env; if (!pump_priv->env) { gf_log (this->name, GF_LOG_ERROR, "Could not create new sync-environment"); @@ -2536,10 +2546,16 @@ init (xlator_t *this) goto out; } - priv->pump_private = pump_priv; + /* keep more local here as we may need them for self-heal etc */ + this->local_pool = mem_pool_new (afr_local_t, 128); + if (!this->local_pool) { + ret = -1; + gf_log (this->name, GF_LOG_ERROR, + "failed to create local_t's memory pool"); + goto out; + } - pthread_mutex_init (&priv->mutex, NULL); - INIT_LIST_HEAD (&priv->saved_fds); + priv->pump_private = pump_priv; pump_change_state (this, PUMP_STATE_ABORT); @@ -2551,6 +2567,25 @@ out: int fini (xlator_t *this) { + afr_private_t * priv = NULL; + pump_private_t *pump_priv = NULL; + + priv = this->private; + this->private = NULL; + if (!priv) + goto out; + + pump_priv = priv->pump_private; + if (!pump_priv) + goto afr_priv; + + GF_FREE (pump_priv->resume_path); + LOCK_DESTROY (&pump_priv->resume_path_lock); + LOCK_DESTROY 
(&pump_priv->pump_state_lock); + GF_FREE (pump_priv); +afr_priv: + afr_priv_destroy (priv); +out: return 0; } @@ -2598,6 +2633,7 @@ struct xlator_dumpops dumpops = { struct xlator_cbks cbks = { .release = pump_release, .releasedir = pump_releasedir, + .forget = pump_forget, }; struct volume_options options[] = { diff --git a/xlators/cluster/afr/src/pump.h b/xlators/cluster/afr/src/pump.h index 02eede49c..bc4c31a78 100644 --- a/xlators/cluster/afr/src/pump.h +++ b/xlators/cluster/afr/src/pump.h @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ #ifndef __PUMP_H__ @@ -26,12 +17,6 @@ #define CLIENT_CMD_CONNECT "trusted.glusterfs.client-connect" #define CLIENT_CMD_DISCONNECT "trusted.glusterfs.client-disconnect" -#define PUMP_CMD_START "trusted.glusterfs.pump.start" -#define PUMP_CMD_COMMIT "trusted.glusterfs.pump.commit" -#define PUMP_CMD_ABORT "trusted.glusterfs.pump.abort" -#define PUMP_CMD_PAUSE "trusted.glusterfs.pump.pause" -#define PUMP_CMD_STATUS "trusted.glusterfs.pump.status" - #define PUMP_SOURCE_COMPLETE "trusted.glusterfs.pump-source-complete" #define PUMP_SINK_COMPLETE "trusted.glusterfs.pump-sink-complete" @@ -50,7 +35,7 @@ typedef enum { typedef struct _pump_private { struct syncenv *env; /* The env pointer to the pump synctask */ - const char *resume_path; /* path to resume from the last pause */ + char *resume_path; /* path to resume from the last pause */ gf_lock_t resume_path_lock; /* Synchronize resume_path changes */ gf_lock_t pump_state_lock; /* Synchronize pump_state changes */ pump_state_t pump_state; /* State of pump */ diff --git a/xlators/cluster/dht/src/Makefile.am b/xlators/cluster/dht/src/Makefile.am index e35058d65..174bea841 100644 --- a/xlators/cluster/dht/src/Makefile.am +++ b/xlators/cluster/dht/src/Makefile.am @@ -4,7 +4,7 @@ xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster dht_common_source = dht-layout.c dht-helper.c dht-linkfile.c dht-rebalance.c \ dht-selfheal.c dht-rename.c dht-hashfn.c dht-diskusage.c \ - dht-common.c dht-inode-write.c dht-inode-read.c \ + dht-common.c dht-inode-write.c dht-inode-read.c dht-shared.c \ $(top_builddir)/xlators/lib/src/libxlator.c dht_la_SOURCES = $(dht_common_source) dht.c @@ -12,22 +12,23 @@ dht_la_SOURCES = $(dht_common_source) dht.c nufa_la_SOURCES = $(dht_common_source) nufa.c switch_la_SOURCES = $(dht_common_source) switch.c -dht_la_LDFLAGS = -module -avoidversion +dht_la_LDFLAGS = -module -avoid-version dht_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -nufa_la_LDFLAGS = -module -avoidversion 
+nufa_la_LDFLAGS = -module -avoid-version nufa_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -switch_la_LDFLAGS = -module -avoidversion +switch_la_LDFLAGS = -module -avoid-version switch_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la noinst_HEADERS = dht-common.h dht-mem-types.h \ $(top_builddir)/xlators/lib/src/libxlator.h -AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ - -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) \ +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ -I$(top_srcdir)/xlators/lib/src +AM_CFLAGS = -Wall $(GF_CFLAGS) + CLEANFILES = uninstall-local: diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c index e221e10ab..8f61339e6 100644 --- a/xlators/cluster/dht/src/dht-common.c +++ b/xlators/cluster/dht/src/dht-common.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2009-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ @@ -31,17 +22,18 @@ #include "dht-common.h" #include "defaults.h" #include "byte-order.h" +#include "glusterfs-acl.h" #include <sys/time.h> #include <libgen.h> -void +int dht_aggregate (dict_t *this, char *key, data_t *value, void *data) { dict_t *dst = NULL; int64_t *ptr = 0, *size = NULL; int32_t ret = -1; - data_pair_t *data_pair = NULL; + data_t *dict_data = NULL; dst = data; @@ -53,32 +45,37 @@ dht_aggregate (dict_t *this, char *key, data_t *value, void *data) if (size == NULL) { gf_log ("dht", GF_LOG_WARNING, "memory allocation failed"); - return; + return -1; } ret = dict_set_bin (dst, key, size, sizeof (int64_t)); if (ret < 0) { gf_log ("dht", GF_LOG_WARNING, "dht aggregate dict set failed"); GF_FREE (size); - return; + return -1; } } ptr = data_to_bin (value); if (ptr == NULL) { gf_log ("dht", GF_LOG_WARNING, "data to bin failed"); - return; + return -1; } *size = hton64 (ntoh64 (*size) + ntoh64 (*ptr)); + + } else if (fnmatch (GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0) { + ret = gf_get_min_stime (THIS, dst, key, value); + if (ret < 0) + return ret; } else { /* compare user xattrs only */ if (!strncmp (key, "user.", strlen ("user."))) { - ret = dict_lookup (dst, key, &data_pair); - if (!ret && data) { - ret = is_data_equal (data_pair->value, value); + ret = dict_lookup (dst, key, &dict_data); + if (!ret && dict_data && value) { + ret = is_data_equal (dict_data, value); if (!ret) - gf_log ("dht", GF_LOG_WARNING, + gf_log ("dht", GF_LOG_DEBUG, "xattr mismatch for %s", key); } } @@ -87,7 +84,7 @@ dht_aggregate (dict_t *this, char *key, data_t *value, void *data) gf_log ("dht", GF_LOG_WARNING, "xattr dict set failed"); } - return; + return 0; } @@ -114,7 +111,7 @@ out: int dht_lookup_selfheal_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno) + int op_ret, int op_errno, dict_t *xdata) { dht_local_t *local = NULL; dht_layout_t *layout = NULL; @@ -134,7 +131,12 @@ dht_lookup_selfheal_cbk (call_frame_t *frame, void 
*cookie, ret = dht_layout_set (this, local->inode, layout); } - WIPE (&local->postparent); + if (local->loc.parent) { + dht_inode_ctx_time_update (local->loc.parent, this, + &local->postparent, 1); + } + + DHT_STRIP_PHASE1_FLAGS (&local->stbuf); DHT_STACK_UNWIND (lookup, frame, ret, local->op_errno, local->inode, &local->stbuf, local->xattr, &local->postparent); @@ -145,6 +147,256 @@ out: int +dht_discover_complete (xlator_t *this, call_frame_t *discover_frame) +{ + dht_local_t *local = NULL; + call_frame_t *main_frame = NULL; + int op_errno = 0; + int ret = -1; + dht_layout_t *layout = NULL; + dht_conf_t *conf = NULL; + + local = discover_frame->local; + layout = local->layout; + conf = this->private; + + LOCK(&discover_frame->lock); + { + main_frame = local->main_frame; + local->main_frame = NULL; + } + UNLOCK(&discover_frame->lock); + + if (!main_frame) + return 0; + + if (local->file_count && local->dir_count) { + gf_log (this->name, GF_LOG_ERROR, + "path %s exists as a file on one subvolume " + "and directory on another. " + "Please fix it manually", + local->loc.path); + op_errno = EIO; + goto out; + } + + if (local->cached_subvol) { + ret = dht_layout_preset (this, local->cached_subvol, + local->inode); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "failed to set layout for subvolume %s", + local->cached_subvol ? local->cached_subvol->name : "<nil>"); + op_errno = EINVAL; + goto out; + } + } else { + ret = dht_layout_normalize (this, &local->loc, layout); + if ((ret < 0) || ((ret > 0) && (local->op_ret != 0))) { + /* either the layout is incorrect or the directory is + * not found even in one subvolume. + */ + gf_log (this->name, GF_LOG_DEBUG, + "normalizing failed on %s " + "(overlaps/holes present: %s, " + "ENOENT errors: %d)", local->loc.path, + (ret < 0) ? "yes" : "no", (ret > 0) ? 
ret : 0); + if ((ret > 0) && (ret == conf->subvolume_cnt)) { + op_errno = ESTALE; + goto out; + } + } + + if (local->inode) + dht_layout_set (this, local->inode, layout); + } + + DHT_STACK_UNWIND (lookup, main_frame, local->op_ret, local->op_errno, + local->inode, &local->stbuf, local->xattr, + &local->postparent); + return 0; +out: + DHT_STACK_UNWIND (lookup, main_frame, -1, op_errno, NULL, NULL, NULL, + NULL); + + return ret; +} + + +int +dht_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + inode_t *inode, struct iatt *stbuf, dict_t *xattr, + struct iatt *postparent) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + call_frame_t *prev = NULL; + dht_layout_t *layout = NULL; + int ret = -1; + int is_dir = 0; + int is_linkfile = 0; + int attempt_unwind = 0; + dht_conf_t *conf = 0; + + GF_VALIDATE_OR_GOTO ("dht", frame, out); + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO ("dht", frame->local, out); + GF_VALIDATE_OR_GOTO ("dht", this->private, out); + GF_VALIDATE_OR_GOTO ("dht", cookie, out); + + local = frame->local; + prev = cookie; + conf = this->private; + + layout = local->layout; + + /* Check if the gfid is different for file from other node */ + if (!op_ret && uuid_compare (local->gfid, stbuf->ia_gfid)) { + gf_log (this->name, GF_LOG_WARNING, + "%s: gfid different on %s", + local->loc.path, prev->this->name); + } + + + LOCK (&frame->lock); + { + /* TODO: assert equal mode on stbuf->st_mode and + local->stbuf->st_mode + + else mkdir/chmod/chown and fix + */ + ret = dht_layout_merge (this, layout, prev->this, + op_ret, op_errno, xattr); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to merge layouts", local->loc.path); + + if (op_ret == -1) { + local->op_errno = op_errno; + gf_log (this->name, GF_LOG_DEBUG, + "lookup of %s on %s returned error (%s)", + local->loc.path, prev->this->name, + strerror (op_errno)); + + goto unlock; + } + + is_linkfile = check_is_linkfile (inode, 
stbuf, xattr, + conf->link_xattr_name); + is_dir = check_is_dir (inode, stbuf, xattr); + + if (is_dir) { + local->dir_count ++; + } else { + local->file_count ++; + + if (!is_linkfile) { + /* real file */ + local->cached_subvol = prev->this; + attempt_unwind = 1; + } else { + goto unlock; + } + } + + local->op_ret = 0; + + if (local->xattr == NULL) { + local->xattr = dict_ref (xattr); + } else { + dht_aggregate_xattr (local->xattr, xattr); + } + + if (local->inode == NULL) + local->inode = inode_ref (inode); + + dht_iatt_merge (this, &local->stbuf, stbuf, prev->this); + dht_iatt_merge (this, &local->postparent, postparent, + prev->this); + } +unlock: + UNLOCK (&frame->lock); +out: + this_call_cnt = dht_frame_return (frame); + + if (is_last_call (this_call_cnt) || attempt_unwind) { + dht_discover_complete (this, frame); + } + + if (is_last_call (this_call_cnt)) + DHT_STACK_DESTROY (frame); + + return 0; +} + + +int +dht_discover (call_frame_t *frame, xlator_t *this, loc_t *loc) +{ + int ret; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int call_cnt = 0; + int op_errno = EINVAL; + int i = 0; + call_frame_t *discover_frame = NULL; + + conf = this->private; + local = frame->local; + + ret = dict_set_uint32 (local->xattr_req, conf->xattr_name, 4 * 4); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to set '%s' key", + loc->path, conf->xattr_name); + + ret = dict_set_uint32 (local->xattr_req, conf->link_xattr_name, 256); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to set '%s' key", + loc->path, conf->link_xattr_name); + + call_cnt = conf->subvolume_cnt; + local->call_cnt = call_cnt; + + local->layout = dht_layout_new (this, conf->subvolume_cnt); + + if (!local->layout) { + op_errno = ENOMEM; + goto err; + } + + uuid_copy (local->gfid, loc->gfid); + + discover_frame = copy_frame (frame); + if (!discover_frame) { + op_errno = ENOMEM; + goto err; + } + + discover_frame->local = local; + frame->local = NULL; + local->main_frame 
= frame; + + for (i = 0; i < call_cnt; i++) { + STACK_WIND (discover_frame, dht_discover_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->lookup, + &local->loc, local->xattr_req); + } + + return 0; + +err: + DHT_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, + NULL); + + return 0; +} + + +int dht_lookup_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, dict_t *xattr, @@ -189,7 +441,7 @@ dht_lookup_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, op_ret, op_errno, xattr); if (op_ret == -1) { - local->op_errno = ENOENT; + local->op_errno = op_errno; gf_log (this->name, GF_LOG_DEBUG, "lookup of %s on %s returned error (%s)", local->loc.path, prev->this->name, @@ -248,6 +500,12 @@ unlock: dht_layout_set (this, local->inode, layout); } + if (local->loc.parent) { + dht_inode_ctx_time_update (local->loc.parent, this, + &local->postparent, 1); + } + + DHT_STRIP_PHASE1_FLAGS (&local->stbuf); DHT_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, local->inode, &local->stbuf, local->xattr, &local->postparent); @@ -257,6 +515,7 @@ unlock: selfheal: FRAME_SU_DO (frame, dht_local_t); + uuid_copy (local->loc.gfid, local->gfid); ret = dht_selfheal_directory (frame, dht_lookup_selfheal_cbk, &local->loc, layout); out: @@ -277,6 +536,8 @@ dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int ret = -1; int is_dir = 0; int is_linkfile = 0; + call_frame_t *copy = NULL; + dht_local_t *copy_local = NULL; GF_VALIDATE_OR_GOTO ("dht", frame, err); GF_VALIDATE_OR_GOTO ("dht", this, err); @@ -303,7 +564,7 @@ dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, strerror (op_errno)); } if (op_errno == ESTALE) { - /* propogate the ESTALE to parent. + /* propagate the ESTALE to parent. * setting local->return_estale would send * ESTALE to parent. 
*/ local->return_estale = 1; @@ -335,7 +596,8 @@ dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, layout = local->layout; is_dir = check_is_dir (inode, stbuf, xattr); - is_linkfile = check_is_linkfile (inode, stbuf, xattr); + is_linkfile = check_is_linkfile (inode, stbuf, xattr, + conf->link_xattr_name); if (is_linkfile) { gf_log (this->name, GF_LOG_INFO, @@ -347,6 +609,23 @@ dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } if (is_dir) { + ret = dht_dir_has_layout (xattr, conf->xattr_name); + if (ret >= 0) { + if (is_greater_time(local->stbuf.ia_ctime, + local->stbuf.ia_ctime_nsec, + stbuf->ia_ctime, + stbuf->ia_ctime_nsec)) { + local->prebuf.ia_gid = stbuf->ia_gid; + local->prebuf.ia_uid = stbuf->ia_uid; + } + } + if (local->stbuf.ia_type != IA_INVAL) + { + if ((local->stbuf.ia_gid != stbuf->ia_gid) || + (local->stbuf.ia_uid != stbuf->ia_uid)) { + local->need_selfheal = 1; + } + } ret = dht_layout_dir_mismatch (this, layout, prev->this, &local->loc, xattr); @@ -385,7 +664,28 @@ out: && (conf && conf->unhashed_sticky_bit)) { local->stbuf.ia_prot.sticky = 1; } - if (local->layout_mismatch) { + if (local->need_selfheal) { + local->need_selfheal = 0; + uuid_copy (local->gfid, local->stbuf.ia_gfid); + local->stbuf.ia_gid = local->prebuf.ia_gid; + local->stbuf.ia_uid = local->prebuf.ia_uid; + copy = create_frame (this, this->ctx->pool); + if (copy) { + copy_local = dht_local_init (copy, &local->loc, + NULL, 0); + if (!copy_local) + goto cont; + copy_local->stbuf = local->stbuf; + copy->local = copy_local; + FRAME_SU_DO (copy, dht_local_t); + ret = synctask_new (this->ctx->env, + dht_dir_attr_heal, + dht_dir_attr_heal_done, + copy, copy); + } + } +cont: + if (local->layout_mismatch) { /* Found layout mismatch in the directory, need to fix this in the inode context */ dht_layout_unref (this, local->layout); @@ -411,8 +711,12 @@ out: local->op_errno = ESTALE; } - WIPE (&local->postparent); + if (local->loc.parent) { + 
dht_inode_ctx_time_update (local->loc.parent, this, + &local->postparent, 1); + } + DHT_STRIP_PHASE1_FLAGS (&local->stbuf); DHT_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, local->inode, &local->stbuf, local->xattr, &local->postparent); @@ -428,7 +732,8 @@ dht_lookup_linkfile_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent) + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { dht_local_t *local = NULL; xlator_t *cached_subvol = NULL; @@ -461,9 +766,16 @@ dht_lookup_linkfile_create_cbk (call_frame_t *frame, void *cookie, local->stbuf.ia_prot.sticky = 1; } + if (local->loc.parent) { + dht_inode_ctx_time_update (local->loc.parent, this, + postparent, 1); + } + unwind: - WIPE (&local->postparent); + if (local->linked == _gf_true) + dht_linkfile_attr_heal (frame, this); + DHT_STRIP_PHASE1_FLAGS (&local->stbuf); DHT_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, local->inode, &local->stbuf, local->xattr, &local->postparent); @@ -534,8 +846,12 @@ dht_lookup_everywhere_done (call_frame_t *frame, xlator_t *this) "<nil>")); } - WIPE (&local->postparent); + if (local->loc.parent) { + dht_inode_ctx_time_update (local->loc.parent, this, + &local->postparent, 1); + } + DHT_STRIP_PHASE1_FLAGS (&local->stbuf); DHT_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, local->inode, &local->stbuf, local->xattr, @@ -563,8 +879,12 @@ dht_lookup_everywhere_done (call_frame_t *frame, xlator_t *this) local->op_errno = EINVAL; } - WIPE (&local->postparent); + if (local->loc.parent) { + dht_inode_ctx_time_update (local->loc.parent, this, + &local->postparent, 1); + } + DHT_STRIP_PHASE1_FLAGS (&local->stbuf); DHT_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, local->inode, &local->stbuf, local->xattr, @@ -578,7 +898,7 @@ dht_lookup_everywhere_done (call_frame_t *frame, xlator_t *this) 
hashed_subvol->name); ret = dht_linkfile_create (frame, - dht_lookup_linkfile_create_cbk, + dht_lookup_linkfile_create_cbk, this, cached_subvol, hashed_subvol, &local->loc); return ret; @@ -588,7 +908,8 @@ dht_lookup_everywhere_done (call_frame_t *frame, xlator_t *this) int dht_lookup_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, - struct iatt *preparent, struct iatt *postparent) + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { int this_call_cnt = 0; @@ -615,8 +936,9 @@ dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this, xlator_t *subvol = NULL; loc_t *loc = NULL; xlator_t *link_subvol = NULL; - int ret = -1; - int32_t fd_count = 0; + int ret = -1; + int32_t fd_count = 0; + dht_conf_t *conf = NULL; GF_VALIDATE_OR_GOTO ("dht", frame, out); GF_VALIDATE_OR_GOTO ("dht", this, out); @@ -626,6 +948,7 @@ dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local = frame->local; loc = &local->loc; + conf = this->private; prev = cookie; subvol = prev->this; @@ -647,7 +970,8 @@ dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this, loc->path, prev->this->name); } - is_linkfile = check_is_linkfile (inode, buf, xattr); + is_linkfile = check_is_linkfile (inode, buf, xattr, + conf->link_xattr_name); is_dir = check_is_dir (inode, buf, xattr); if (is_linkfile) { @@ -688,7 +1012,7 @@ dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this, /* This is where we need 'rename' both entries logic */ gf_log (this->name, GF_LOG_WARNING, "multiple subvolumes (%s and %s) have " - "file %s (preferrably rename the file " + "file %s (preferably rename the file " "in the backend, and do a fresh lookup)", local->cached_subvol->name, subvol->name, local->loc.path); @@ -707,7 +1031,7 @@ unlock: "deleting stale linkfile %s on %s", loc->path, subvol->name); STACK_WIND (frame, dht_lookup_unlink_cbk, - subvol, subvol->fops->unlink, loc); + 
subvol, subvol->fops->unlink, loc, 0, NULL); return 0; } } @@ -789,7 +1113,16 @@ dht_lookup_linkfile_cbk (call_frame_t *frame, void *cookie, gf_log (this->name, GF_LOG_INFO, "lookup of %s on %s (following linkfile) failed (%s)", local->loc.path, subvol->name, strerror (op_errno)); - goto err; + + /* If cached subvol returned ENOTCONN, do not do + lookup_everywhere. We need to make sure linkfile does not get + removed, which can take away the namespace, and subvol is + anyways down. */ + + if (op_errno != ENOTCONN) + goto err; + else + goto unwind; } if (check_is_dir (inode, stbuf, xattr)) { @@ -799,7 +1132,7 @@ dht_lookup_linkfile_cbk (call_frame_t *frame, void *cookie, goto err; } - if (check_is_linkfile (inode, stbuf, xattr)) { + if (check_is_linkfile (inode, stbuf, xattr, conf->link_xattr_name)) { gf_log (this->name, GF_LOG_INFO, "lookup of %s on %s (following linkfile) reached link", local->loc.path, subvol->name); @@ -827,9 +1160,13 @@ dht_lookup_linkfile_cbk (call_frame_t *frame, void *cookie, op_errno = EINVAL; } -unwind: - WIPE (postparent); + if (local->loc.parent) { + dht_inode_ctx_time_update (local->loc.parent, this, + postparent, 1); + } +unwind: + DHT_STRIP_PHASE1_FLAGS (stbuf); DHT_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, stbuf, xattr, postparent); @@ -873,9 +1210,13 @@ dht_lookup_directory (call_frame_t *frame, xlator_t *this, loc_t *loc) local->xattr = NULL; } - if (!uuid_is_null (local->gfid)) + if (!uuid_is_null (local->gfid)) { ret = dict_set_static_bin (local->xattr_req, "gfid-req", local->gfid, 16); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to set gfid", local->loc.path); + } for (i = 0; i < call_cnt; i++) { STACK_WIND (frame, dht_lookup_dir_cbk, @@ -906,7 +1247,6 @@ dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, loc_t *loc = NULL; call_frame_t *prev = NULL; int ret = 0; - uint64_t tmp_layout = 0; dht_layout_t *parent_layout = NULL; GF_VALIDATE_OR_GOTO ("dht", frame, err); @@ -937,8 
+1277,10 @@ dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } if ((conf->search_unhashed == GF_DHT_LOOKUP_UNHASHED_AUTO) && (loc->parent)) { - ret = inode_ctx_get (loc->parent, this, &tmp_layout); - parent_layout = (dht_layout_t *)(long)tmp_layout; + ret = dht_inode_ctx_layout_get (loc->parent, this, + &parent_layout); + if (ret || !parent_layout) + goto out; if (parent_layout->search_unhashed) { local->op_errno = ENOENT; dht_lookup_everywhere (frame, this, loc); @@ -967,7 +1309,8 @@ dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto out; } - is_linkfile = check_is_linkfile (inode, stbuf, xattr); + is_linkfile = check_is_linkfile (inode, stbuf, xattr, + conf->link_xattr_name); if (!is_linkfile) { /* non-directory and not a linkfile */ @@ -1007,14 +1350,51 @@ out: * from each of the subvolume. See dht_iatt_merge for reference. */ - WIPE (postparent); + if (!op_ret && local->loc.parent) { + dht_inode_ctx_time_update (local->loc.parent, this, + postparent, 1); + } + DHT_STRIP_PHASE1_FLAGS (stbuf); DHT_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, stbuf, xattr, postparent); err: return 0; } +/* For directories, check if acl xattrs have been requested (by the acl xlator), + * if not, request for them. 
These xattrs are needed for dht dir self-heal to + * perform proper self-healing of dirs + */ +void +dht_check_and_set_acl_xattr_req (inode_t *inode, dict_t *xattr_req) +{ + int ret = 0; + + GF_ASSERT (inode); + GF_ASSERT (xattr_req); + + if (inode->ia_type != IA_IFDIR) + return; + + if (!dict_get (xattr_req, POSIX_ACL_ACCESS_XATTR)) { + ret = dict_set_int8 (xattr_req, POSIX_ACL_ACCESS_XATTR, 0); + if (ret) + gf_log (THIS->name, GF_LOG_WARNING, + "failed to set key %s", + POSIX_ACL_ACCESS_XATTR); + } + + if (!dict_get (xattr_req, POSIX_ACL_DEFAULT_XATTR)) { + ret = dict_set_int8 (xattr_req, POSIX_ACL_DEFAULT_XATTR, 0); + if (ret) + gf_log (THIS->name, GF_LOG_WARNING, + "failed to set key %s", + POSIX_ACL_DEFAULT_XATTR); + } + + return; +} int dht_lookup (call_frame_t *frame, xlator_t *this, @@ -1022,7 +1402,6 @@ dht_lookup (call_frame_t *frame, xlator_t *this, { xlator_t *subvol = NULL; xlator_t *hashed_subvol = NULL; - xlator_t *cached_subvol = NULL; dht_local_t *local = NULL; dht_conf_t *conf = NULL; int ret = -1; @@ -1030,13 +1409,12 @@ dht_lookup (call_frame_t *frame, xlator_t *this, dht_layout_t *layout = NULL; int i = 0; int call_cnt = 0; - + loc_t new_loc = {0,}; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (loc, err); VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); conf = this->private; if (!conf) @@ -1047,9 +1425,17 @@ dht_lookup (call_frame_t *frame, xlator_t *this, op_errno = ENOMEM; goto err; } - if (!dht_filter_loc_subvol_key (this, loc, &local->loc, - &hashed_subvol)) { - ret = loc_dup (loc, &local->loc); + + ret = dht_filter_loc_subvol_key (this, loc, &new_loc, + &hashed_subvol); + if (ret) { + loc_wipe (&local->loc); + ret = loc_dup (&new_loc, &local->loc); + + /* we no more need 'new_loc' entries */ + loc_wipe (&new_loc); + + /* check if loc_dup() is successful */ if (ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_DEBUG, @@ -1065,8 +1451,13 @@ dht_lookup (call_frame_t *frame, 
xlator_t *this, local->xattr_req = dict_new (); } + if (uuid_is_null (loc->pargfid) && !uuid_is_null (loc->gfid) && + !__is_root_gfid (loc->inode->gfid)) { + local->cached_subvol = NULL; + dht_discover (frame, this, loc); + return 0; + } - cached_subvol = local->cached_subvol; if (!hashed_subvol) hashed_subvol = dht_subvol_get_hashed (this, loc); local->hashed_subvol = hashed_subvol; @@ -1094,43 +1485,58 @@ dht_lookup (call_frame_t *frame, xlator_t *this, local->inode = inode_ref (loc->inode); - call_cnt = local->call_cnt = layout->cnt; - /* NOTE: we don't require 'trusted.glusterfs.dht.linkto' attribute, * revalidates directly go to the cached-subvolume. */ ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht", 4 * 4); + conf->xattr_name, 4 * 4); + + if (IA_ISDIR (local->inode->ia_type)) { + local->call_cnt = call_cnt = conf->subvolume_cnt; + for (i = 0; i < call_cnt; i++) { + STACK_WIND (frame, dht_revalidate_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->lookup, + loc, local->xattr_req); + } + return 0; + } + + call_cnt = local->call_cnt = layout->cnt; /* need it for self-healing linkfiles which is 'in-migration' state */ ret = dict_set_uint32 (local->xattr_req, GLUSTERFS_OPEN_FD_COUNT, 4); - for (i = 0; i < layout->cnt; i++) { + /* need it for dir self-heal */ + dht_check_and_set_acl_xattr_req (loc->inode, local->xattr_req); + + for (i = 0; i < call_cnt; i++) { subvol = layout->list[i].xlator; STACK_WIND (frame, dht_revalidate_cbk, subvol, subvol->fops->lookup, &local->loc, local->xattr_req); - if (!--call_cnt) - break; } } else { do_fresh_lookup: /* TODO: remove the hard-coding */ ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht", 4 * 4); + conf->xattr_name, 4 * 4); ret = dict_set_uint32 (local->xattr_req, - DHT_LINKFILE_KEY, 256); + conf->link_xattr_name, 256); /* need it for self-healing linkfiles which is 'in-migration' state */ ret = dict_set_uint32 (local->xattr_req, GLUSTERFS_OPEN_FD_COUNT, 4); + /* need it for dir 
self-heal */ + dht_check_and_set_acl_xattr_req (loc->inode, local->xattr_req); + if (!hashed_subvol) { gf_log (this->name, GF_LOG_DEBUG, "no subvolume in layout for path=%s, " @@ -1164,7 +1570,8 @@ dht_lookup (call_frame_t *frame, xlator_t *this, err: op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); + DHT_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, + NULL); return 0; } @@ -1172,7 +1579,7 @@ err: int dht_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { dht_local_t *local = NULL; call_frame_t *prev = NULL; @@ -1196,14 +1603,18 @@ dht_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->postparent = *postparent; local->preparent = *preparent; - WIPE (&local->postparent); - WIPE (&local->preparent); + if (local->loc.parent) { + dht_inode_ctx_time_update (local->loc.parent, this, + &local->preparent, 0); + dht_inode_ctx_time_update (local->loc.parent, this, + &local->postparent, 1); + } } unlock: UNLOCK (&frame->lock); DHT_STACK_UNWIND (unlink, frame, local->op_ret, local->op_errno, - &local->preparent, &local->postparent); + &local->preparent, &local->postparent, NULL); return 0; } @@ -1212,7 +1623,7 @@ unlock: int dht_unlink_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { dht_local_t *local = NULL; call_frame_t *prev = NULL; @@ -1224,7 +1635,8 @@ dht_unlink_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, LOCK (&frame->lock); { - if (op_ret == -1) { + if ((op_ret == -1) && !((op_errno == ENOENT) || + (op_errno == ENOTCONN))) { local->op_errno = op_errno; gf_log (this->name, GF_LOG_DEBUG, "subvolume %s returned -1 (%s)", @@ -1237,7 +1649,7 @@ dht_unlink_linkfile_cbk (call_frame_t 
*frame, void *cookie, xlator_t *this, unlock: UNLOCK (&frame->lock); - if (op_ret == -1) + if (local->op_ret == -1) goto err; cached_subvol = dht_subvol_get_cached (this, local->loc.inode); @@ -1251,21 +1663,19 @@ unlock: STACK_WIND (frame, dht_unlink_cbk, cached_subvol, cached_subvol->fops->unlink, - &local->loc); + &local->loc, local->flags, NULL); return 0; err: DHT_STACK_UNWIND (unlink, frame, -1, local->op_errno, - NULL, NULL); + NULL, NULL, NULL); return 0; } - - int dht_err_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno) + int op_ret, int op_errno, dict_t *xdata) { dht_local_t *local = NULL; int this_call_cnt = 0; @@ -1291,7 +1701,8 @@ unlock: this_call_cnt = dht_frame_return (frame); if (is_last_call (this_call_cnt)) { - DHT_STACK_UNWIND (setxattr, frame, local->op_ret, local->op_errno); + DHT_STACK_UNWIND (setxattr, frame, local->op_ret, + local->op_errno, NULL); } return 0; @@ -1314,114 +1725,249 @@ fill_layout_info (dht_layout_t *layout, char *buf) } } +void +dht_fill_pathinfo_xattr (xlator_t *this, dht_local_t *local, + char *xattr_buf, int32_t alloc_len, + int flag, char *layout_buf) +{ + if (flag && local->xattr_val) + snprintf (xattr_buf, alloc_len, + "((<"DHT_PATHINFO_HEADER"%s> %s) (%s-layout %s))", + this->name, local->xattr_val, this->name, + layout_buf); + else if (local->xattr_val) + snprintf (xattr_buf, alloc_len, + "(<"DHT_PATHINFO_HEADER"%s> %s)", + this->name, local->xattr_val); + else if (flag) + snprintf (xattr_buf, alloc_len, "(%s-layout %s)", + this->name, layout_buf); +} + int -dht_pathinfo_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, dict_t *xattr) +dht_vgetxattr_alloc_and_fill (dht_local_t *local, dict_t *xattr, xlator_t *this, + int op_errno) +{ + int ret = -1; + char *value = NULL; + int32_t plen = 0; + + ret = dict_get_str (xattr, local->xsel, &value); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Subvolume %s returned -1 (%s)", this->name, + 
strerror (op_errno)); + local->op_ret = -1; + local->op_errno = op_errno; + goto out; + } + + local->alloc_len += strlen(value); + + if (!local->xattr_val) { + local->alloc_len += (strlen (DHT_PATHINFO_HEADER) + 10); + local->xattr_val = GF_CALLOC (local->alloc_len, sizeof (char), + gf_common_mt_char); + if (!local->xattr_val) { + ret = -1; + goto out; + } + } + + if (local->xattr_val) { + plen = strlen (local->xattr_val); + if (plen) { + /* extra byte(s) for \0 to be safe */ + local->alloc_len += (plen + 2); + local->xattr_val = GF_REALLOC (local->xattr_val, + local->alloc_len); + if (!local->xattr_val) { + ret = -1; + goto out; + } + } + + (void) strcat (local->xattr_val, value); + (void) strcat (local->xattr_val, " "); + local->op_ret = 0; + } + + ret = 0; + + out: + return ret; +} + +int +dht_vgetxattr_fill_and_set (dht_local_t *local, dict_t **dict, xlator_t *this, + gf_boolean_t flag) +{ + int ret = -1; + char *xattr_buf = NULL; + char layout_buf[8192] = {0,}; + + if (flag) + fill_layout_info (local->layout, layout_buf); + + *dict = dict_new (); + if (!*dict) + goto out; + + local->xattr_val[strlen (local->xattr_val) - 1] = '\0'; + + /* we would need max this many bytes to create xattr string + * extra 40 bytes is just an estimated amount of additional + * space required as we include translator name and some + * spaces, brackets etc. when forming the pathinfo string. 
+ * + * For node-uuid we just don't have all the pretty formatting, + * but since this is a generic routine for pathinfo & node-uuid + * we dont have conditional space allocation and try to be + * generic + */ + local->alloc_len += (2 * strlen (this->name)) + + strlen (layout_buf) + + 40; + xattr_buf = GF_CALLOC (local->alloc_len, sizeof (char), + gf_common_mt_char); + if (!xattr_buf) + goto out; + + if (XATTR_IS_PATHINFO (local->xsel)) { + (void) dht_fill_pathinfo_xattr (this, local, xattr_buf, + local->alloc_len, flag, + layout_buf); + } else if (XATTR_IS_NODE_UUID (local->xsel)) { + (void) snprintf (xattr_buf, local->alloc_len, "%s", + local->xattr_val); + } else { + gf_log (this->name, GF_LOG_WARNING, + "Unknown local->xsel (%s)", local->xsel); + goto out; + } + + ret = dict_set_dynstr (*dict, local->xsel, xattr_buf); + GF_FREE (local->xattr_val); + + out: + return ret; +} + +int +dht_vgetxattr_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) { - dht_local_t *local = NULL; int ret = 0; - int flag = 0; + dht_local_t *local = NULL; int this_call_cnt = 0; - char *value_got = NULL; - char layout_buf[8192] = {0,}; - char *xattr_buf = NULL; dict_t *dict = NULL; - int32_t alloc_len = 0; - int32_t plen = 0; - local = frame->local; + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (frame->local, out); - if (op_ret != -1) { - ret = dict_get_str (xattr, GF_XATTR_PATHINFO_KEY, &value_got); - if (!ret) { - alloc_len = strlen (value_got); + local = frame->local; - /** - * allocate the buffer:- we allocate 10 bytes extra in case we need to - * append ' Link: ' in the buffer for another STACK_WIND - */ - if (!local->pathinfo) { - alloc_len += (strlen (DHT_PATHINFO_HEADER) + 10); - local->pathinfo = GF_CALLOC (alloc_len, sizeof (char), gf_common_mt_char); + LOCK (&frame->lock); + { + this_call_cnt = --local->call_cnt; + if (op_ret < 0) { + if (op_errno != ENOTCONN) { + gf_log (this->name, GF_LOG_ERROR, + 
"getxattr err (%s) for dir", + strerror (op_errno)); + local->op_ret = -1; + local->op_errno = op_errno; } - if (local->pathinfo) { - plen = strlen (local->pathinfo); - if (plen) { - /* extra byte(s) for \0 to be safe */ - alloc_len += (plen + 2); - local->pathinfo = GF_REALLOC (local->pathinfo, - alloc_len); - if (!local->pathinfo) - goto out; - } - - strcat (local->pathinfo, value_got); - } + goto unlock; } + + ret = dht_vgetxattr_alloc_and_fill (local, xattr, this, + op_errno); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "alloc or fill failure"); } + unlock: + UNLOCK (&frame->lock); - out: - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) { - if (local->layout->cnt > 1) { - /* Set it for directory */ - fill_layout_info (local->layout, layout_buf); - flag = 1; - } + if (!is_last_call (this_call_cnt)) + goto out; - dict = dict_new (); + /* -- last call: do patch ups -- */ - /* we would need max-to-max this many bytes to create pathinfo string */ - alloc_len += (2 * strlen (this->name)) + strlen (layout_buf) + 40; - xattr_buf = GF_CALLOC (alloc_len, sizeof (char), gf_common_mt_char); + if (local->op_ret == -1) { + goto unwind; + } - if (flag && local->pathinfo) - snprintf (xattr_buf, alloc_len, "((<"DHT_PATHINFO_HEADER"%s> %s) (%s-layout %s))", - this->name, local->pathinfo, this->name, - layout_buf); - else if (local->pathinfo) - snprintf (xattr_buf, alloc_len, "(<"DHT_PATHINFO_HEADER"%s> %s)", - this->name, local->pathinfo); - else if (flag) - snprintf (xattr_buf, alloc_len, "(%s-layout %s)", - this->name, layout_buf); + ret = dht_vgetxattr_fill_and_set (local, &dict, this, _gf_true); + if (ret) + goto unwind; - ret = dict_set_dynstr (dict, GF_XATTR_PATHINFO_KEY, - xattr_buf); + DHT_STACK_UNWIND (getxattr, frame, 0, 0, dict, xdata); + goto cleanup; - if (local->pathinfo) - GF_FREE (local->pathinfo); + unwind: + DHT_STACK_UNWIND (getxattr, frame, -1, local->op_errno, NULL, NULL); + cleanup: + if (dict) + dict_unref (dict); + out: 
+ return 0; +} - DHT_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict); +int +dht_vgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) +{ + dht_local_t *local = NULL; + int ret = 0; + dict_t *dict = NULL; + call_frame_t *prev = NULL; + gf_boolean_t flag = _gf_true; - if (dict) - dict_unref (dict); + local = frame->local; + prev = cookie; - return 0; + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = op_errno; + gf_log (this->name, GF_LOG_ERROR, "Subvolume %s returned -1 " + "(%s)", prev->this->name, strerror (op_errno)); + goto unwind; } - if (local->pathinfo) - strcat (local->pathinfo, " Link: "); - if (local->hashed_subvol) { - /* This will happen if there pending */ - STACK_WIND (frame, dht_pathinfo_getxattr_cbk, local->hashed_subvol, - local->hashed_subvol->fops->getxattr, - &local->loc, local->key); - - return 0; + ret = dht_vgetxattr_alloc_and_fill (local, xattr, this, + op_errno); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "alloc or fill failure"); + goto unwind; } - gf_log ("this->name", GF_LOG_ERROR, "Unable to find hashed_subvol for path" - " %s", local->pathinfo); + flag = (local->layout->cnt > 1) ? 
_gf_true : _gf_false; + + ret = dht_vgetxattr_fill_and_set (local, &dict, this, flag); + if (ret) + goto unwind; + + DHT_STACK_UNWIND (getxattr, frame, 0, 0, dict, xdata); + goto cleanup; + + unwind: + DHT_STACK_UNWIND (getxattr, frame, -1, local->op_errno, + NULL, NULL); + cleanup: + if (dict) + dict_unref (dict); - DHT_STACK_UNWIND (getxattr, frame, -1, op_errno, dict); return 0; } int dht_linkinfo_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, dict_t *xattr) + int op_ret, int op_errno, dict_t *xattr, + dict_t *xdata) { int ret = 0; char *value = NULL; @@ -1436,21 +1982,24 @@ dht_linkinfo_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } } - DHT_STACK_UNWIND (getxattr, frame, op_ret, op_errno, xattr); + DHT_STACK_UNWIND (getxattr, frame, op_ret, op_errno, xattr, xdata); return 0; } int dht_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, dict_t *xattr) + int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) { int this_call_cnt = 0; dht_local_t *local = NULL; + dht_conf_t *conf = NULL; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (frame->local, out); + VALIDATE_OR_GOTO (this->private, out); + conf = this->private; local = frame->local; this_call_cnt = dht_frame_return (frame); @@ -1458,8 +2007,8 @@ dht_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (!xattr || (op_ret == -1)) goto out; - if (dict_get (xattr, "trusted.glusterfs.dht")) { - dict_del (xattr, "trusted.glusterfs.dht"); + if (dict_get (xattr, conf->xattr_name)) { + dict_del (xattr, conf->xattr_name); } local->op_ret = 0; @@ -1476,24 +2025,88 @@ dht_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } out: if (is_last_call (this_call_cnt)) { - DHT_STACK_UNWIND (getxattr, frame, local->op_ret, op_errno, local->xattr); + DHT_STACK_UNWIND (getxattr, frame, local->op_ret, op_errno, + local->xattr, NULL); } return 0; } int32_t dht_getxattr_unwind (call_frame_t 
*frame, - int op_ret, int op_errno, dict_t *dict) + int op_ret, int op_errno, dict_t *dict, dict_t *xdata) { - DHT_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict); + DHT_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, xdata); return 0; } int +dht_getxattr_get_real_filename_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, + dict_t *xattr, dict_t *xdata) +{ + int this_call_cnt = 0; + dht_local_t *local = NULL; + + + local = frame->local; + + if (op_ret != -1) { + if (local->xattr) + dict_unref (local->xattr); + local->xattr = dict_ref (xattr); + + if (local->xattr_req) + dict_unref (local->xattr_req); + local->xattr_req = dict_ref (xdata); + } + + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) { + DHT_STACK_UNWIND (getxattr, frame, local->op_ret, op_errno, + local->xattr, local->xattr_req); + } + + return 0; +} + + +int +dht_getxattr_get_real_filename (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *key, dict_t *xdata) +{ + dht_local_t *local = NULL; + int i = 0; + dht_layout_t *layout = NULL; + int cnt = 0; + xlator_t *subvol = NULL; + + + local = frame->local; + layout = local->layout; + + cnt = local->call_cnt = layout->cnt; + + local->op_ret = -1; + local->op_errno = ENODATA; + + for (i = 0; i < cnt; i++) { + subvol = layout->list[i].xlator; + STACK_WIND (frame, dht_getxattr_get_real_filename_cbk, + subvol, subvol->fops->getxattr, + loc, key, xdata); + } + + return 0; +} + + +int dht_getxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *key) + loc_t *loc, const char *key, dict_t *xdata) +#define DHT_IS_DIR(layout) (layout->cnt > 1) { + xlator_t *subvol = NULL; xlator_t *hashed_subvol = NULL; xlator_t *cached_subvol = NULL; @@ -1509,7 +2122,6 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (loc, err); VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); VALIDATE_OR_GOTO 
(this->private, err); conf = this->private; @@ -1537,24 +2149,67 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, } } - if (key && (strcmp (key, GF_XATTR_PATHINFO_KEY) == 0)) { - hashed_subvol = dht_subvol_get_hashed (this, loc); + if (key && + (strncmp (key, GF_XATTR_GET_REAL_FILENAME_KEY, + strlen (GF_XATTR_GET_REAL_FILENAME_KEY)) == 0) + && DHT_IS_DIR(layout)) { + dht_getxattr_get_real_filename (frame, this, loc, key, xdata); + return 0; + } + + /* for file use cached subvolume (obviously!): see if {} + * below + * for directory: + * wind to all subvolumes and exclude subvolumes which + * return ENOTCONN (in callback) + * + * NOTE: Don't trust inode here, as that may not be valid + * (until inode_link() happens) + */ + if (key && DHT_IS_DIR(layout) && + ((strcmp (key, GF_XATTR_PATHINFO_KEY) == 0) + || (strcmp (key, GF_XATTR_NODE_UUID_KEY) == 0))) { + (void) strncpy (local->xsel, key, 256); + cnt = local->call_cnt = layout->cnt; + for (i = 0; i < cnt; i++) { + subvol = layout->list[i].xlator; + STACK_WIND (frame, dht_vgetxattr_dir_cbk, + subvol, subvol->fops->getxattr, + loc, key, NULL); + } + return 0; + } + + /* node-uuid or pathinfo for files */ + if (key && ((strcmp (key, GF_XATTR_NODE_UUID_KEY) == 0) + || (strcmp (key, GF_XATTR_PATHINFO_KEY) == 0))) { cached_subvol = local->cached_subvol; + (void) strncpy (local->xsel, key, 256); local->call_cnt = 1; - if (hashed_subvol != cached_subvol) { - local->call_cnt = 2; - local->hashed_subvol = hashed_subvol; - } - - STACK_WIND (frame, dht_pathinfo_getxattr_cbk, cached_subvol, - cached_subvol->fops->getxattr, loc, key); + STACK_WIND (frame, dht_vgetxattr_cbk, cached_subvol, + cached_subvol->fops->getxattr, loc, key, NULL); return 0; } + if (key && (strcmp (key, GF_XATTR_LINKINFO_KEY) == 0)) { hashed_subvol = dht_subvol_get_hashed (this, loc); + if (!hashed_subvol) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get" + "hashed subvol for %s", loc->path); + op_errno = EINVAL; + goto err; + } + cached_subvol = 
dht_subvol_get_cached (this, loc->inode); + if (!cached_subvol) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get" + "cached subvol for %s", loc->path); + op_errno = EINVAL; + goto err; + } + if (hashed_subvol == cached_subvol) { op_errno = ENODATA; goto err; @@ -1562,7 +2217,7 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, if (hashed_subvol) { STACK_WIND (frame, dht_linkinfo_getxattr_cbk, hashed_subvol, hashed_subvol->fops->getxattr, loc, - GF_XATTR_PATHINFO_KEY); + GF_XATTR_PATHINFO_KEY, NULL); return 0; } op_errno = ENODATA; @@ -1570,13 +2225,13 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, } if (key && (!strcmp (GF_XATTR_MARKER_KEY, key)) - && (-1 == frame->root->pid)) { - - if (loc->inode-> ia_type == IA_IFDIR) { + && (GF_CLIENT_PID_GSYNCD == frame->root->pid)) { + if (DHT_IS_DIR(layout)) { cnt = layout->cnt; } else { cnt = 1; } + sub_volumes = alloca ( cnt * sizeof (xlator_t *)); for (i = 0; i < cnt; i++) *(sub_volumes + i) = layout->list[i].xlator; @@ -1584,7 +2239,8 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, if (cluster_getmarkerattr (frame, this, loc, key, local, dht_getxattr_unwind, sub_volumes, cnt, - MARKER_UUID_TYPE, conf->vol_uuid)) { + MARKER_UUID_TYPE, marker_uuid_default_gauge, + conf->vol_uuid)) { op_errno = EINVAL; goto err; } @@ -1594,8 +2250,8 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, if (key && *conf->vol_uuid) { if ((match_uuid_local (key, conf->vol_uuid) == 0) && - (-1 == frame->root->pid)) { - if (loc->inode-> ia_type == IA_IFDIR) { + (GF_CLIENT_PID_GSYNCD == frame->root->pid)) { + if (DHT_IS_DIR(layout)) { cnt = layout->cnt; } else { cnt = 1; @@ -1608,6 +2264,7 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, local, dht_getxattr_unwind, sub_volumes, cnt, MARKER_XTIME_TYPE, + marker_xtime_default_gauge, conf->vol_uuid)) { op_errno = EINVAL; goto err; @@ -1617,7 +2274,7 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, } } - if (loc->inode-> ia_type == IA_IFDIR) { + if (DHT_IS_DIR(layout)) { 
cnt = local->call_cnt = layout->cnt; } else { cnt = local->call_cnt = 1; @@ -1627,29 +2284,100 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, subvol = layout->list[i].xlator; STACK_WIND (frame, dht_getxattr_cbk, subvol, subvol->fops->getxattr, - loc, key); + loc, key, NULL); } return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (getxattr, frame, -1, op_errno, NULL); + DHT_STACK_UNWIND (getxattr, frame, -1, op_errno, NULL, NULL); + + return 0; +} +#undef DHT_IS_DIR + +int +dht_fgetxattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *key, dict_t *xdata) +{ + xlator_t *subvol = NULL; + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + int op_errno = -1; + int i = 0; + int cnt = 0; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + VALIDATE_OR_GOTO (this->private, err); + + local = dht_local_init (frame, NULL, fd, GF_FOP_FGETXATTR); + if (!local) { + op_errno = ENOMEM; + + goto err; + } + + layout = local->layout; + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "layout is NULL"); + op_errno = ENOENT; + goto err; + } + + if (key) { + local->key = gf_strdup (key); + if (!local->key) { + op_errno = ENOMEM; + goto err; + } + } + + if ((fd->inode->ia_type == IA_IFDIR) + && (strncmp (key, GF_XATTR_LOCKINFO_KEY, + strlen (GF_XATTR_LOCKINFO_KEY) != 0))) { + cnt = local->call_cnt = layout->cnt; + } else { + cnt = local->call_cnt = 1; + } + + for (i = 0; i < cnt; i++) { + subvol = layout->list[i].xlator; + STACK_WIND (frame, dht_getxattr_cbk, + subvol, subvol->fops->fgetxattr, + fd, key, NULL); + } + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND (fgetxattr, frame, -1, op_errno, NULL, NULL); return 0; } int dht_fsetxattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, dict_t *xattr, int flags) + fd_t *fd, dict_t *xattr, int flags, dict_t *xdata) { xlator_t *subvol = NULL; dht_local_t *local = NULL; int op_errno = EINVAL; + dht_conf_t *conf = NULL; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (fd, err); VALIDATE_OR_GOTO (fd->inode, err); + VALIDATE_OR_GOTO (this->private, err); + + conf = this->private; + + GF_IF_INTERNAL_XATTR_GOTO (conf->wild_xattr_name, xattr, + op_errno, err); local = dht_local_init (frame, NULL, fd, GF_FOP_FSETXATTR); if (!local) { @@ -1668,13 +2396,13 @@ dht_fsetxattr (call_frame_t *frame, xlator_t *this, local->call_cnt = 1; STACK_WIND (frame, dht_err_cbk, subvol, subvol->fops->fsetxattr, - fd, xattr, flags); + fd, xattr, flags, NULL); return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (fsetxattr, frame, -1, op_errno); + DHT_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL); return 0; } @@ -1682,16 +2410,18 @@ err: static int dht_common_setxattr_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *xdata) { - DHT_STACK_UNWIND (setxattr, frame, op_ret, op_errno); + DHT_STACK_UNWIND (setxattr, frame, op_ret, op_errno, xdata); return 0; } int dht_checking_pathinfo_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, dict_t *xattr) + int op_ret, int op_errno, dict_t *xattr, + dict_t *xdata) { int i = -1; int ret = -1; @@ -1723,7 +2453,7 @@ dht_checking_pathinfo_cbk (call_frame_t *frame, void *cookie, xlator_t *this, out: this_call_cnt = dht_frame_return (frame); if (is_last_call (this_call_cnt)) { - DHT_STACK_UNWIND (setxattr, frame, local->op_ret, ENOTSUP); + DHT_STACK_UNWIND (setxattr, frame, local->op_ret, ENOTSUP, NULL); } return 0; @@ -1731,7 +2461,7 
@@ out: int dht_setxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *xattr, int flags) + loc_t *loc, dict_t *xattr, int flags, dict_t *xdata) { xlator_t *subvol = NULL; dht_local_t *local = NULL; @@ -1743,16 +2473,19 @@ dht_setxattr (call_frame_t *frame, xlator_t *this, data_t *tmp = NULL; uint32_t dir_spread = 0; char value[4096] = {0,}; - int forced_rebalance = 0; - + gf_dht_migrate_data_type_t forced_rebalance = GF_DHT_MIGRATE_DATA; + int call_cnt = 0; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (loc, err); VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); conf = this->private; + + GF_IF_INTERNAL_XATTR_GOTO (conf->wild_xattr_name, xattr, + op_errno, err); + local = dht_local_init (frame, loc, NULL, GF_FOP_SETXATTR); if (!local) { op_errno = ENOMEM; @@ -1775,9 +2508,11 @@ dht_setxattr (call_frame_t *frame, xlator_t *this, goto err; } + local->call_cnt = call_cnt = layout->cnt; + tmp = dict_get (xattr, "distribute.migrate-data"); if (tmp) { - if (!IA_ISREG (loc->inode->ia_type)) { + if (IA_ISDIR (loc->inode->ia_type)) { op_errno = ENOTSUP; goto err; } @@ -1786,9 +2521,20 @@ dht_setxattr (call_frame_t *frame, xlator_t *this, (ie, 'target' subvolume given there, etc) */ memcpy (value, tmp->data, tmp->len); if (strcmp (value, "force") == 0) - forced_rebalance = 1; + forced_rebalance = + GF_DHT_MIGRATE_DATA_EVEN_IF_LINK_EXISTS; + + if (conf->decommission_in_progress) + forced_rebalance = GF_DHT_MIGRATE_HARDLINK; local->rebalance.target_node = dht_subvol_get_hashed (this, loc); + if (!local->rebalance.target_node) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get " + "hashed subvol for %s", loc->path); + op_errno = EINVAL; + goto err; + } + local->rebalance.from_subvol = local->cached_subvol; if (local->rebalance.target_node == local->rebalance.from_subvol) { @@ -1814,7 +2560,7 @@ dht_setxattr (call_frame_t *frame, xlator_t *this, tmp = dict_get (xattr, "decommission-brick"); if (tmp) { /* This 
operation should happen only on '/' */ - if (__is_root_gfid (loc->inode->gfid) != 0) { + if (!__is_root_gfid (loc->inode->gfid)) { op_errno = ENOTSUP; goto err; } @@ -1828,7 +2574,7 @@ dht_setxattr (call_frame_t *frame, xlator_t *this, STACK_WIND (frame, dht_checking_pathinfo_cbk, conf->subvolumes[i], conf->subvolumes[i]->fops->getxattr, - loc, GF_XATTR_PATHINFO_KEY); + loc, GF_XATTR_PATHINFO_KEY, NULL); } return 0; } @@ -1838,9 +2584,13 @@ dht_setxattr (call_frame_t *frame, xlator_t *this, gf_log (this->name, GF_LOG_INFO, "fixing the layout of %s", loc->path); - dht_fix_directory_layout (frame, dht_common_setxattr_cbk, - layout); - return 0; + ret = dht_fix_directory_layout (frame, dht_common_setxattr_cbk, + layout); + if (ret) { + op_errno = ENOTCONN; + goto err; + } + return ret; } tmp = dict_get (xattr, "distribute.directory-spread-count"); @@ -1852,10 +2602,14 @@ dht_setxattr (call_frame_t *frame, xlator_t *this, (dir_spread > 0))) { layout->spread_cnt = dir_spread; - dht_fix_directory_layout (frame, - dht_common_setxattr_cbk, - layout); - return 0; + ret = dht_fix_directory_layout (frame, + dht_common_setxattr_cbk, + layout); + if (ret) { + op_errno = ENOTCONN; + goto err; + } + return ret; } gf_log (this->name, GF_LOG_ERROR, "wrong 'directory-spread-count' value (%s)", value); @@ -1863,20 +2617,18 @@ dht_setxattr (call_frame_t *frame, xlator_t *this, goto err; } - local->call_cnt = layout->cnt; - - for (i = 0; i < layout->cnt; i++) { + for (i = 0; i < call_cnt; i++) { STACK_WIND (frame, dht_err_cbk, layout->list[i].xlator, layout->list[i].xlator->fops->setxattr, - loc, xattr, flags); + loc, xattr, flags, xdata); } return 0; err: op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (setxattr, frame, -1, op_errno); + DHT_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); return 0; } @@ -1884,7 +2636,7 @@ err: int dht_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno) + int op_ret, int op_errno, dict_t *xdata) { dht_local_t *local = NULL; int this_call_cnt = 0; @@ -1910,7 +2662,8 @@ unlock: this_call_cnt = dht_frame_return (frame); if (is_last_call (this_call_cnt)) { - DHT_STACK_UNWIND (removexattr, frame, local->op_ret, local->op_errno); + DHT_STACK_UNWIND (removexattr, frame, local->op_ret, + local->op_errno, NULL); } return 0; @@ -1919,20 +2672,27 @@ unlock: int dht_removexattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *key) + loc_t *loc, const char *key, dict_t *xdata) { xlator_t *subvol = NULL; int op_errno = -1; dht_local_t *local = NULL; dht_layout_t *layout = NULL; + int call_cnt = 0; + dht_conf_t *conf = NULL; int i; - VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (this->private, err); + + conf = this->private; + + GF_IF_NATIVE_XATTR_GOTO (conf->wild_xattr_name, key, op_errno, err); + + VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (loc, err); VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); local = dht_local_init (frame, loc, NULL, GF_FOP_REMOVEXATTR); if (!local) { @@ -1956,21 +2716,85 @@ dht_removexattr (call_frame_t *frame, xlator_t *this, goto err; } - local->call_cnt = layout->cnt; + local->call_cnt = call_cnt = layout->cnt; local->key = gf_strdup (key); - for (i = 0; i < layout->cnt; i++) { + for (i = 0; i < call_cnt; i++) { STACK_WIND (frame, dht_removexattr_cbk, layout->list[i].xlator, layout->list[i].xlator->fops->removexattr, - loc, key); + loc, key, NULL); } return 0; err: op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (removexattr, frame, -1, op_errno); + DHT_STACK_UNWIND (removexattr, frame, -1, op_errno, NULL); + + return 0; +} + +int +dht_fremovexattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *key, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + int call_cnt = 0; + dht_conf_t *conf = 0; + + int i; + + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (this->private, err); + + conf = this->private; + + GF_IF_NATIVE_XATTR_GOTO (conf->wild_xattr_name, key, op_errno, err); + + VALIDATE_OR_GOTO (frame, err); + + local = dht_local_init (frame, NULL, fd, GF_FOP_FREMOVEXATTR); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + subvol = local->cached_subvol; + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for inode=%s", + uuid_utoa (fd->inode->gfid)); + op_errno = EINVAL; + goto err; + } + + layout = local->layout; + if (!local->layout) { + gf_log (this->name, GF_LOG_DEBUG, + "no layout for inode=%s", uuid_utoa (fd->inode->gfid)); + op_errno = EINVAL; + goto err; + } + + local->call_cnt = call_cnt = layout->cnt; + local->key = gf_strdup (key); + + for (i = 0; i < call_cnt; i++) { + STACK_WIND (frame, dht_removexattr_cbk, + layout->list[i].xlator, + layout->list[i].xlator->fops->fremovexattr, + fd, key, NULL); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND (fremovexattr, frame, -1, op_errno, NULL); return 0; } @@ -1978,7 +2802,7 @@ err: int dht_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, fd_t *fd) + int op_ret, int op_errno, fd_t *fd, dict_t *xdata) { dht_local_t *local = NULL; int this_call_cnt = 0; @@ -2005,7 +2829,7 @@ unlock: this_call_cnt = dht_frame_return (frame); if (is_last_call (this_call_cnt)) DHT_STACK_UNWIND (open, frame, local->op_ret, local->op_errno, - local->fd); + local->fd, NULL); return 0; } @@ -2035,7 +2859,7 @@ dht_normalize_stats (struct statvfs *buf, unsigned long bsize, int dht_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct statvfs *statvfs) + int op_ret, int op_errno, struct statvfs *statvfs, dict_t *xdata) { dht_local_t *local = NULL; int this_call_cnt = 0; @@ -2081,14 +2905,14 @@ unlock: this_call_cnt = dht_frame_return (frame); if (is_last_call (this_call_cnt)) DHT_STACK_UNWIND (statfs, frame, local->op_ret, local->op_errno, - &local->statvfs); + &local->statvfs, xdata); return 0; } int -dht_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc) +dht_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { xlator_t *subvol = NULL; dht_local_t *local = NULL; @@ -2100,7 +2924,6 @@ dht_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc) VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (loc, err); VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); VALIDATE_OR_GOTO (this->private, err); conf = this->private; @@ -2117,7 +2940,8 @@ dht_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc) for (i = 0; i < conf->subvolume_cnt; i++) { STACK_WIND (frame, dht_statfs_cbk, conf->subvolumes[i], - conf->subvolumes[i]->fops->statfs, loc); + conf->subvolumes[i]->fops->statfs, loc, + xdata); } return 0; } @@ -2133,20 +2957,21 @@ dht_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc) local->call_cnt = 1; STACK_WIND (frame, 
dht_statfs_cbk, - subvol, subvol->fops->statfs, loc); + subvol, subvol->fops->statfs, loc, xdata); return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (statfs, frame, -1, op_errno, NULL); + DHT_STACK_UNWIND (statfs, frame, -1, op_errno, NULL, NULL); return 0; } int -dht_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd) +dht_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + dict_t *xdata) { dht_local_t *local = NULL; dht_conf_t *conf = NULL; @@ -2173,14 +2998,14 @@ dht_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd) STACK_WIND (frame, dht_fd_cbk, conf->subvolumes[i], conf->subvolumes[i]->fops->opendir, - loc, fd); + loc, fd, xdata); } return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (opendir, frame, -1, op_errno, NULL); + DHT_STACK_UNWIND (opendir, frame, -1, op_errno, NULL, NULL); return 0; } @@ -2188,7 +3013,7 @@ err: int dht_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, - int op_errno, gf_dirent_t *orig_entries) + int op_errno, gf_dirent_t *orig_entries, dict_t *xdata) { dht_local_t *local = NULL; gf_dirent_t entries; @@ -2201,6 +3026,7 @@ dht_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, dht_layout_t *layout = 0; dht_conf_t *conf = NULL; xlator_t *subvol = 0; + int ret = 0; INIT_LIST_HEAD (&entries.list); prev = cookie; @@ -2217,10 +3043,13 @@ dht_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, list_for_each_entry (orig_entry, (&orig_entries->list), list) { next_offset = orig_entry->d_off; - - if (check_is_linkfile_wo_dict (NULL, (&orig_entry->d_stat)) - || (check_is_dir (NULL, (&orig_entry->d_stat), NULL) - && (prev->this != dht_first_up_subvol (this)))) { + if (check_is_dir (NULL, (&orig_entry->d_stat), NULL) && + (prev->this != local->first_up_subvol)) { + continue; + } + if (check_is_linkfile (NULL, (&orig_entry->d_stat), + orig_entry->dict, + 
conf->link_xattr_name)) { continue; } @@ -2236,7 +3065,7 @@ dht_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, orig_entry->d_name); if (!subvol || (subvol != prev->this)) { /* TODO: Count the number of entries which need - linkfile to prove its existance in fs */ + linkfile to prove its existence in fs */ layout->search_unhashed++; } } @@ -2249,6 +3078,24 @@ dht_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, entry->d_type = orig_entry->d_type; entry->d_len = orig_entry->d_len; + if (orig_entry->dict) + entry->dict = dict_ref (orig_entry->dict); + + /* making sure we set the inode ctx right with layout, + currently possible only for non-directories, so for + directories don't set entry inodes */ + if (!IA_ISDIR(entry->d_stat.ia_type)) { + ret = dht_layout_preset (this, prev->this, + orig_entry->inode); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "failed to link the layout in inode"); + entry->inode = inode_ref (orig_entry->inode); + } else if (orig_entry->inode) { + dht_inode_ctx_time_update (orig_entry->inode, this, + &entry->d_stat, 1); + } + list_add_tail (&entry->list, &entries.list); count++; } @@ -2278,9 +3125,23 @@ done: goto unwind; } + if (conf->readdir_optimize == _gf_true) { + if (next_subvol != local->first_up_subvol) { + ret = dict_set_int32 (local->xattr, + GF_READDIR_SKIP_DIRS, 1); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "dict set failed"); + } else { + dict_del (local->xattr, + GF_READDIR_SKIP_DIRS); + } + } + STACK_WIND (frame, dht_readdirp_cbk, next_subvol, next_subvol->fops->readdirp, - local->fd, local->size, next_offset); + local->fd, local->size, next_offset, + local->xattr); return 0; } @@ -2288,7 +3149,7 @@ unwind: if (op_ret < 0) op_ret = 0; - DHT_STACK_UNWIND (readdirp, frame, op_ret, op_errno, &entries); + DHT_STACK_UNWIND (readdirp, frame, op_ret, op_errno, &entries, NULL); gf_dirent_free (&entries); @@ -2299,7 +3160,8 @@ unwind: int dht_readdir_cbk (call_frame_t 
*frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, gf_dirent_t *orig_entries) + int op_ret, int op_errno, gf_dirent_t *orig_entries, + dict_t *xdata) { dht_local_t *local = NULL; gf_dirent_t entries; @@ -2376,7 +3238,7 @@ done: STACK_WIND (frame, dht_readdir_cbk, next_subvol, next_subvol->fops->readdir, - local->fd, local->size, next_offset); + local->fd, local->size, next_offset, NULL); return 0; } @@ -2384,7 +3246,7 @@ unwind: if (op_ret < 0) op_ret = 0; - DHT_STACK_UNWIND (readdir, frame, op_ret, op_errno, &entries); + DHT_STACK_UNWIND (readdir, frame, op_ret, op_errno, &entries, NULL); gf_dirent_free (&entries); @@ -2394,17 +3256,21 @@ unwind: int dht_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t yoff, int whichop) + off_t yoff, int whichop, dict_t *dict) { dht_local_t *local = NULL; int op_errno = -1; xlator_t *xvol = NULL; off_t xoff = 0; - + int ret = 0; + dht_conf_t *conf = NULL; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (this->private, err); + + conf = this->private; local = dht_local_init (frame, NULL, NULL, whichop); if (!local) { @@ -2414,22 +3280,52 @@ dht_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, local->fd = fd_ref (fd); local->size = size; + local->xattr_req = (dict)? 
dict_ref (dict) : NULL; + local->first_up_subvol = dht_first_up_subvol (this); dht_deitransform (this, yoff, &xvol, (uint64_t *)&xoff); /* TODO: do proper readdir */ - if (whichop == GF_FOP_READDIR) - STACK_WIND (frame, dht_readdir_cbk, xvol, xvol->fops->readdir, - fd, size, xoff); - else + if (whichop == GF_FOP_READDIRP) { + if (dict) + local->xattr = dict_ref (dict); + else + local->xattr = dict_new (); + + if (local->xattr) { + ret = dict_set_uint32 (local->xattr, + conf->link_xattr_name, 256); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "failed to set '%s' key", + conf->link_xattr_name); + if (conf->readdir_optimize == _gf_true) { + if (xvol != local->first_up_subvol) { + ret = dict_set_int32 (local->xattr, + GF_READDIR_SKIP_DIRS, 1); + if (ret) + gf_log (this->name, + GF_LOG_ERROR, + "Dict set failed"); + } else { + dict_del (local->xattr, + GF_READDIR_SKIP_DIRS); + } + } + } + STACK_WIND (frame, dht_readdirp_cbk, xvol, xvol->fops->readdirp, - fd, size, xoff); + fd, size, xoff, local->xattr); + } else { + STACK_WIND (frame, dht_readdir_cbk, xvol, xvol->fops->readdir, + fd, size, xoff, local->xattr); + } return 0; err: op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (readdir, frame, -1, op_errno, NULL); + DHT_STACK_UNWIND (readdir, frame, -1, op_errno, NULL, NULL); return 0; } @@ -2437,7 +3333,7 @@ err: int dht_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t yoff) + off_t yoff, dict_t *xdata) { int op = GF_FOP_READDIR; dht_conf_t *conf = NULL; @@ -2458,15 +3354,15 @@ dht_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, op = GF_FOP_READDIRP; out: - dht_do_readdir (frame, this, fd, size, yoff, op); + dht_do_readdir (frame, this, fd, size, yoff, op, 0); return 0; } int dht_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t yoff) + off_t yoff, dict_t *dict) { - dht_do_readdir (frame, this, fd, size, yoff, GF_FOP_READDIRP); + dht_do_readdir (frame, this, fd, size, yoff, GF_FOP_READDIRP, dict); return 0; } @@ -2474,7 +3370,7 @@ dht_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, int dht_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno) + int op_ret, int op_errno, dict_t *xdata) { dht_local_t *local = NULL; int this_call_cnt = 0; @@ -2494,14 +3390,16 @@ dht_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, this_call_cnt = dht_frame_return (frame); if (is_last_call (this_call_cnt)) - DHT_STACK_UNWIND (fsyncdir, frame, local->op_ret, local->op_errno); + DHT_STACK_UNWIND (fsyncdir, frame, local->op_ret, + local->op_errno, xdata); return 0; } int -dht_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync) +dht_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, + int datasync, dict_t *xdata) { dht_local_t *local = NULL; dht_conf_t *conf = NULL; @@ -2528,14 +3426,14 @@ dht_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync) STACK_WIND (frame, dht_fsyncdir_cbk, conf->subvolumes[i], conf->subvolumes[i]->fops->fsyncdir, - fd, datasync); + fd, datasync, xdata); } return 0; err: op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (fsyncdir, frame, -1, op_errno); + DHT_STACK_UNWIND (fsyncdir, frame, -1, op_errno, NULL); return 0; } @@ -2545,9 +3443,9 @@ int dht_newfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { - call_frame_t *prev = NULL; + xlator_t *prev = NULL; int ret = -1; dht_local_t *local = NULL; @@ -2565,19 +3463,24 @@ dht_newfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, prev = cookie; if (local->loc.parent) { - WIPE (preparent); - WIPE (postparent); + + dht_inode_ctx_time_update (local->loc.parent, this, + preparent, 0); + dht_inode_ctx_time_update (local->loc.parent, this, + postparent, 1); } - ret = dht_layout_preset (this, prev->this, inode); + ret = dht_layout_preset (this, prev, inode); if (ret < 0) { gf_log (this->name, GF_LOG_DEBUG, "could not set pre-set layout for subvolume %s", - prev->this->name); + prev? prev->name: NULL); op_ret = -1; op_errno = EINVAL; goto out; } + if (local->linked == _gf_true) + dht_linkfile_attr_heal (frame, this); out: /* * FIXME: ia_size and st_blocks of preparent and postparent do not have @@ -2586,9 +3489,9 @@ out: * corresponding values from each of the subvolume. * See dht_iatt_merge for reference. 
*/ - - DHT_STACK_UNWIND (mknod, frame, op_ret, op_errno, inode, stbuf, preparent, - postparent); + DHT_STRIP_PHASE1_FLAGS (stbuf); + DHT_STACK_UNWIND (mknod, frame, op_ret, op_errno, inode, stbuf, + preparent, postparent, xdata); return 0; } @@ -2597,7 +3500,8 @@ dht_mknod_linkfile_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent) + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { dht_local_t *local = NULL; xlator_t *cached_subvol = NULL; @@ -2606,22 +3510,28 @@ dht_mknod_linkfile_create_cbk (call_frame_t *frame, void *cookie, goto err; local = frame->local; + if (!local || !local->cached_subvol) { + op_errno = EINVAL; + goto err; + } + cached_subvol = local->cached_subvol; - STACK_WIND (frame, dht_newfile_cbk, - cached_subvol, cached_subvol->fops->mknod, - &local->loc, local->mode, local->rdev, - local->params); + STACK_WIND_COOKIE (frame, dht_newfile_cbk, (void *)cached_subvol, + cached_subvol, cached_subvol->fops->mknod, + &local->loc, local->mode, local->rdev, local->umask, + local->params); return 0; err: - DHT_STACK_UNWIND (mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL); + DHT_STACK_UNWIND (mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL); return 0; } int dht_mknod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dev_t rdev, dict_t *params) + loc_t *loc, mode_t mode, dev_t rdev, mode_t umask, dict_t *params) { xlator_t *subvol = NULL; int op_errno = -1; @@ -2653,11 +3563,13 @@ dht_mknod (call_frame_t *frame, xlator_t *this, gf_log (this->name, GF_LOG_TRACE, "creating %s on %s", loc->path, subvol->name); - STACK_WIND (frame, dht_newfile_cbk, - subvol, subvol->fops->mknod, - loc, mode, rdev, params); + STACK_WIND_COOKIE (frame, dht_newfile_cbk, (void *)subvol, + subvol, subvol->fops->mknod, loc, mode, + rdev, umask, params); } else { - avail_subvol = 
dht_free_disk_available_subvol (this, subvol); + + avail_subvol = dht_free_disk_available_subvol (this, subvol, + local); if (avail_subvol != subvol) { /* Choose the minimum filled volume, and create the files there */ @@ -2666,17 +3578,18 @@ dht_mknod (call_frame_t *frame, xlator_t *this, local->cached_subvol = avail_subvol; local->mode = mode; local->rdev = rdev; - + local->umask = umask; dht_linkfile_create (frame, dht_mknod_linkfile_create_cbk, - avail_subvol, subvol, loc); + this, avail_subvol, subvol, loc); } else { gf_log (this->name, GF_LOG_TRACE, "creating %s on %s", loc->path, subvol->name); - STACK_WIND (frame, dht_newfile_cbk, - subvol, subvol->fops->mknod, - loc, mode, rdev, params); + STACK_WIND_COOKIE (frame, dht_newfile_cbk, + (void *)subvol, subvol, + subvol->fops->mknod, loc, mode, + rdev, umask, params); } } @@ -2685,7 +3598,7 @@ dht_mknod (call_frame_t *frame, xlator_t *this, err: op_errno = (op_errno == -1) ? errno : op_errno; DHT_STACK_UNWIND (mknod, frame, -1, op_errno, - NULL, NULL, NULL, NULL); + NULL, NULL, NULL, NULL, NULL); return 0; } @@ -2693,7 +3606,7 @@ err: int dht_symlink (call_frame_t *frame, xlator_t *this, - const char *linkname, loc_t *loc, dict_t *params) + const char *linkname, loc_t *loc, mode_t umask, dict_t *params) { xlator_t *subvol = NULL; int op_errno = -1; @@ -2721,23 +3634,24 @@ dht_symlink (call_frame_t *frame, xlator_t *this, gf_log (this->name, GF_LOG_TRACE, "creating %s on %s", loc->path, subvol->name); - STACK_WIND (frame, dht_newfile_cbk, - subvol, subvol->fops->symlink, - linkname, loc, params); + STACK_WIND_COOKIE (frame, dht_newfile_cbk, (void *)subvol, subvol, + subvol->fops->symlink, linkname, loc, umask, + params); return 0; err: op_errno = (op_errno == -1) ? 
errno : op_errno; DHT_STACK_UNWIND (link, frame, -1, op_errno, - NULL, NULL, NULL, NULL); + NULL, NULL, NULL, NULL, NULL); return 0; } int -dht_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc) +dht_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, + dict_t *xdata) { xlator_t *cached_subvol = NULL; xlator_t *hashed_subvol = NULL; @@ -2755,7 +3669,7 @@ dht_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc) local->loc.path, cached_subvol->name, loc->path); STACK_WIND (frame, dht_unlink_cbk, cached_subvol, cached_subvol->fops->unlink, - &local->loc); + &local->loc, xflag, xdata); goto done; } @@ -2783,18 +3697,21 @@ dht_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc) goto err; } + local->flags = xflag; if (hashed_subvol != cached_subvol) { STACK_WIND (frame, dht_unlink_linkfile_cbk, - hashed_subvol, hashed_subvol->fops->unlink, loc); + hashed_subvol, hashed_subvol->fops->unlink, loc, + xflag, xdata); } else { STACK_WIND (frame, dht_unlink_cbk, - cached_subvol, cached_subvol->fops->unlink, loc); + cached_subvol, cached_subvol->fops->unlink, loc, + xflag, xdata); } done: return 0; err: op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL); + DHT_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL, NULL); return 0; } @@ -2804,13 +3721,16 @@ int dht_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { call_frame_t *prev = NULL; dht_layout_t *layout = NULL; + dht_local_t *local = NULL; prev = cookie; + local = frame->local; + if (op_ret == -1) goto out; @@ -2824,12 +3744,20 @@ dht_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto out; } - WIPE (preparent); - WIPE (postparent); - + if (local->loc.parent) { + dht_inode_ctx_time_update (local->loc.parent, this, + preparent, 0); + dht_inode_ctx_time_update (local->loc.parent, this, + postparent, 1); + } + if (local->linked == _gf_true) { + local->stbuf = *stbuf; + dht_linkfile_attr_heal (frame, this); + } out: + DHT_STRIP_PHASE1_FLAGS (stbuf); DHT_STACK_UNWIND (link, frame, op_ret, op_errno, inode, stbuf, preparent, - postparent); + postparent, NULL); return 0; } @@ -2839,7 +3767,8 @@ int dht_link_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent) + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { dht_local_t *local = NULL; xlator_t *srcvol = NULL; @@ -2851,13 +3780,14 @@ dht_link_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, srcvol = local->linkfile.srcvol; STACK_WIND (frame, dht_link_cbk, srcvol, srcvol->fops->link, - &local->loc, &local->loc2); + &local->loc, &local->loc2, xdata); return 0; err: + DHT_STRIP_PHASE1_FLAGS (stbuf); DHT_STACK_UNWIND (link, frame, op_ret, op_errno, inode, stbuf, preparent, - postparent); + postparent, NULL); return 0; } @@ -2865,7 +3795,7 @@ err: int dht_link (call_frame_t *frame, xlator_t *this, - loc_t 
*oldloc, loc_t *newloc) + loc_t *oldloc, loc_t *newloc, dict_t *xdata) { xlator_t *cached_subvol = NULL; xlator_t *hashed_subvol = NULL; @@ -2910,19 +3840,19 @@ dht_link (call_frame_t *frame, xlator_t *this, if (hashed_subvol != cached_subvol) { uuid_copy (local->gfid, oldloc->inode->gfid); - dht_linkfile_create (frame, dht_link_linkfile_cbk, + dht_linkfile_create (frame, dht_link_linkfile_cbk, this, cached_subvol, hashed_subvol, newloc); } else { STACK_WIND (frame, dht_link_cbk, cached_subvol, cached_subvol->fops->link, - oldloc, newloc); + oldloc, newloc, xdata); } return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (link, frame, -1, op_errno, NULL, NULL, NULL, NULL); + DHT_STACK_UNWIND (link, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); return 0; } @@ -2932,7 +3862,7 @@ int dht_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, fd_t *fd, inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent) + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { call_frame_t *prev = NULL; int ret = -1; @@ -2951,8 +3881,11 @@ dht_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, prev = cookie; if (local->loc.parent) { - WIPE (preparent); - WIPE (postparent); + dht_inode_ctx_time_update (local->loc.parent, this, + preparent, 0); + + dht_inode_ctx_time_update (local->loc.parent, this, + postparent, 1); } ret = dht_layout_preset (this, prev->this, inode); @@ -2964,10 +3897,14 @@ dht_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, op_errno = EINVAL; goto out; } - + if (local->linked == _gf_true) { + local->stbuf = *stbuf; + dht_linkfile_attr_heal (frame, this); + } out: + DHT_STRIP_PHASE1_FLAGS (stbuf); DHT_STACK_UNWIND (create, frame, op_ret, op_errno, fd, inode, stbuf, preparent, - postparent); + postparent, NULL); return 0; } @@ -2977,7 +3914,8 @@ dht_create_linkfile_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, 
int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent) + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { dht_local_t *local = NULL; xlator_t *cached_subvol = NULL; @@ -2991,18 +3929,19 @@ dht_create_linkfile_create_cbk (call_frame_t *frame, void *cookie, STACK_WIND (frame, dht_create_cbk, cached_subvol, cached_subvol->fops->create, &local->loc, local->flags, local->mode, - local->fd, local->params); + local->umask, local->fd, local->params); return 0; err: - DHT_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); + DHT_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL, + NULL, NULL, NULL); return 0; } int dht_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, mode_t mode, - fd_t *fd, dict_t *params) + mode_t umask, fd_t *fd, dict_t *params) { int op_errno = -1; xlator_t *subvol = NULL; @@ -3028,7 +3967,7 @@ dht_create (call_frame_t *frame, xlator_t *this, local->loc.path, subvol->name, loc->path); STACK_WIND (frame, dht_create_cbk, subvol, subvol->fops->create, - &local->loc, flags, mode, fd, params); + &local->loc, flags, mode, umask, fd, params); goto done; } @@ -3046,38 +3985,38 @@ dht_create (call_frame_t *frame, xlator_t *this, "creating %s on %s", loc->path, subvol->name); STACK_WIND (frame, dht_create_cbk, subvol, subvol->fops->create, - loc, flags, mode, fd, params); + loc, flags, mode, umask, fd, params); goto done; } /* Choose the minimum filled volume, and create the files there */ - avail_subvol = dht_free_disk_available_subvol (this, subvol); + avail_subvol = dht_free_disk_available_subvol (this, subvol, local); if (avail_subvol != subvol) { local->params = dict_ref (params); local->flags = flags; local->mode = mode; - + local->umask = umask; local->cached_subvol = avail_subvol; local->hashed_subvol = subvol; gf_log (this->name, GF_LOG_TRACE, "creating %s on %s (link at %s)", loc->path, avail_subvol->name, 
subvol->name); - dht_linkfile_create (frame, - dht_create_linkfile_create_cbk, - avail_subvol, subvol, loc); + dht_linkfile_create (frame, dht_create_linkfile_create_cbk, + this, avail_subvol, subvol, loc); goto done; } gf_log (this->name, GF_LOG_TRACE, "creating %s on %s", loc->path, subvol->name); STACK_WIND (frame, dht_create_cbk, subvol, subvol->fops->create, - loc, flags, mode, fd, params); + loc, flags, mode, umask, fd, params); done: return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); + DHT_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL, + NULL, NULL, NULL); return 0; } @@ -3086,7 +4025,7 @@ err: int dht_mkdir_selfheal_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { dht_local_t *local = NULL; dht_layout_t *layout = NULL; @@ -3097,14 +4036,17 @@ dht_mkdir_selfheal_cbk (call_frame_t *frame, void *cookie, if (op_ret == 0) { dht_layout_set (this, local->inode, layout); if (local->loc.parent) { - WIPE (&local->preparent); - WIPE (&local->postparent); + dht_inode_ctx_time_update (local->loc.parent, this, + &local->preparent, 0); + + dht_inode_ctx_time_update (local->loc.parent, this, + &local->postparent, 1); } } DHT_STACK_UNWIND (mkdir, frame, op_ret, op_errno, local->inode, &local->stbuf, &local->preparent, - &local->postparent); + &local->postparent, NULL); return 0; } @@ -3112,12 +4054,12 @@ dht_mkdir_selfheal_cbk (call_frame_t *frame, void *cookie, int dht_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent) + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { dht_local_t *local = NULL; int this_call_cnt = 0; int ret = -1; - int subvol_filled = 0; + gf_boolean_t subvol_filled = _gf_false; call_frame_t *prev = NULL; dht_layout_t 
*layout = NULL; @@ -3133,9 +4075,21 @@ dht_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, ret = dht_layout_merge (this, layout, prev->this, -1, ENOSPC, NULL); } else { + if (op_ret == -1 && op_errno == EEXIST) + /* Very likely just a race between mkdir and + self-heal (from lookup of a concurrent mkdir + attempt). + Ignore error for now. layout setting will + anyways fail if this was a different (old) + pre-existing different directory. + */ + op_ret = 0; ret = dht_layout_merge (this, layout, prev->this, op_ret, op_errno, NULL); } + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to merge layouts", local->loc.path); if (op_ret == -1) { local->op_errno = op_errno; @@ -3162,7 +4116,8 @@ int dht_mkdir_hashed_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent) + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { dht_local_t *local = NULL; int ret = -1; @@ -3190,6 +4145,12 @@ dht_mkdir_hashed_cbk (call_frame_t *frame, void *cookie, ret = dht_layout_merge (this, layout, prev->this, op_ret, op_errno, NULL); + /* TODO: we may have to return from the function + if layout merge fails. 
For now, lets just log an error */ + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to merge layouts", local->loc.path); + if (op_ret == -1) { local->op_errno = op_errno; goto err; @@ -3202,6 +4163,8 @@ dht_mkdir_hashed_cbk (call_frame_t *frame, void *cookie, local->call_cnt = conf->subvolume_cnt - 1; + if (uuid_is_null (local->loc.gfid)) + uuid_copy (local->loc.gfid, stbuf->ia_gfid); if (local->call_cnt == 0) { dht_selfheal_directory (frame, dht_mkdir_selfheal_cbk, &local->loc, layout); @@ -3211,19 +4174,20 @@ dht_mkdir_hashed_cbk (call_frame_t *frame, void *cookie, continue; STACK_WIND (frame, dht_mkdir_cbk, conf->subvolumes[i], - conf->subvolumes[i]->fops->mkdir, - &local->loc, local->mode, local->params); + conf->subvolumes[i]->fops->mkdir, &local->loc, + local->mode, local->umask, local->params); } return 0; err: - DHT_STACK_UNWIND (mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL); + DHT_STACK_UNWIND (mkdir, frame, -1, op_errno, NULL, NULL, NULL, + NULL, NULL); return 0; } -int + int dht_mkdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dict_t *params) + loc_t *loc, mode_t mode, mode_t umask, dict_t *params) { dht_local_t *local = NULL; dht_conf_t *conf = NULL; @@ -3259,6 +4223,7 @@ dht_mkdir (call_frame_t *frame, xlator_t *this, local->hashed_subvol = hashed_subvol; local->mode = mode; + local->umask = umask; local->params = dict_ref (params); local->inode = inode_ref (loc->inode); @@ -3271,13 +4236,14 @@ dht_mkdir (call_frame_t *frame, xlator_t *this, STACK_WIND (frame, dht_mkdir_hashed_cbk, hashed_subvol, hashed_subvol->fops->mkdir, - loc, mode, params); + loc, mode, umask, params); return 0; err: op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL); + DHT_STACK_UNWIND (mkdir, frame, -1, op_errno, NULL, NULL, NULL, + NULL, NULL); return 0; } @@ -3285,14 +4251,87 @@ err: int dht_rmdir_selfheal_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno) + int op_ret, int op_errno, dict_t *xdata) { dht_local_t *local = NULL; local = frame->local; DHT_STACK_UNWIND (rmdir, frame, local->op_ret, local->op_errno, - &local->preparent, &local->postparent); + &local->preparent, &local->postparent, NULL); + + return 0; +} + + +int +dht_rmdir_hashed_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + call_frame_t *prev = NULL; + + local = frame->local; + prev = cookie; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + local->op_ret = -1; + if (op_errno != ENOENT && op_errno != EACCES) { + local->need_selfheal = 1; + } + + + gf_log (this->name, GF_LOG_DEBUG, + "rmdir on %s for %s failed (%s)", + prev->this->name, local->loc.path, + strerror (op_errno)); + goto unlock; + } + + dht_iatt_merge (this, &local->preparent, preparent, prev->this); + dht_iatt_merge (this, &local->postparent, postparent, + prev->this); + + } +unlock: + UNLOCK (&frame->lock); + + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) { + if (local->need_selfheal) { + local->layout = + dht_layout_get (this, local->loc.inode); + + /* TODO: neater interface needed below */ + local->stbuf.ia_type = local->loc.inode->ia_type; + + uuid_copy (local->gfid, local->loc.inode->gfid); + dht_selfheal_restore (frame, dht_rmdir_selfheal_cbk, + &local->loc, local->layout); + } else { + + if (local->loc.parent) { + dht_inode_ctx_time_update (local->loc.parent, + this, + &local->preparent, + 0); + + dht_inode_ctx_time_update 
(local->loc.parent, + this, + &local->postparent, + 1); + } + + DHT_STACK_UNWIND (rmdir, frame, local->op_ret, + local->op_errno, &local->preparent, + &local->postparent, NULL); + } + } return 0; } @@ -3301,11 +4340,12 @@ dht_rmdir_selfheal_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int dht_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { dht_local_t *local = NULL; int this_call_cnt = 0; call_frame_t *prev = NULL; + int done = 0; local = frame->local; prev = cookie; @@ -3316,8 +4356,9 @@ dht_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->op_errno = op_errno; local->op_ret = -1; - if (op_errno != ENOENT) + if (op_errno != ENOENT && op_errno != EACCES) { local->need_selfheal = 1; + } gf_log (this->name, GF_LOG_DEBUG, "rmdir on %s for %s failed (%s)", @@ -3326,6 +4367,8 @@ dht_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto unlock; } + /* Track if rmdir succeeded on atleast one subvol*/ + local->fop_succeeded = 1; dht_iatt_merge (this, &local->preparent, preparent, prev->this); dht_iatt_merge (this, &local->postparent, postparent, prev->this); @@ -3335,8 +4378,17 @@ unlock: this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) { - if (local->need_selfheal) { + + /* if local->hashed_subvol, we are yet to wind to hashed_subvol. 
*/ + if (local->hashed_subvol && (this_call_cnt == 1)) { + done = 1; + } else if (!local->hashed_subvol && !this_call_cnt) { + done = 1; + } + + + if (done) { + if (local->need_selfheal && local->fop_succeeded) { local->layout = dht_layout_get (this, local->loc.inode); @@ -3346,15 +4398,34 @@ unlock: uuid_copy (local->gfid, local->loc.inode->gfid); dht_selfheal_restore (frame, dht_rmdir_selfheal_cbk, &local->loc, local->layout); - } else { + } else if (this_call_cnt) { + /* If non-hashed subvol's have responded, proceed */ + + local->need_selfheal = 0; + STACK_WIND (frame, dht_rmdir_hashed_subvol_cbk, + local->hashed_subvol, + local->hashed_subvol->fops->rmdir, + &local->loc, local->flags, NULL); + } else if (!this_call_cnt) { + /* All subvol's have responded, proceed */ + if (local->loc.parent) { - WIPE (&local->preparent); - WIPE (&local->postparent); + + dht_inode_ctx_time_update (local->loc.parent, + this, + &local->preparent, + 0); + + dht_inode_ctx_time_update (local->loc.parent, + this, + &local->postparent, + 1); + } DHT_STACK_UNWIND (rmdir, frame, local->op_ret, local->op_errno, &local->preparent, - &local->postparent); + &local->postparent, NULL); } } @@ -3368,6 +4439,7 @@ dht_rmdir_do (call_frame_t *frame, xlator_t *this) dht_local_t *local = NULL; dht_conf_t *conf = NULL; int i = 0; + xlator_t *hashed_subvol = NULL; VALIDATE_OR_GOTO (this->private, err); @@ -3379,18 +4451,41 @@ dht_rmdir_do (call_frame_t *frame, xlator_t *this) local->call_cnt = conf->subvolume_cnt; + /* first remove from non-hashed_subvol */ + hashed_subvol = dht_subvol_get_hashed (this, &local->loc); + + if (!hashed_subvol) { + gf_log (this->name, GF_LOG_WARNING, "failed to get hashed " + "subvol for %s",local->loc.path); + } else { + local->hashed_subvol = hashed_subvol; + } + + /* When DHT has only 1 child */ + if (conf->subvolume_cnt == 1) { + STACK_WIND (frame, dht_rmdir_hashed_subvol_cbk, + conf->subvolumes[0], + conf->subvolumes[0]->fops->rmdir, + &local->loc, local->flags, 
NULL); + return 0; + } + for (i = 0; i < conf->subvolume_cnt; i++) { + if (hashed_subvol && + (hashed_subvol == conf->subvolumes[i])) + continue; + STACK_WIND (frame, dht_rmdir_cbk, conf->subvolumes[i], conf->subvolumes[i]->fops->rmdir, - &local->loc, local->flags); + &local->loc, local->flags, NULL); } return 0; err: DHT_STACK_UNWIND (rmdir, frame, local->op_ret, local->op_errno, - &local->preparent, &local->postparent); + &local->preparent, &local->postparent, NULL); return 0; } @@ -3398,7 +4493,7 @@ err: int dht_rmdir_linkfile_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { dht_local_t *local = NULL; call_frame_t *prev = NULL; @@ -3446,6 +4541,7 @@ dht_rmdir_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, call_frame_t *main_frame = NULL; dht_local_t *main_local = NULL; int this_call_cnt = 0; + dht_conf_t *conf = this->private; local = frame->local; prev = cookie; @@ -3457,7 +4553,7 @@ dht_rmdir_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (op_ret != 0) goto err; - if (check_is_linkfile (inode, stbuf, xattr) == 0) { + if (!check_is_linkfile (inode, stbuf, xattr, conf->link_xattr_name)) { main_local->op_ret = -1; main_local->op_errno = ENOTEMPTY; @@ -3468,7 +4564,7 @@ dht_rmdir_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } STACK_WIND (frame, dht_rmdir_linkfile_unlink_cbk, - src, src->fops->unlink, &local->loc); + src, src->fops->unlink, &local->loc, 0, NULL); return 0; err: @@ -3491,6 +4587,8 @@ dht_rmdir_is_subvol_empty (call_frame_t *frame, xlator_t *this, call_frame_t *lookup_frame = NULL; dht_local_t *lookup_local = NULL; dht_local_t *local = NULL; + dict_t *xattrs = NULL; + dht_conf_t *conf = this->private; local = frame->local; @@ -3499,7 +4597,8 @@ dht_rmdir_is_subvol_empty (call_frame_t *frame, xlator_t *this, continue; if (strcmp (trav->d_name, "..") == 0) continue; - 
if (check_is_linkfile (NULL, (&trav->d_stat), NULL) == 1) { + if (check_is_linkfile (NULL, (&trav->d_stat), trav->dict, + conf->link_xattr_name)) { ret++; continue; } @@ -3511,6 +4610,21 @@ dht_rmdir_is_subvol_empty (call_frame_t *frame, xlator_t *this, return 0; } + xattrs = dict_new (); + if (!xattrs) { + gf_log (this->name, GF_LOG_ERROR, "dict_new failed"); + return -1; + } + + ret = dict_set_uint32 (xattrs, conf->link_xattr_name, 256); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "failed to set linkto key" + " in dict"); + if (xattrs) + dict_unref (xattrs); + return -1; + } + list_for_each_entry (trav, &entries->list, list) { if (strcmp (trav->d_name, ".") == 0) continue; @@ -3527,8 +4641,7 @@ dht_rmdir_is_subvol_empty (call_frame_t *frame, xlator_t *this, goto err; } - lookup_local = GF_CALLOC (sizeof (*local), 1, - gf_dht_mt_dht_local_t); + lookup_local = mem_get0 (this->local_pool); if (!lookup_local) { goto err; } @@ -3541,6 +4654,8 @@ dht_rmdir_is_subvol_empty (call_frame_t *frame, xlator_t *this, if (build_ret != 0) goto err; + uuid_copy (lookup_local->loc.gfid, trav->d_stat.ia_gfid); + gf_log (this->name, GF_LOG_TRACE, "looking up %s on %s", lookup_local->loc.path, src->name); @@ -3553,12 +4668,18 @@ dht_rmdir_is_subvol_empty (call_frame_t *frame, xlator_t *this, STACK_WIND (lookup_frame, dht_rmdir_lookup_cbk, src, src->fops->lookup, - &lookup_local->loc, NULL); + &lookup_local->loc, xattrs); ret++; } + if (xattrs) + dict_unref (xattrs); + return ret; err: + if (xattrs) + dict_unref (xattrs); + DHT_STACK_DESTROY (lookup_frame); return 0; } @@ -3566,7 +4687,8 @@ err: int dht_rmdir_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, gf_dirent_t *entries) + int op_ret, int op_errno, gf_dirent_t *entries, + dict_t *xdata) { dht_local_t *local = NULL; int this_call_cnt = -1; @@ -3610,12 +4732,14 @@ dht_rmdir_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int dht_rmdir_opendir_cbk (call_frame_t 
*frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, fd_t *fd) + int op_ret, int op_errno, fd_t *fd, dict_t *xdata) { dht_local_t *local = NULL; int this_call_cnt = -1; call_frame_t *prev = NULL; - + dict_t *dict = NULL; + int ret = 0; + dht_conf_t *conf = this->private; local = frame->local; prev = cookie; @@ -3625,14 +4749,32 @@ dht_rmdir_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, "opendir on %s for %s failed (%s)", prev->this->name, local->loc.path, strerror (op_errno)); + if (op_errno != ENOENT) { + local->op_ret = -1; + local->op_errno = op_errno; + } + goto err; + } + + dict = dict_new (); + if (!dict) { local->op_ret = -1; - local->op_errno = op_errno; + local->op_errno = ENOMEM; goto err; } + ret = dict_set_uint32 (dict, conf->link_xattr_name, 256); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to set '%s' key", + local->loc.path, conf->link_xattr_name); + STACK_WIND (frame, dht_rmdir_readdirp_cbk, prev->this, prev->this->fops->readdirp, - local->fd, 4096, 0); + local->fd, 4096, 0, dict); + + if (dict) + dict_unref (dict); return 0; @@ -3648,7 +4790,8 @@ err: int -dht_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags) +dht_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + dict_t *xdata) { dht_local_t *local = NULL; dht_conf_t *conf = NULL; @@ -3672,6 +4815,7 @@ dht_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags) local->call_cnt = conf->subvolume_cnt; local->op_ret = 0; + local->fop_succeeded = 0; local->flags = flags; @@ -3686,7 +4830,7 @@ dht_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags) STACK_WIND (frame, dht_rmdir_opendir_cbk, conf->subvolumes[i], conf->subvolumes[i]->fops->opendir, - loc, local->fd); + loc, local->fd, NULL); } return 0; @@ -3694,17 +4838,17 @@ dht_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags) err: op_errno = (op_errno == -1) ? 
errno : op_errno; DHT_STACK_UNWIND (rmdir, frame, -1, op_errno, - NULL, NULL); + NULL, NULL, NULL); return 0; } int dht_entrylk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) + xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - DHT_STACK_UNWIND (entrylk, frame, op_ret, op_errno); + DHT_STACK_UNWIND (entrylk, frame, op_ret, op_errno, xdata); return 0; } @@ -3712,7 +4856,7 @@ dht_entrylk_cbk (call_frame_t *frame, void *cookie, int dht_entrylk (call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, const char *basename, - entrylk_cmd cmd, entrylk_type type) + entrylk_cmd cmd, entrylk_type type, dict_t *xdata) { xlator_t *subvol = NULL; int op_errno = -1; @@ -3722,7 +4866,6 @@ dht_entrylk (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (loc, err); VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); local = dht_local_init (frame, loc, NULL, GF_FOP_ENTRYLK); if (!local) { @@ -3742,13 +4885,13 @@ dht_entrylk (call_frame_t *frame, xlator_t *this, STACK_WIND (frame, dht_entrylk_cbk, subvol, subvol->fops->entrylk, - volume, loc, basename, cmd, type); + volume, loc, basename, cmd, type, xdata); return 0; err: op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (entrylk, frame, -1, op_errno); + DHT_STACK_UNWIND (entrylk, frame, -1, op_errno, NULL); return 0; } @@ -3756,10 +4899,10 @@ err: int dht_fentrylk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) + xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - DHT_STACK_UNWIND (fentrylk, frame, op_ret, op_errno); + DHT_STACK_UNWIND (fentrylk, frame, op_ret, op_errno, NULL); return 0; } @@ -3767,7 +4910,7 @@ dht_fentrylk_cbk (call_frame_t *frame, void *cookie, int dht_fentrylk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, const char *basename, - entrylk_cmd cmd, entrylk_type type) + entrylk_cmd cmd, entrylk_type type, dict_t *xdata) { xlator_t *subvol = NULL; int op_errno = -1; @@ -3786,13 +4929,13 @@ dht_fentrylk (call_frame_t *frame, xlator_t *this, STACK_WIND (frame, dht_fentrylk_cbk, subvol, subvol->fops->fentrylk, - volume, fd, basename, cmd, type); + volume, fd, basename, cmd, type, xdata); return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (fentrylk, frame, -1, op_errno); + DHT_STACK_UNWIND (fentrylk, frame, -1, op_errno, NULL); return 0; } @@ -3801,16 +4944,21 @@ err: int dht_forget (xlator_t *this, inode_t *inode) { - uint64_t tmp_layout = 0; + uint64_t ctx_int = 0; + dht_inode_ctx_t *ctx = NULL; dht_layout_t *layout = NULL; - inode_ctx_del (inode, this, &tmp_layout); + inode_ctx_del (inode, this, &ctx_int); - if (!tmp_layout) + if (!ctx_int) return 0; - layout = (dht_layout_t *)(long)tmp_layout; + ctx = (dht_inode_ctx_t *) (long) ctx_int; + + layout = ctx->layout; + ctx->layout = NULL; dht_layout_unref (this, layout); + GF_FREE (ctx); return 0; } @@ -3819,16 +4967,22 @@ dht_forget (xlator_t *this, inode_t *inode) int dht_notify (xlator_t *this, int event, void *data, ...) 
{ - xlator_t *subvol = NULL; - int cnt = -1; - int i = -1; - dht_conf_t *conf = NULL; - int ret = -1; - int propagate = 0; + xlator_t *subvol = NULL; + int cnt = -1; + int i = -1; + dht_conf_t *conf = NULL; + int ret = -1; + int propagate = 0; + + int had_heard_from_all = 0; + int have_heard_from_all = 0; + struct timeval time = {0,}; + gf_defrag_info_t *defrag = NULL; + dict_t *dict = NULL; + gf_defrag_type cmd = 0; + dict_t *output = NULL; + va_list ap; - int had_heard_from_all = 0; - int have_heard_from_all = 0; - struct timeval time = {0,}; conf = this->private; if (!conf) @@ -3890,7 +5044,11 @@ dht_notify (xlator_t *this, int event, void *data, ...) if (conf->assert_no_child_down) { gf_log (this->name, GF_LOG_WARNING, "Received CHILD_DOWN. Exiting"); - exit(0); + if (conf->defrag) { + gf_defrag_stop (conf->defrag, NULL); + } else { + kill (getpid(), SIGTERM); + } } for (i = 0; i < conf->subvolume_cnt; i++) { @@ -3941,6 +5099,36 @@ dht_notify (xlator_t *this, int event, void *data, ...) UNLOCK (&conf->subvolume_lock); break; + case GF_EVENT_VOLUME_DEFRAG: + { + if (!conf->defrag) { + return ret; + } + defrag = conf->defrag; + + dict = data; + va_start (ap, data); + output = va_arg (ap, dict_t*); + + ret = dict_get_int32 (dict, "rebalance-command", + (int32_t*)&cmd); + if (ret) + return ret; + LOCK (&defrag->lock); + { + if (defrag->is_exiting) + goto unlock; + if (cmd == GF_DEFRAG_CMD_STATUS) + gf_defrag_status_get (defrag, output); + else if (cmd == GF_DEFRAG_CMD_STOP) + gf_defrag_stop (defrag, output); + } +unlock: + UNLOCK (&defrag->lock); + return 0; + break; + } + default: propagate = 1; break; @@ -3956,9 +5144,12 @@ dht_notify (xlator_t *this, int event, void *data, ...) /* if all subvols have reported status, no need to hide anything or wait for anything else. 
Just propagate blindly */ - if (have_heard_from_all) + if (have_heard_from_all) { propagate = 1; + } + + if (!had_heard_from_all && have_heard_from_all) { /* This is the first event which completes aggregation of events from all subvolumes. If at least one subvol @@ -3977,6 +5168,19 @@ dht_notify (xlator_t *this, int event, void *data, ...) /* continue to check other events for CHILD_UP */ } } + + /* rebalance is started with assert_no_child_down. So we do + * not need to handle CHILD_DOWN event here. + */ + if (conf->defrag) { + ret = gf_thread_create (&conf->defrag->th, NULL, + gf_defrag_start, this); + if (ret) { + conf->defrag = NULL; + GF_FREE (conf->defrag); + kill (getpid(), SIGTERM); + } + } } ret = 0; @@ -3985,3 +5189,24 @@ dht_notify (xlator_t *this, int event, void *data, ...) return ret; } + +int +dht_inode_ctx_layout_get (inode_t *inode, xlator_t *this, dht_layout_t **layout) +{ + dht_inode_ctx_t *ctx = NULL; + int ret = -1; + + ret = dht_inode_ctx_get (inode, this, &ctx); + + if (!ret && ctx) { + if (ctx->layout) { + if (layout) + *layout = ctx->layout; + ret = 0; + } else { + ret = -1; + } + } + + return ret; +} diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h index 3545c0f99..5ccd66799 100644 --- a/xlators/cluster/dht/src/dht-common.h +++ b/xlators/cluster/dht/src/dht-common.h @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef _CONFIG_H @@ -22,6 +13,8 @@ #include "config.h" #endif +#include <regex.h> + #include "dht-mem-types.h" #include "libxlator.h" #include "syncop.h" @@ -29,7 +22,7 @@ #ifndef _DHT_H #define _DHT_H -#define GF_XATTR_FIX_LAYOUT_KEY "trusted.distribute.fix.layout" +#define GF_XATTR_FIX_LAYOUT_KEY "distribute.fix.layout" #define GF_DHT_LOOKUP_UNHASHED_ON 1 #define GF_DHT_LOOKUP_UNHASHED_AUTO 2 #define DHT_PATHINFO_HEADER "DISTRIBUTE:" @@ -38,7 +31,8 @@ typedef int (*dht_selfheal_dir_cbk_t) (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno); + int32_t op_ret, int32_t op_errno, + dict_t *xdata); typedef int (*dht_defrag_cbk_fn_t) (xlator_t *this, call_frame_t *frame, int ret); @@ -61,20 +55,38 @@ struct dht_layout { uint32_t start; uint32_t stop; xlator_t *xlator; - } list[0]; + } list[]; }; typedef struct dht_layout dht_layout_t; +struct dht_stat_time { + uint32_t atime; + uint32_t atime_nsec; + uint32_t ctime; + uint32_t ctime_nsec; + uint32_t mtime; + uint32_t mtime_nsec; +}; + +typedef struct dht_stat_time dht_stat_time_t; + +struct dht_inode_ctx { + dht_layout_t *layout; + dht_stat_time_t time; +}; + +typedef struct dht_inode_ctx dht_inode_ctx_t; + typedef enum { DHT_HASH_TYPE_DM, + DHT_HASH_TYPE_DM_USER, } dht_hashfn_type_t; /* rebalance related */ struct dht_rebalance_ { xlator_t *from_subvol; xlator_t *target_node; - int32_t wbflags; off_t offset; size_t size; int32_t flags; @@ -83,6 +95,7 @@ struct dht_rebalance_ { struct iovec *vector; struct iatt stbuf; 
dht_defrag_cbk_fn_t target_op_fn; + dict_t *xdata; }; struct dht_local { @@ -117,6 +130,7 @@ struct dht_local { int file_count; int dir_count; call_frame_t *main_frame; + int fop_succeeded; struct { fop_mknod_cbk_t linkfile_cbk; struct iatt stbuf; @@ -128,7 +142,6 @@ struct dht_local { struct { uint32_t hole_cnt; uint32_t overlaps_cnt; - uint32_t missing; uint32_t down; uint32_t misc; dht_selfheal_dir_cbk_t dir_cbk; @@ -141,11 +154,16 @@ struct dht_local { int32_t flags; mode_t mode; dev_t rdev; + mode_t umask; /* need for file-info */ - char *pathinfo; + char *xattr_val; char *key; + /* which xattr request? */ + char xsel[256]; + int32_t alloc_len; + char *newpath; /* gfid related */ @@ -161,18 +179,77 @@ struct dht_local { glusterfs_fop_t fop; + gf_boolean_t linked; + xlator_t *link_subvol; + struct dht_rebalance_ rebalance; + xlator_t *first_up_subvol; + }; typedef struct dht_local dht_local_t; /* du - disk-usage */ struct dht_du { double avail_percent; + double avail_inodes; uint64_t avail_space; uint32_t log; }; typedef struct dht_du dht_du_t; +enum gf_defrag_type { + GF_DEFRAG_CMD_START = 1, + GF_DEFRAG_CMD_STOP = 1 + 1, + GF_DEFRAG_CMD_STATUS = 1 + 2, + GF_DEFRAG_CMD_START_LAYOUT_FIX = 1 + 3, + GF_DEFRAG_CMD_START_FORCE = 1 + 4, +}; +typedef enum gf_defrag_type gf_defrag_type; + +enum gf_defrag_status_t { + GF_DEFRAG_STATUS_NOT_STARTED, + GF_DEFRAG_STATUS_STARTED, + GF_DEFRAG_STATUS_STOPPED, + GF_DEFRAG_STATUS_COMPLETE, + GF_DEFRAG_STATUS_FAILED, + GF_DEFRAG_STATUS_LAYOUT_FIX_STARTED, + GF_DEFRAG_STATUS_LAYOUT_FIX_STOPPED, + GF_DEFRAG_STATUS_LAYOUT_FIX_COMPLETE, + GF_DEFRAG_STATUS_LAYOUT_FIX_FAILED, +}; +typedef enum gf_defrag_status_t gf_defrag_status_t; + +typedef struct gf_defrag_pattern_list gf_defrag_pattern_list_t; + +struct gf_defrag_pattern_list { + char path_pattern[256]; + uint64_t size; + gf_defrag_pattern_list_t *next; +}; + +struct gf_defrag_info_ { + uint64_t total_files; + uint64_t total_data; + uint64_t num_files_lookedup; + uint64_t 
total_failures; + uint64_t skipped; + gf_lock_t lock; + int cmd; + pthread_t th; + gf_defrag_status_t defrag_status; + struct rpc_clnt *rpc; + uint32_t connected; + uint32_t is_exiting; + pid_t pid; + inode_t *root_inode; + uuid_t node_uuid; + struct timeval start_time; + gf_boolean_t stats; + gf_defrag_pattern_list_t *defrag_pattern; +}; + +typedef struct gf_defrag_info_ gf_defrag_info_t; + struct dht_conf { gf_lock_t subvolume_lock; int subvolume_cnt; @@ -184,7 +261,8 @@ struct dht_conf { gf_boolean_t search_unhashed; int gen; dht_du_t *du_stats; - uint64_t min_free_disk; + double min_free_disk; + double min_free_inodes; char disk_unit; int32_t refresh_interval; gf_boolean_t unhashed_sticky_bit; @@ -201,10 +279,28 @@ struct dht_conf { /* Will be a global flag to control the layout spread count */ uint32_t dir_spread_cnt; - struct syncenv *env; /* The env pointer to the rebalance synctask */ - /* to keep track of nodes which are decomissioned */ xlator_t **decommissioned_bricks; + int decommission_in_progress; + int decommission_subvols_cnt; + + /* defrag related */ + gf_defrag_info_t *defrag; + + /* Request to filter directory entries in readdir request */ + + gf_boolean_t readdir_optimize; + + /* Support regex-based name reinterpretation. */ + regex_t rsync_regex; + gf_boolean_t rsync_regex_valid; + regex_t extra_regex; + gf_boolean_t extra_regex_valid; + + /* Support variable xattr names. 
*/ + char *xattr_name; + char *link_xattr_name; + char *wild_xattr_name; }; typedef struct dht_conf dht_conf_t; @@ -219,31 +315,45 @@ struct dht_disk_layout { }; typedef struct dht_disk_layout dht_disk_layout_t; -#define WIPE(statp) do { typeof(*statp) z = {0,}; if (statp) *statp = z; } while (0) +typedef enum { + GF_DHT_MIGRATE_DATA, + GF_DHT_MIGRATE_DATA_EVEN_IF_LINK_EXISTS, + GF_DHT_MIGRATE_HARDLINK, + GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS +} gf_dht_migrate_data_type_t; #define ENTRY_MISSING(op_ret, op_errno) (op_ret == -1 && op_errno == ENOENT) -#define is_fs_root(loc) (strcmp (loc->path, "/") == 0) - -#define is_revalidate(loc) (inode_ctx_get (loc->inode, this, NULL) == 0) +#define is_revalidate(loc) (dht_inode_ctx_layout_get (loc->inode, this, NULL) == 0) #define is_last_call(cnt) (cnt == 0) #define DHT_MIGRATION_IN_PROGRESS 1 #define DHT_MIGRATION_COMPLETED 2 -#define DHT_LINKFILE_KEY "trusted.glusterfs.dht.linkto" #define DHT_LINKFILE_MODE (S_ISVTX) -#define check_is_linkfile(i,s,x) ( \ - ((st_mode_from_ia (s->ia_prot, s->ia_type) & ~S_IFMT) \ - == DHT_LINKFILE_MODE) && \ - dict_get (x, DHT_LINKFILE_KEY)) - -#define check_is_linkfile_wo_dict(i,s) ( \ - ((st_mode_from_ia (s->ia_prot, s->ia_type) & ~S_IFMT) \ - == DHT_LINKFILE_MODE)) - +#define check_is_linkfile(i,s,x,n) ( \ + ((st_mode_from_ia ((s)->ia_prot, (s)->ia_type) & ~S_IFMT) \ + == DHT_LINKFILE_MODE) && \ + dict_get (x, n)) + +#define IS_DHT_MIGRATION_PHASE2(buf) ( \ + IA_ISREG ((buf)->ia_type) && \ + ((st_mode_from_ia ((buf)->ia_prot, (buf)->ia_type) & \ + ~S_IFMT) == DHT_LINKFILE_MODE)) + +#define IS_DHT_MIGRATION_PHASE1(buf) ( \ + IA_ISREG ((buf)->ia_type) && \ + ((buf)->ia_prot.sticky == 1) && \ + ((buf)->ia_prot.sgid == 1)) + +#define DHT_STRIP_PHASE1_FLAGS(buf) do { \ + if ((buf) && IS_DHT_MIGRATION_PHASE1(buf)) { \ + (buf)->ia_prot.sticky = 0; \ + (buf)->ia_prot.sgid = 0; \ + } \ + } while (0) #define check_is_dir(i,s,x) (IA_ISDIR(s->ia_type)) @@ -271,6 +381,25 @@ typedef struct dht_disk_layout 
dht_disk_layout_t; dht_local_wipe (__xl, __local); \ } while (0) +#define DHT_UPDATE_TIME(ctx_sec, ctx_nsec, new_sec, new_nsec, inode, post) do {\ + int32_t sec = 0; \ + sec = new_sec; \ + LOCK (&inode->lock); \ + { \ + new_sec = max(new_sec, ctx_sec); \ + if (sec < new_sec) \ + new_nsec = ctx_nsec; \ + if (sec == new_sec) \ + new_nsec = max (new_nsec, ctx_nsec); \ + if (post) { \ + ctx_sec = new_sec; \ + ctx_nsec = new_nsec; \ + } \ + } \ + UNLOCK (&inode->lock); \ + } while (0) + +#define is_greater_time(a, an, b, bn) (((a) < (b)) || (((a) == (b)) && ((an) < (bn)))) dht_layout_t *dht_layout_new (xlator_t *this, int cnt); dht_layout_t *dht_layout_get (xlator_t *this, inode_t *inode); dht_layout_t *dht_layout_for_subvol (xlator_t *this, xlator_t *subvol); @@ -280,7 +409,7 @@ int dht_layout_normalize (xlator_t *this, l int dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout, uint32_t *holes_p, uint32_t *overlaps_p, uint32_t *missing_p, uint32_t *down_p, - uint32_t *misc_p); + uint32_t *misc_p, uint32_t *no_space_p); int dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, loc_t *loc, dict_t *xattr); @@ -296,7 +425,7 @@ int dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, int dht_disk_layout_extract (xlator_t *this, dht_layout_t *layout, int pos, int32_t **disk_layout_p); int dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout, - int pos, void *disk_layout_raw); + int pos, void *disk_layout_raw, int disk_layout_len); int dht_frame_return (call_frame_t *frame); @@ -314,12 +443,14 @@ int dht_iatt_merge (xlator_t *this, struct iatt *to, struct iatt xlator_t *dht_subvol_get_hashed (xlator_t *this, loc_t *loc); xlator_t *dht_subvol_get_cached (xlator_t *this, inode_t *inode); xlator_t *dht_subvol_next (xlator_t *this, xlator_t *prev); +xlator_t *dht_subvol_next_available (xlator_t *this, xlator_t *prev); int dht_subvol_cnt (xlator_t *this, xlator_t *subvol); -int dht_hash_compute (int type, 
const char *name, uint32_t *hash_p); +int dht_hash_compute (xlator_t *this, int type, const char *name, uint32_t *hash_p); int dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk, - xlator_t *tovol, xlator_t *fromvol, loc_t *loc); + xlator_t *this, xlator_t *tovol, + xlator_t *fromvol, loc_t *loc); int dht_lookup_directory (call_frame_t *frame, xlator_t *this, loc_t *loc); int dht_lookup_everywhere (call_frame_t *frame, xlator_t *this, loc_t *loc); int @@ -334,17 +465,15 @@ dht_selfheal_restore (call_frame_t *frame, dht_selfheal_dir_cbk_t cbk, int dht_layout_sort_volname (dht_layout_t *layout); -int dht_rename (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc); - int dht_get_du_info (call_frame_t *frame, xlator_t *this, loc_t *loc); -int dht_is_subvol_filled (xlator_t *this, xlator_t *subvol); -xlator_t *dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol); +gf_boolean_t dht_is_subvol_filled (xlator_t *this, xlator_t *subvol); +xlator_t *dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol, + dht_local_t *layout); int dht_get_du_info_for_subvol (xlator_t *this, int subvol_idx); int dht_layout_preset (xlator_t *this, xlator_t *subvol, inode_t *inode); -int dht_layout_set (xlator_t *this, inode_t *inode, dht_layout_t *layout); +int dht_layout_set (xlator_t *this, inode_t *inode, dht_layout_t *layout);; void dht_layout_unref (xlator_t *this, dht_layout_t *layout); dht_layout_t *dht_layout_ref (xlator_t *this, dht_layout_t *layout); xlator_t *dht_first_up_subvol (xlator_t *this); @@ -359,7 +488,8 @@ int dht_rename_cleanup (call_frame_t *frame) int dht_rename_links_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent); + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata); int dht_fix_directory_layout (call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk, @@ -382,73 +512,73 
@@ int32_t dht_lookup (call_frame_t *frame, int32_t dht_stat (call_frame_t *frame, xlator_t *this, - loc_t *loc); + loc_t *loc, dict_t *xdata); int32_t dht_fstat (call_frame_t *frame, xlator_t *this, - fd_t *fd); + fd_t *fd, dict_t *xdata); int32_t dht_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, - off_t offset); + off_t offset, dict_t *xdata); int32_t dht_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, - off_t offset); + off_t offset, dict_t *xdata); int32_t dht_access (call_frame_t *frame, xlator_t *this, loc_t *loc, - int32_t mask); + int32_t mask, dict_t *xdata); int32_t dht_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc, - size_t size); + size_t size, dict_t *xdata); -int32_t dht_mknod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dev_t rdev, dict_t *params); +int32_t dht_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, + mode_t mode, dev_t rdev, mode_t umask, dict_t *xdata); int32_t dht_mkdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dict_t *params); + loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata); int32_t dht_unlink (call_frame_t *frame, xlator_t *this, - loc_t *loc); + loc_t *loc, int xflag, dict_t *xdata); int32_t dht_rmdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, int flags); + loc_t *loc, int flags, dict_t *xdata); int32_t dht_symlink (call_frame_t *frame, xlator_t *this, - const char *linkpath, loc_t *loc, dict_t *params); + const char *linkpath, loc_t *loc, mode_t umask, + dict_t *xdata); int32_t dht_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, - loc_t *newloc); + loc_t *newloc, dict_t *xdata); int32_t dht_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, - loc_t *newloc); + loc_t *newloc, dict_t *xdata); int32_t dht_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, mode_t mode, - fd_t *fd, dict_t *params); + mode_t umask, fd_t *fd, dict_t *params); int32_t dht_open (call_frame_t *frame, xlator_t *this, 
loc_t *loc, - int32_t flags, fd_t *fd, - int32_t wbflags); + int32_t flags, fd_t *fd, dict_t *xdata); int32_t dht_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset); + off_t offset, uint32_t flags, dict_t *xdata); int32_t dht_writev (call_frame_t *frame, xlator_t *this, @@ -456,107 +586,121 @@ int32_t dht_writev (call_frame_t *frame, struct iovec *vector, int32_t count, off_t offset, - struct iobref *iobref); + uint32_t flags, + struct iobref *iobref, dict_t *xdata); int32_t dht_flush (call_frame_t *frame, xlator_t *this, - fd_t *fd); + fd_t *fd, dict_t *xdata); int32_t dht_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t datasync); + int32_t datasync, dict_t *xdata); int32_t dht_opendir (call_frame_t *frame, xlator_t *this, - loc_t *loc, fd_t *fd); + loc_t *loc, fd_t *fd, dict_t *xdata); int32_t dht_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t datasync); + int32_t datasync, dict_t *xdata); int32_t dht_statfs (call_frame_t *frame, xlator_t *this, - loc_t *loc); + loc_t *loc, dict_t *xdata); int32_t dht_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, - int32_t flags); + int32_t flags, dict_t *xdata); int32_t dht_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - const char *name); + const char *name, dict_t *xdata); int32_t dht_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, - int32_t flags); + int32_t flags, dict_t *xdata); int32_t dht_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, - const char *name); + const char *name, dict_t *xdata); int32_t dht_removexattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - const char *name); + const char *name, dict_t *xdata); +int32_t dht_fremovexattr (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + const char *name, dict_t *xdata); int32_t dht_lk (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, - struct gf_flock *flock); + struct gf_flock *flock, dict_t *xdata); int32_t 
dht_inodelk (call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, int32_t cmd, - struct gf_flock *flock); + struct gf_flock *flock, dict_t *xdata); int32_t dht_finodelk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, int32_t cmd, - struct gf_flock *flock); + struct gf_flock *flock, dict_t *xdata); int32_t dht_entrylk (call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, const char *basename, - entrylk_cmd cmd, entrylk_type type); + entrylk_cmd cmd, entrylk_type type, dict_t *xdata); int32_t dht_fentrylk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, const char *basename, - entrylk_cmd cmd, entrylk_type type); + entrylk_cmd cmd, entrylk_type type, dict_t *xdata); int32_t dht_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, - size_t size, off_t off); + size_t size, off_t off, dict_t *xdata); int32_t dht_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, - size_t size, off_t off); + size_t size, off_t off, dict_t *dict); int32_t dht_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, gf_xattrop_flags_t flags, - dict_t *dict); + dict_t *dict, dict_t *xdata); int32_t dht_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, gf_xattrop_flags_t flags, - dict_t *dict); + dict_t *dict, dict_t *xdata); int32_t dht_forget (xlator_t *this, inode_t *inode); int32_t dht_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - struct iatt *stbuf, int32_t valid); + struct iatt *stbuf, int32_t valid, dict_t *xdata); int32_t dht_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, - struct iatt *stbuf, int32_t valid); - + struct iatt *stbuf, int32_t valid, dict_t *xdata); +int32_t dht_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, + int32_t mode, off_t offset, size_t len, dict_t *xdata); +int32_t dht_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, + off_t offset, size_t len, dict_t *xdata); +int32_t dht_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, + off_t 
offset, size_t len, dict_t *xdata); + +int32_t dht_init (xlator_t *this); +void dht_fini (xlator_t *this); +int dht_reconfigure (xlator_t *this, dict_t *options); int32_t dht_notify (xlator_t *this, int32_t event, void *data, ...); /* definitions for nufa/switch */ @@ -579,12 +723,65 @@ int dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int dht_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, fd_t *fd, inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent); + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata); int dht_newfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, struct iatt *preparent, - struct iatt *postparent); + struct iatt *postparent, dict_t *xdata); + +int +gf_defrag_status_get (gf_defrag_info_t *defrag, dict_t *dict); + +int +gf_defrag_stop (gf_defrag_info_t *defrag, dict_t *output); +void* +gf_defrag_start (void *this); +int32_t +gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, dict_t *xattrs, + struct iatt *stbuf); +int +dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, + int flag); +int +dht_inode_ctx_layout_get (inode_t *inode, xlator_t *this, + dht_layout_t **layout_int); +int +dht_inode_ctx_layout_set (inode_t *inode, xlator_t *this, + dht_layout_t* layout_int); +int +dht_inode_ctx_time_update (inode_t *inode, xlator_t *this, struct iatt *stat, + int32_t update_ctx); + +int dht_inode_ctx_get (inode_t *inode, xlator_t *this, dht_inode_ctx_t **ctx); +int dht_inode_ctx_set (inode_t *inode, xlator_t *this, dht_inode_ctx_t *ctx); +int +dht_dir_attr_heal (void *data); +int +dht_dir_attr_heal_done (int ret, call_frame_t *sync_frame, void *data); +int +dht_dir_has_layout (dict_t *xattr, char *name); +gf_boolean_t +dht_is_subvol_in_layout (dht_layout_t *layout, xlator_t *xlator); +xlator_t * +dht_subvol_with_free_space_inodes (xlator_t *this, 
xlator_t *subvol, + dht_layout_t *layout); +xlator_t * +dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol, + dht_layout_t *layout); +int +dht_linkfile_attr_heal (call_frame_t *frame, xlator_t *this); + +void +dht_layout_dump (dht_layout_t *layout, const char *prefix); +int32_t +dht_priv_dump (xlator_t *this); +int32_t +dht_inodectx_dump (xlator_t *this, inode_t *inode); + +int +dht_inode_ctx_get1 (xlator_t *this, inode_t *inode, xlator_t **subvol); #endif/* _DHT_H */ diff --git a/xlators/cluster/dht/src/dht-diskusage.c b/xlators/cluster/dht/src/dht-diskusage.c index d27d8bf91..fe3955ecb 100644 --- a/xlators/cluster/dht/src/dht-diskusage.c +++ b/xlators/cluster/dht/src/dht-diskusage.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ @@ -35,227 +26,389 @@ int dht_du_info_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct statvfs *statvfs) + int op_ret, int op_errno, struct statvfs *statvfs, + dict_t *xdata) { - dht_conf_t *conf = NULL; - call_frame_t *prev = NULL; - int this_call_cnt = 0; - int i = 0; - double percent = 0; - uint64_t bytes = 0; - - conf = this->private; - prev = cookie; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "failed to get disk info from %s", prev->this->name); - goto out; - } - - if (statvfs && statvfs->f_blocks) { - percent = (statvfs->f_bavail * 100) / statvfs->f_blocks; - bytes = (statvfs->f_bavail * statvfs->f_frsize); - } - - LOCK (&conf->subvolume_lock); - { - for (i = 0; i < conf->subvolume_cnt; i++) - if (prev->this == conf->subvolumes[i]) { - conf->du_stats[i].avail_percent = percent; - conf->du_stats[i].avail_space = bytes; - gf_log (this->name, GF_LOG_DEBUG, - "on subvolume '%s': avail_percent is: " - "%.2f and avail_space is: %"PRIu64"", - prev->this->name, - conf->du_stats[i].avail_percent, - conf->du_stats[i].avail_space); - } - } - UNLOCK (&conf->subvolume_lock); + dht_conf_t *conf = NULL; + call_frame_t *prev = NULL; + int this_call_cnt = 0; + int i = 0; + double percent = 0; + double percent_inodes = 0; + uint64_t bytes = 0; + + conf = this->private; + prev = cookie; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "failed to get disk info from %s", prev->this->name); + goto out; + } + + if (statvfs && statvfs->f_blocks) { + percent = (statvfs->f_bavail * 100) / statvfs->f_blocks; + bytes = (statvfs->f_bavail * statvfs->f_frsize); + } + + if (statvfs && statvfs->f_files) { + percent_inodes = (statvfs->f_ffree * 100) / statvfs->f_files; + } else { + /* set percent inodes to 100 for dynamically allocated inode filesystems + this logic holds good so that, distribute has nothing to worry about + total inodes rather let the 'create()' to be scheduled on the hashed + subvol regardless 
of the total inodes. since we have no awareness on + loosing inodes this logic fits well + */ + percent_inodes = 100; + } + + LOCK (&conf->subvolume_lock); + { + for (i = 0; i < conf->subvolume_cnt; i++) + if (prev->this == conf->subvolumes[i]) { + conf->du_stats[i].avail_percent = percent; + conf->du_stats[i].avail_space = bytes; + conf->du_stats[i].avail_inodes = percent_inodes; + gf_log (this->name, GF_LOG_DEBUG, + "on subvolume '%s': avail_percent is: " + "%.2f and avail_space is: %"PRIu64" " + "and avail_inodes is: %.2f", + prev->this->name, + conf->du_stats[i].avail_percent, + conf->du_stats[i].avail_space, + conf->du_stats[i].avail_inodes); + } + } + UNLOCK (&conf->subvolume_lock); out: - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) - DHT_STACK_DESTROY (frame); + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) + DHT_STACK_DESTROY (frame); - return 0; + return 0; } int dht_get_du_info_for_subvol (xlator_t *this, int subvol_idx) { - dht_conf_t *conf = NULL; - call_frame_t *statfs_frame = NULL; - dht_local_t *statfs_local = NULL; - call_pool_t *pool = NULL; - - conf = this->private; - pool = this->ctx->pool; - - statfs_frame = create_frame (this, pool); - if (!statfs_frame) { - goto err; - } - - /* local->fop value is not used in this case */ - statfs_local = dht_local_init (statfs_frame, NULL, NULL, - GF_FOP_MAXVALUE); - if (!statfs_local) { - goto err; - } - - loc_t tmp_loc = { .inode = NULL, - .path = "/", - }; - - statfs_local->call_cnt = 1; - STACK_WIND (statfs_frame, dht_du_info_cbk, - conf->subvolumes[subvol_idx], - conf->subvolumes[subvol_idx]->fops->statfs, - &tmp_loc); - - return 0; + dht_conf_t *conf = NULL; + call_frame_t *statfs_frame = NULL; + dht_local_t *statfs_local = NULL; + call_pool_t *pool = NULL; + loc_t tmp_loc = {0,}; + + conf = this->private; + pool = this->ctx->pool; + + statfs_frame = create_frame (this, pool); + if (!statfs_frame) { + goto err; + } + + /* local->fop value 
is not used in this case */ + statfs_local = dht_local_init (statfs_frame, NULL, NULL, + GF_FOP_MAXVALUE); + if (!statfs_local) { + goto err; + } + + /* make it root gfid, should be enough to get the proper info back */ + tmp_loc.gfid[15] = 1; + + statfs_local->call_cnt = 1; + STACK_WIND (statfs_frame, dht_du_info_cbk, + conf->subvolumes[subvol_idx], + conf->subvolumes[subvol_idx]->fops->statfs, + &tmp_loc, NULL); + + return 0; err: - if (statfs_frame) - DHT_STACK_DESTROY (statfs_frame); + if (statfs_frame) + DHT_STACK_DESTROY (statfs_frame); - return -1; + return -1; } int dht_get_du_info (call_frame_t *frame, xlator_t *this, loc_t *loc) { - int i = 0; - dht_conf_t *conf = NULL; - call_frame_t *statfs_frame = NULL; - dht_local_t *statfs_local = NULL; - struct timeval tv = {0,}; + int i = 0; + dht_conf_t *conf = NULL; + call_frame_t *statfs_frame = NULL; + dht_local_t *statfs_local = NULL; + struct timeval tv = {0,}; + loc_t tmp_loc = {0,}; + + conf = this->private; + + gettimeofday (&tv, NULL); + + /* make it root gfid, should be enough to get the proper + info back */ + tmp_loc.gfid[15] = 1; + + if (tv.tv_sec > (conf->refresh_interval + + conf->last_stat_fetch.tv_sec)) { + + statfs_frame = copy_frame (frame); + if (!statfs_frame) { + goto err; + } + + /* In this case, 'local->fop' is not used */ + statfs_local = dht_local_init (statfs_frame, loc, NULL, + GF_FOP_MAXVALUE); + if (!statfs_local) { + goto err; + } + + statfs_local->call_cnt = conf->subvolume_cnt; + for (i = 0; i < conf->subvolume_cnt; i++) { + STACK_WIND (statfs_frame, dht_du_info_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->statfs, + &tmp_loc, NULL); + } + + conf->last_stat_fetch.tv_sec = tv.tv_sec; + } + return 0; +err: + if (statfs_frame) + DHT_STACK_DESTROY (statfs_frame); - conf = this->private; + return -1; +} - gettimeofday (&tv, NULL); - if (tv.tv_sec > (conf->refresh_interval - + conf->last_stat_fetch.tv_sec)) { - statfs_frame = copy_frame (frame); - if (!statfs_frame) { - goto 
err; - } +gf_boolean_t +dht_is_subvol_filled (xlator_t *this, xlator_t *subvol) +{ + int i = 0; + dht_conf_t *conf = NULL; + gf_boolean_t subvol_filled_inodes = _gf_false; + gf_boolean_t subvol_filled_space = _gf_false; + gf_boolean_t is_subvol_filled = _gf_false; + + conf = this->private; + + /* Check for values above specified percent or free disk */ + LOCK (&conf->subvolume_lock); + { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (subvol == conf->subvolumes[i]) { + if (conf->disk_unit == 'p') { + if (conf->du_stats[i].avail_percent < + conf->min_free_disk) { + subvol_filled_space = _gf_true; + break; + } + + } else { + if (conf->du_stats[i].avail_space < + conf->min_free_disk) { + subvol_filled_space = _gf_true; + break; + } + } + if (conf->du_stats[i].avail_inodes < + conf->min_free_inodes) { + subvol_filled_inodes = _gf_true; + break; + } + } + } + } + UNLOCK (&conf->subvolume_lock); + + if (subvol_filled_space && conf->subvolume_status[i]) { + if (!(conf->du_stats[i].log++ % (GF_UNIVERSAL_ANSWER * 10))) { + gf_log (this->name, GF_LOG_WARNING, + "disk space on subvolume '%s' is getting " + "full (%.2f %%), consider adding more nodes", + subvol->name, + (100 - conf->du_stats[i].avail_percent)); + } + } + + if (subvol_filled_inodes && conf->subvolume_status[i]) { + if (!(conf->du_stats[i].log++ % (GF_UNIVERSAL_ANSWER * 10))) { + gf_log (this->name, GF_LOG_CRITICAL, + "inodes on subvolume '%s' are at " + "(%.2f %%), consider adding more nodes", + subvol->name, + (100 - conf->du_stats[i].avail_inodes)); + } + } + + is_subvol_filled = (subvol_filled_space || subvol_filled_inodes); + + return is_subvol_filled; +} - /* In this case, 'local->fop' is not used */ - statfs_local = dht_local_init (statfs_frame, loc, NULL, - GF_FOP_MAXVALUE); - if (!statfs_local) { - goto err; - } - loc_t tmp_loc = { .inode = NULL, - .path = "/", - }; +/*Get the best subvolume to create the file in*/ +xlator_t * +dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol, + 
dht_local_t *local) +{ + xlator_t *avail_subvol = NULL; + dht_conf_t *conf = NULL; + dht_layout_t *layout = NULL; + loc_t *loc = NULL; - statfs_local->call_cnt = conf->subvolume_cnt; - for (i = 0; i < conf->subvolume_cnt; i++) { - STACK_WIND (statfs_frame, dht_du_info_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->statfs, - &tmp_loc); + conf = this->private; + if (!local) + goto out; + loc = &local->loc; + if (!local->layout) { + layout = dht_layout_get (this, loc->parent); + + if (!layout) { + gf_log (this->name, GF_LOG_DEBUG, + "layout missing path=%s parent=%s", + loc->path, uuid_utoa (loc->parent->gfid)); + goto out; } - - conf->last_stat_fetch.tv_sec = tv.tv_sec; + } else { + layout = dht_layout_ref (this, local->layout); } - return 0; -err: - if (statfs_frame) - DHT_STACK_DESTROY (statfs_frame); - return -1; + LOCK (&conf->subvolume_lock); + { + avail_subvol = dht_subvol_with_free_space_inodes(this, subvol, + layout); + if(!avail_subvol) + { + avail_subvol = dht_subvol_maxspace_nonzeroinode(this, + subvol, + layout); + } + + } + UNLOCK (&conf->subvolume_lock); +out: + if (!avail_subvol) { + gf_log (this->name, + GF_LOG_DEBUG, + "no subvolume has enough free space and/or inodes\ + to create"); + avail_subvol = subvol; + } + + if (layout) + dht_layout_unref (this, layout); + return avail_subvol; } +static inline +int32_t dht_subvol_has_err (xlator_t *this, dht_layout_t *layout) +{ + int ret = -1; + int i = 0; -int -dht_is_subvol_filled (xlator_t *this, xlator_t *subvol) + if (!this || !layout) + goto out; + + /* check if subvol has layout errors, before selecting it */ + for (i = 0; i < layout->cnt; i++) { + if (!strcmp (layout->list[i].xlator->name, this->name) && + (layout->list[i].err != 0)) { + ret = -1; + goto out; + } + } + ret = 0; +out: + return ret; +} + +/*Get subvolume which has both space and inodes more than the min criteria*/ +xlator_t * +dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol, + dht_layout_t *layout) { - int 
i = 0; - int subvol_filled = 0; + int i = 0; + double max = 0; + double max_inodes = 0; + int ignore_subvol = 0; + + xlator_t *avail_subvol = NULL; dht_conf_t *conf = NULL; conf = this->private; - /* Check for values above specified percent or free disk */ - LOCK (&conf->subvolume_lock); - { - for (i = 0; i < conf->subvolume_cnt; i++) { - if (subvol == conf->subvolumes[i]) { - if (conf->disk_unit == 'p') { - if (conf->du_stats[i].avail_percent < - conf->min_free_disk) { - subvol_filled = 1; - break; - } - } else { - if (conf->du_stats[i].avail_space < - conf->min_free_disk) { - subvol_filled = 1; - break; - } - } + for(i=0; i < conf->subvolume_cnt; i++) { + /* check if subvol has layout errors, before selecting it */ + ignore_subvol = dht_subvol_has_err (conf->subvolumes[i], + layout); + if (ignore_subvol) + continue; + + if ((conf->disk_unit == 'p') && + (conf->du_stats[i].avail_percent > conf->min_free_disk) && + (conf->du_stats[i].avail_inodes > conf->min_free_inodes)) { + if ((conf->du_stats[i].avail_inodes > max_inodes) || + (conf->du_stats[i].avail_percent > max)) { + max = conf->du_stats[i].avail_percent; + max_inodes = conf->du_stats[i].avail_inodes; + avail_subvol = conf->subvolumes[i]; } } - } - UNLOCK (&conf->subvolume_lock); - - if (subvol_filled && conf->subvolume_status[i]) { - if (!(conf->du_stats[i].log++ % (GF_UNIVERSAL_ANSWER * 10))) { - gf_log (this->name, GF_LOG_WARNING, - "disk space on subvolume '%s' is getting " - "full (%.2f %%), consider adding more nodes", - subvol->name, - (100 - conf->du_stats[i].avail_percent)); + + if ((conf->disk_unit != 'p') && + (conf->du_stats[i].avail_space > conf->min_free_disk) && + (conf->du_stats[i].avail_inodes > conf->min_free_inodes)) { + if ((conf->du_stats[i].avail_inodes > max_inodes) || + (conf->du_stats[i].avail_space > max)) { + max = conf->du_stats[i].avail_space; + max_inodes = conf->du_stats[i].avail_inodes; + avail_subvol = conf->subvolumes[i]; + } } } - return subvol_filled; + return 
avail_subvol; } + +/* Get subvol which has atleast one inode and maximum space */ xlator_t * -dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol) +dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol, + dht_layout_t *layout) { int i = 0; - double max= 0; + double max = 0; + int ignore_subvol = 0; + xlator_t *avail_subvol = NULL; dht_conf_t *conf = NULL; conf = this->private; - LOCK (&conf->subvolume_lock); - { - for (i = 0; i < conf->subvolume_cnt; i++) { - if (conf->disk_unit == 'p') { - if (conf->du_stats[i].avail_percent > max) { - max = conf->du_stats[i].avail_percent; - avail_subvol = conf->subvolumes[i]; - } - } else { - if (conf->du_stats[i].avail_space > max) { - max = conf->du_stats[i].avail_space; - avail_subvol = conf->subvolumes[i]; - } + for (i = 0; i < conf->subvolume_cnt; i++) { + /* check if subvol has layout errors, before selecting it */ + ignore_subvol = dht_subvol_has_err (conf->subvolumes[i], + layout); + if (ignore_subvol) + continue; + + if (conf->disk_unit == 'p') { + if ((conf->du_stats[i].avail_percent > max) + && (conf->du_stats[i].avail_inodes > 0 )) { + max = conf->du_stats[i].avail_percent; + avail_subvol = conf->subvolumes[i]; } - } - } - UNLOCK (&conf->subvolume_lock); - - if (!avail_subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no subvolume has enough free space to create"); + } else { + if ((conf->du_stats[i].avail_space > max) + && (conf->du_stats[i].avail_inodes > 0)) { + max = conf->du_stats[i].avail_space; + avail_subvol = conf->subvolumes[i]; + } + } } - if (max < conf->min_free_disk) - avail_subvol = subvol; - - if (!avail_subvol) - avail_subvol = subvol; - return avail_subvol; } diff --git a/xlators/cluster/dht/src/dht-hashfn.c b/xlators/cluster/dht/src/dht-hashfn.c index c8ae74172..656cf23a0 100644 --- a/xlators/cluster/dht/src/dht-hashfn.c +++ b/xlators/cluster/dht/src/dht-hashfn.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. 
<http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ #ifndef _CONFIG_H @@ -37,6 +28,7 @@ dht_hash_compute_internal (int type, const char *name, uint32_t *hash_p) switch (type) { case DHT_HASH_TYPE_DM: + case DHT_HASH_TYPE_DM_USER: hash = gf_dm_hashfn (name, strlen (name)); break; default: @@ -52,30 +44,68 @@ dht_hash_compute_internal (int type, const char *name, uint32_t *hash_p) } -#define MAKE_RSYNC_FRIENDLY_NAME(rsync_frndly_name, name) do { \ - rsync_frndly_name = (char *) name; \ - if (name[0] == '.') { \ - char *dot = 0; \ - int namelen = 0; \ - \ - dot = strrchr (name, '.'); \ - if (dot && dot > (name + 1) && *(dot + 1)) { \ - namelen = (dot - name); \ - rsync_frndly_name = alloca (namelen); \ - strncpy (rsync_frndly_name, name + 1, \ - namelen); \ - rsync_frndly_name[namelen - 1] = 0; \ - } \ - } \ - } while (0); +static inline +gf_boolean_t +dht_munge_name (const char *original, char *modified, size_t len, regex_t *re) +{ + regmatch_t matches[2]; + size_t new_len; + + if (regexec(re,original,2,matches,0) != REG_NOMATCH) { + if (matches[1].rm_so != -1) { + new_len = matches[1].rm_eo - matches[1].rm_so; + /* Equal would fail due to the NUL at the end. */ + if (new_len < len) { + memcpy (modified,original+matches[1].rm_so, + new_len); + modified[new_len] = '\0'; + return _gf_true; + } + } + } + /* This is guaranteed safe because of how the dest was allocated. */ + strcpy(modified,original); + return _gf_false; +} int -dht_hash_compute (int type, const char *name, uint32_t *hash_p) +dht_hash_compute (xlator_t *this, int type, const char *name, uint32_t *hash_p) { - char *rsync_friendly_name = NULL; + char *rsync_friendly_name = NULL; + dht_conf_t *priv = this->private; + size_t len = 0; + gf_boolean_t munged = _gf_false; + + /* + * It wouldn't be safe to use alloca in an inline function that doesn't + * actually get inlined, and it wouldn't be efficient to do a real + * allocation, so we use alloca here (if needed) and pass that to the + * inline. 
+ */ + + if (priv->extra_regex_valid) { + len = strlen(name) + 1; + rsync_friendly_name = alloca(len); + munged = dht_munge_name (name, rsync_friendly_name, len, + &priv->extra_regex); + } + + if (!munged && priv->rsync_regex_valid) { + len = strlen(name) + 1; + rsync_friendly_name = alloca(len); + gf_log (this->name, GF_LOG_TRACE, "trying regex for %s", name); + munged = dht_munge_name (name, rsync_friendly_name, len, + &priv->rsync_regex); + if (munged) { + gf_log (this->name, GF_LOG_DEBUG, + "munged down to %s", rsync_friendly_name); + } + } - MAKE_RSYNC_FRIENDLY_NAME (rsync_friendly_name, name); + if (!munged) { + rsync_friendly_name = (char *)name; + } return dht_hash_compute_internal (type, rsync_friendly_name, hash_p); } diff --git a/xlators/cluster/dht/src/dht-helper.c b/xlators/cluster/dht/src/dht-helper.c index d8138067e..311a48112 100644 --- a/xlators/cluster/dht/src/dht-helper.c +++ b/xlators/cluster/dht/src/dht-helper.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. 
+ This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef _CONFIG_H @@ -27,6 +18,28 @@ #include "xlator.h" #include "dht-common.h" +static inline int +dht_inode_ctx_set1 (xlator_t *this, inode_t *inode, xlator_t *subvol) +{ + uint64_t tmp_subvol = 0; + + tmp_subvol = (long)subvol; + return inode_ctx_set1 (inode, this, &tmp_subvol); +} + +int +dht_inode_ctx_get1 (xlator_t *this, inode_t *inode, xlator_t **subvol) +{ + int ret = -1; + uint64_t tmp_subvol = 0; + + ret = inode_ctx_get1 (inode, this, &tmp_subvol); + if (tmp_subvol && subvol) + *subvol = (xlator_t *)tmp_subvol; + + return ret; +} + int dht_frame_return (call_frame_t *frame) @@ -49,6 +62,43 @@ dht_frame_return (call_frame_t *frame) } +static uint64_t +dht_bits_for (uint64_t num) +{ + uint64_t bits = 0, ctrl = 1; + + while (ctrl < num) { + ctrl *= 2; + bits ++; + } + + return bits; +} + +/* + * A slightly "updated" version of the algorithm described in the commit log + * is used here. + * + * The only enhancement is that: + * + * - The number of bits used by the backend filesystem for HUGE d_off which + * is described as 63, and + * - The number of bits used by the d_off presented by the transformation + * upwards which is described as 64, are both made "configurable." 
+ */ + + +#define BACKEND_D_OFF_BITS 63 +#define PRESENT_D_OFF_BITS 63 + +#define ONE 1ULL +#define MASK (~0ULL) +#define PRESENT_MASK (MASK >> (64 - PRESENT_D_OFF_BITS)) +#define BACKEND_MASK (MASK >> (64 - BACKEND_D_OFF_BITS)) + +#define TOP_BIT (ONE << (PRESENT_D_OFF_BITS - 1)) +#define SHIFT_BITS (max (0, (BACKEND_D_OFF_BITS - PRESENT_D_OFF_BITS + 1))) + int dht_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y_p) { @@ -56,6 +106,9 @@ dht_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y_p) int cnt = 0; int max = 0; uint64_t y = 0; + uint64_t hi_mask = 0; + uint64_t off_mask = 0; + int max_bits = 0; if (x == ((uint64_t) -1)) { y = (uint64_t) -1; @@ -69,7 +122,23 @@ dht_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y_p) max = conf->subvolume_cnt; cnt = dht_subvol_cnt (this, subvol); - y = ((x * max) + cnt); + if (max == 1) { + y = x; + goto out; + } + + max_bits = dht_bits_for (max); + + hi_mask = ~(PRESENT_MASK >> (max_bits + 1)); + + if (x & hi_mask) { + /* HUGE d_off */ + off_mask = MASK << max_bits; + y = TOP_BIT | ((x >> SHIFT_BITS) & off_mask) | cnt; + } else { + /* small d_off */ + y = ((x * max) + cnt); + } out: if (y_p) @@ -89,7 +158,7 @@ dht_filter_loc_subvol_key (xlator_t *this, loc_t *loc, loc_t *new_loc, int ret = 0; /* not found */ /* Why do other tasks if first required 'char' itself is not there */ - if (loc->name && !strchr (loc->name, '@')) + if (!new_loc || !loc || !loc->name || !strchr (loc->name, '@')) goto out; trav = this->children; @@ -117,7 +186,6 @@ dht_filter_loc_subvol_key (xlator_t *this, loc_t *loc, loc_t *new_loc, new_loc->path = ((new_path) ? 
new_path: gf_strdup (loc->path)); new_loc->name = new_name; - new_loc->ino = loc->ino; new_loc->inode = inode_ref (loc->inode); new_loc->parent = inode_ref (loc->parent); } @@ -130,10 +198,8 @@ dht_filter_loc_subvol_key (xlator_t *this, loc_t *loc, loc_t *new_loc, out: if (!ret) { /* !success */ - if (new_path) - GF_FREE (new_path); - if (new_name) - GF_FREE (new_name); + GF_FREE (new_path); + GF_FREE (new_name); } return ret; } @@ -147,16 +213,38 @@ dht_deitransform (xlator_t *this, uint64_t y, xlator_t **subvol_p, int max = 0; uint64_t x = 0; xlator_t *subvol = 0; + int max_bits = 0; + uint64_t off_mask = 0; + uint64_t host_mask = 0; if (!this->private) - goto out; + return -1; conf = this->private; max = conf->subvolume_cnt; - cnt = y % max; - x = y / max; + if (max == 1) { + x = y; + cnt = 0; + goto out; + } + + if (y & TOP_BIT) { + /* HUGE d_off */ + max_bits = dht_bits_for (max); + off_mask = (MASK << max_bits); + host_mask = ~(off_mask); + + x = ((y & ~TOP_BIT) & off_mask) << SHIFT_BITS; + + cnt = y & host_mask; + } else { + /* small d_off */ + cnt = y % max; + x = y / max; + } +out: subvol = conf->subvolumes[cnt]; if (subvol_p) @@ -165,7 +253,6 @@ dht_deitransform (xlator_t *this, uint64_t y, xlator_t **subvol_p, if (x_p) *x_p = x; -out: return 0; } @@ -216,21 +303,16 @@ dht_local_wipe (xlator_t *this, dht_local_t *local) local->selfheal.layout = NULL; } - if (local->newpath) { - GF_FREE (local->newpath); - } + GF_FREE (local->newpath); - if (local->key) { - GF_FREE (local->key); - } + GF_FREE (local->key); - if (local->rebalance.vector) - GF_FREE (local->rebalance.vector); + GF_FREE (local->rebalance.vector); if (local->rebalance.iobref) iobref_unref (local->rebalance.iobref); - GF_FREE (local); + mem_put (local); } @@ -241,8 +323,7 @@ dht_local_init (call_frame_t *frame, loc_t *loc, fd_t *fd, glusterfs_fop_t fop) inode_t *inode = NULL; int ret = 0; - /* TODO: use mem-pool */ - local = GF_CALLOC (1, sizeof (*local), gf_dht_mt_dht_local_t); + local = 
mem_get0 (THIS->local_pool); if (!local) goto out; @@ -275,26 +356,12 @@ dht_local_init (call_frame_t *frame, loc_t *loc, fd_t *fd, glusterfs_fop_t fop) out: if (ret) { if (local) - GF_FREE (local); + mem_put (local); local = NULL; } return local; } - -char * -basestr (const char *str) -{ - char *basestr = NULL; - - basestr = strrchr (str, '/'); - if (basestr) - basestr ++; - - return basestr; -} - - xlator_t * dht_first_up_subvol (xlator_t *this) { @@ -359,17 +426,23 @@ dht_subvol_get_hashed (xlator_t *this, loc_t *loc) dht_layout_t *layout = NULL; xlator_t *subvol = NULL; - if (is_fs_root (loc)) { + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO (this->name, loc, out); + + if (__is_root_gfid (loc->gfid)) { subvol = dht_first_up_subvol (this); goto out; } + GF_VALIDATE_OR_GOTO (this->name, loc->parent, out); + GF_VALIDATE_OR_GOTO (this->name, loc->name, out); + layout = dht_layout_get (this, loc->parent); if (!layout) { gf_log (this->name, GF_LOG_DEBUG, - "layout missing path=%s parent=%"PRId64, - loc->path, loc->parent->ino); + "layout missing path=%s parent=%s", + loc->path, uuid_utoa (loc->parent->gfid)); goto out; } @@ -397,6 +470,8 @@ dht_subvol_get_cached (xlator_t *this, inode_t *inode) dht_layout_t *layout = NULL; xlator_t *subvol = NULL; + GF_VALIDATE_OR_GOTO (this->name, this, out); + GF_VALIDATE_OR_GOTO (this->name, inode, out); layout = dht_layout_get (this, inode); @@ -438,7 +513,36 @@ out: return next; } +/* This func wraps around, if prev is actually the last subvol. + */ +xlator_t * +dht_subvol_next_available (xlator_t *this, xlator_t *prev) +{ + dht_conf_t *conf = NULL; + int i = 0; + xlator_t *next = NULL; + + conf = this->private; + if (!conf) + goto out; + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvolumes[i] == prev) { + /* if prev is last in conf->subvolumes, then wrap + * around. 
+ */ + if ((i + 1) < conf->subvolume_cnt) { + next = conf->subvolumes[i + 1]; + } else { + next = conf->subvolumes[0]; + } + break; + } + } + +out: + return next; +} int dht_subvol_cnt (xlator_t *this, xlator_t *subvol) { @@ -467,6 +571,15 @@ out: (a) = (b); \ } while (0) + +#define set_if_greater_time(a, an, b, bn) do { \ + if (((a) < (b)) || (((a) == (b)) && ((an) < (bn)))){ \ + (a) = (b); \ + (an) = (bn); \ + } \ + } while (0) \ + + int dht_iatt_merge (xlator_t *this, struct iatt *to, struct iatt *from, xlator_t *subvol) @@ -490,9 +603,12 @@ dht_iatt_merge (xlator_t *this, struct iatt *to, set_if_greater (to->ia_uid, from->ia_uid); set_if_greater (to->ia_gid, from->ia_gid); - set_if_greater (to->ia_atime, from->ia_atime); - set_if_greater (to->ia_mtime, from->ia_mtime); - set_if_greater (to->ia_ctime, from->ia_ctime); + set_if_greater_time(to->ia_atime, to->ia_atime_nsec, + from->ia_atime, from->ia_atime_nsec); + set_if_greater_time (to->ia_mtime, to->ia_mtime_nsec, + from->ia_mtime, from->ia_mtime_nsec); + set_if_greater_time (to->ia_ctime, to->ia_ctime_nsec, + from->ia_ctime, from->ia_ctime_nsec); return 0; } @@ -618,20 +734,36 @@ dht_migration_complete_check_task (void *data) call_frame_t *frame = NULL; loc_t tmp_loc = {0,}; char *path = NULL; + dht_conf_t *conf = NULL; + inode_t *inode = NULL; + fd_t *iter_fd = NULL; + uint64_t tmp_subvol = 0; + int open_failed = 0; this = THIS; frame = data; local = frame->local; + conf = this->private; src_node = local->cached_subvol; - /* getxattr on cached_subvol for 'linkto' value */ - if (!local->loc.inode) + if (!local->loc.inode && !local->fd) + goto out; + + inode = (!local->fd) ? local->loc.inode : local->fd->inode; + + /* getxattr on cached_subvol for 'linkto' value. Do path based getxattr + * as root:root. 
If a fd is already open, access check wont be done*/ + + if (!local->loc.inode) { ret = syncop_fgetxattr (src_node, local->fd, &dict, - DHT_LINKFILE_KEY); - else + conf->link_xattr_name); + } else { + SYNCTASK_SETID (0, 0); ret = syncop_getxattr (src_node, &local->loc, &dict, - DHT_LINKFILE_KEY); + conf->link_xattr_name); + SYNCTASK_SETID (frame->root->uid, frame->root->gid); + } if (!ret) dst_node = dht_linkfile_subvol (this, NULL, NULL, dict); @@ -682,10 +814,7 @@ dht_migration_complete_check_task (void *data) /* update inode ctx (the layout) */ dht_layout_unref (this, local->layout); - if (!local->loc.inode) - ret = dht_layout_preset (this, dst_node, local->fd->inode); - else - ret = dht_layout_preset (this, dst_node, local->loc.inode); + ret = dht_layout_preset (this, dst_node, inode); if (ret != 0) { gf_log (this->name, GF_LOG_DEBUG, "%s: could not set preset layout for subvol %s", @@ -703,10 +832,7 @@ dht_migration_complete_check_task (void *data) goto out; } - if (!local->loc.inode) - ret = dht_layout_set (this, local->fd->inode, layout); - else - ret = dht_layout_set (this, local->loc.inode, layout); + ret = dht_layout_set (this, inode, layout); if (ret) { gf_log (this->name, GF_LOG_ERROR, "%s: failed to set the new layout", @@ -717,43 +843,46 @@ dht_migration_complete_check_task (void *data) local->cached_subvol = dst_node; ret = 0; - if (!local->fd) + /* once we detect the migration complete, the inode-ctx2 is no more + required.. delete the ctx and also, it means, open() already + done on all the fd of inode */ + ret = inode_ctx_reset1 (inode, this, &tmp_subvol); + if (tmp_subvol) goto out; - /* once we detect the migration complete, the fd-ctx is no more - required.. 
delete the ctx, and do one extra 'fd_unref' for open fd */ - ret = fd_ctx_del (local->fd, this, NULL); - if (!ret) { - fd_unref (local->fd); - ret = 0; + if (list_empty (&inode->fd_list)) goto out; - } - /* if 'local->fd' (ie, fd based operation), send a 'open()' on - destination if not already done */ - if (local->loc.inode) { - ret = syncop_open (dst_node, &local->loc, - local->fd->flags, local->fd); - } else { - tmp_loc.inode = local->fd->inode; - inode_path (local->fd->inode, NULL, &path); - if (path) - tmp_loc.path = path; - ret = syncop_open (dst_node, &tmp_loc, - local->fd->flags, local->fd); - if (path) - GF_FREE (path); + /* perform open as root:root. There is window between linkfile + * creation(root:root) and setattr with the correct uid/gid + */ + SYNCTASK_SETID(0, 0); + + /* perform 'open()' on all the fd's present on the inode */ + tmp_loc.inode = inode; + inode_path (inode, NULL, &path); + if (path) + tmp_loc.path = path; + list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { + if (fd_is_anonymous (iter_fd)) + continue; + ret = syncop_open (dst_node, &tmp_loc, + iter_fd->flags, iter_fd); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, "failed to open " + "the fd (%p, flags=0%o) on file %s @ %s", + iter_fd, iter_fd->flags, path, dst_node->name); + open_failed = 1; + } } - if (ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "%s: failed to send open() on target file at %s", - local->loc.path, dst_node->name); + GF_FREE (path); + + SYNCTASK_SETID (frame->root->uid, frame->root->gid); + if (open_failed) { + ret = -1; goto out; } - - /* need this unref for the fd on src_node */ - fd_unref (local->fd); ret = 0; out: @@ -764,11 +893,8 @@ int dht_rebalance_complete_check (xlator_t *this, call_frame_t *frame) { int ret = -1; - dht_conf_t *conf = NULL; - - conf = this->private; - ret = synctask_new (conf->env, dht_migration_complete_check_task, + ret = synctask_new (this->ctx->env, dht_migration_complete_check_task, 
dht_migration_complete_check_done, frame, frame); return ret; @@ -800,20 +926,34 @@ dht_rebalance_inprogress_task (void *data) char *path = NULL; struct iatt stbuf = {0,}; loc_t tmp_loc = {0,}; + dht_conf_t *conf = NULL; + inode_t *inode = NULL; + fd_t *iter_fd = NULL; + int open_failed = 0; this = THIS; frame = data; local = frame->local; + conf = this->private; src_node = local->cached_subvol; - /* getxattr on cached_subvol for 'linkto' value */ - if (local->loc.inode) + if (!local->loc.inode && !local->fd) + goto out; + + inode = (!local->fd) ? local->loc.inode : local->fd->inode; + + /* getxattr on cached_subvol for 'linkto' value. Do path based getxattr + * as root:root. If a fd is already open, access check wont be done*/ + if (local->loc.inode) { + SYNCTASK_SETID (0, 0); ret = syncop_getxattr (src_node, &local->loc, &dict, - DHT_LINKFILE_KEY); - else + conf->link_xattr_name); + SYNCTASK_SETID (frame->root->uid, frame->root->gid); + } else { ret = syncop_fgetxattr (src_node, local->fd, &dict, - DHT_LINKFILE_KEY); + conf->link_xattr_name); + } if (ret) { gf_log (this->name, GF_LOG_ERROR, @@ -855,34 +995,46 @@ dht_rebalance_inprogress_task (void *data) ret = 0; - if (!local->fd) - goto out; + if (list_empty (&inode->fd_list)) + goto done; + + /* perform open as root:root. 
There is window between linkfile + * creation(root:root) and setattr with the correct uid/gid + */ + SYNCTASK_SETID (0, 0); + + tmp_loc.inode = inode; + inode_path (inode, NULL, &path); + if (path) + tmp_loc.path = path; + + list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { + if (fd_is_anonymous (iter_fd)) + continue; - if (local->loc.inode) { - ret = syncop_open (dst_node, &local->loc, - local->fd->flags, local->fd); - } else { - tmp_loc.inode = local->fd->inode; - inode_path (local->fd->inode, NULL, &path); - if (path) - tmp_loc.path = path; ret = syncop_open (dst_node, &tmp_loc, - local->fd->flags, local->fd); - if (path) - GF_FREE (path); + iter_fd->flags, iter_fd); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, "failed to send open " + "the fd (%p, flags=0%o) on file %s @ %s", + iter_fd, iter_fd->flags, path, dst_node->name); + open_failed = 1; + } } + GF_FREE (path); - if (ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "%s: failed to send open() on target file at %s", - local->loc.path, dst_node->name); + SYNCTASK_SETID (frame->root->uid, frame->root->gid); + + if (open_failed) { + ret = -1; goto out; } - ret = fd_ctx_set (local->fd, this, (uint64_t)(long)dst_node); +done: + ret = dht_inode_ctx_set1 (this, inode, dst_node); if (ret) { gf_log (this->name, GF_LOG_ERROR, - "%s: failed to set fd-ctx target file at %s", + "%s: failed to set inode-ctx target file at %s", local->loc.path, dst_node->name); goto out; } @@ -897,12 +1049,99 @@ dht_rebalance_in_progress_check (xlator_t *this, call_frame_t *frame) { int ret = -1; - dht_conf_t *conf = NULL; - conf = this->private; - - ret = synctask_new (conf->env, dht_rebalance_inprogress_task, + ret = synctask_new (this->ctx->env, dht_rebalance_inprogress_task, dht_inprogress_check_done, frame, frame); return ret; } + +int +dht_inode_ctx_layout_set (inode_t *inode, xlator_t *this, + dht_layout_t *layout_int) +{ + dht_inode_ctx_t *ctx = NULL; + int ret = -1; + + ret = dht_inode_ctx_get (inode, this, 
&ctx); + if (!ret && ctx) { + ctx->layout = layout_int; + } else { + ctx = GF_CALLOC (1, sizeof (*ctx), gf_dht_mt_inode_ctx_t); + if (!ctx) + return ret; + ctx->layout = layout_int; + } + + ret = dht_inode_ctx_set (inode, this, ctx); + + return ret; +} + +int +dht_inode_ctx_time_update (inode_t *inode, xlator_t *this, struct iatt *stat, + int32_t post) +{ + dht_inode_ctx_t *ctx = NULL; + dht_stat_time_t *time = 0; + int ret = -1; + + GF_VALIDATE_OR_GOTO (this->name, stat, out); + GF_VALIDATE_OR_GOTO (this->name, inode, out); + + ret = dht_inode_ctx_get (inode, this, &ctx); + + if (ret) { + ctx = GF_CALLOC (1, sizeof (*ctx), gf_dht_mt_inode_ctx_t); + if (!ctx) + return -1; + } + + time = &ctx->time; + + DHT_UPDATE_TIME(time->mtime, time->mtime_nsec, + stat->ia_mtime, stat->ia_mtime_nsec, inode, post); + DHT_UPDATE_TIME(time->ctime, time->ctime_nsec, + stat->ia_ctime, stat->ia_ctime_nsec, inode, post); + DHT_UPDATE_TIME(time->atime, time->atime_nsec, + stat->ia_atime, stat->ia_atime_nsec, inode, post); + + ret = dht_inode_ctx_set (inode, this, ctx); +out: + return 0; +} + +int +dht_inode_ctx_get (inode_t *inode, xlator_t *this, dht_inode_ctx_t **ctx) +{ + int ret = -1; + uint64_t ctx_int = 0; + + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO (this->name, inode, out); + + ret = inode_ctx_get (inode, this, &ctx_int); + + if (ret) + return ret; + + if (ctx) + *ctx = (dht_inode_ctx_t *) ctx_int; +out: + return ret; +} + +int dht_inode_ctx_set (inode_t *inode, xlator_t *this, dht_inode_ctx_t *ctx) +{ + int ret = -1; + uint64_t ctx_int = 0; + + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO (this->name, inode, out); + GF_VALIDATE_OR_GOTO (this->name, ctx, out); + + ctx_int = (long)ctx; + ret = inode_ctx_set (inode, this, &ctx_int); +out: + return ret; +} diff --git a/xlators/cluster/dht/src/dht-inode-read.c b/xlators/cluster/dht/src/dht-inode-read.c index 1e9f54bda..ece84151a 100644 --- a/xlators/cluster/dht/src/dht-inode-read.c +++ 
b/xlators/cluster/dht/src/dht-inode-read.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ #ifndef _CONFIG_H @@ -34,7 +25,7 @@ int dht_fsync2 (xlator_t *this, call_frame_t *frame, int ret); int dht_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, fd_t *fd) + int op_ret, int op_errno, fd_t *fd, dict_t *xdata) { dht_local_t *local = NULL; call_frame_t *prev = NULL; @@ -61,7 +52,7 @@ dht_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, return 0; out: - DHT_STACK_UNWIND (open, frame, op_ret, op_errno, local->fd); + DHT_STACK_UNWIND (open, frame, op_ret, op_errno, local->fd, xdata); return 0; } @@ -86,17 +77,17 @@ dht_open2 (xlator_t *this, call_frame_t *frame, int op_ret) STACK_WIND (frame, dht_open_cbk, subvol, subvol->fops->open, &local->loc, local->rebalance.flags, local->fd, - local->rebalance.wbflags); + NULL); return 0; out: - DHT_STACK_UNWIND (stat, frame, -1, op_errno, NULL); + DHT_STACK_UNWIND (stat, frame, -1, op_errno, NULL, NULL); return 0; } int dht_open (call_frame_t *frame, xlator_t *this, - loc_t *loc, int flags, fd_t *fd, int wbflags) + loc_t *loc, int flags, fd_t *fd, dict_t *xdata) { xlator_t *subvol = NULL; int op_errno = -1; @@ -120,30 +111,30 @@ dht_open (call_frame_t *frame, xlator_t *this, goto err; } - local->rebalance.wbflags = wbflags; local->rebalance.flags = flags; local->call_cnt = 1; STACK_WIND (frame, dht_open_cbk, subvol, subvol->fops->open, - loc, flags, fd, wbflags); + loc, flags, fd, xdata); return 0; err: op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (open, frame, -1, op_errno, NULL); + DHT_STACK_UNWIND (open, frame, -1, op_errno, NULL, NULL); return 0; } int dht_file_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct iatt *stbuf) + int op_ret, int op_errno, struct iatt *stbuf, dict_t *xdata) { - uint64_t tmp_subvol = 0; + xlator_t *subvol = 0; dht_local_t *local = NULL; call_frame_t *prev = NULL; int ret = -1; + inode_t *inode = NULL; GF_VALIDATE_OR_GOTO ("dht", frame, err); GF_VALIDATE_OR_GOTO ("dht", this, out); @@ -164,27 +155,28 @@ dht_file_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (local->call_cnt != 1) goto out; + local->op_errno = op_errno; /* Check if the rebalance phase2 is true */ - if ((op_ret == -1) || (IA_ISREG (stbuf->ia_type) && - ((st_mode_from_ia (stbuf->ia_prot, stbuf->ia_type) & - ~S_IFMT) == DHT_LINKFILE_MODE))) { - if (local->fd) - ret = fd_ctx_get (local->fd, this, &tmp_subvol); - if (ret) { + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (stbuf)) { + inode = (local->fd) ? local->fd->inode : local->loc.inode; + ret = dht_inode_ctx_get1 (this, inode, &subvol); + if (!subvol) { /* Phase 2 of migration */ local->rebalance.target_op_fn = dht_attr2; ret = dht_rebalance_complete_check (this, frame); + if (!ret) + return 0; } else { /* value is already set in fd_ctx, that means no need to check for whether its complete or not. 
*/ dht_attr2 (this, frame, 0); - ret = 0; + return 0; } - if (!ret) - goto err; } + out: - DHT_STACK_UNWIND (stat, frame, op_ret, op_errno, stbuf); + DHT_STRIP_PHASE1_FLAGS (stbuf); + DHT_STACK_UNWIND (stat, frame, op_ret, op_errno, stbuf, xdata); err: return 0; } @@ -209,21 +201,21 @@ dht_attr2 (xlator_t *this, call_frame_t *frame, int op_ret) if (local->fop == GF_FOP_FSTAT) { STACK_WIND (frame, dht_file_attr_cbk, subvol, - subvol->fops->fstat, local->fd); + subvol->fops->fstat, local->fd, NULL); } else { STACK_WIND (frame, dht_file_attr_cbk, subvol, - subvol->fops->stat, &local->loc); + subvol->fops->stat, &local->loc, NULL); } return 0; out: - DHT_STACK_UNWIND (stat, frame, -1, op_errno, NULL); + DHT_STACK_UNWIND (stat, frame, -1, op_errno, NULL, NULL); return 0; } int dht_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct iatt *stbuf) + int op_ret, int op_errno, struct iatt *stbuf, dict_t *xdata) { dht_local_t *local = NULL; int this_call_cnt = 0; @@ -258,20 +250,21 @@ out: this_call_cnt = dht_frame_return (frame); if (is_last_call (this_call_cnt)) { DHT_STACK_UNWIND (stat, frame, local->op_ret, local->op_errno, - &local->stbuf); + &local->stbuf, xdata); } err: return 0; } int -dht_stat (call_frame_t *frame, xlator_t *this, loc_t *loc) +dht_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { xlator_t *subvol = NULL; int op_errno = -1; dht_local_t *local = NULL; dht_layout_t *layout = NULL; int i = 0; + int call_cnt = 0; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); @@ -300,39 +293,40 @@ dht_stat (call_frame_t *frame, xlator_t *this, loc_t *loc) subvol = local->cached_subvol; STACK_WIND (frame, dht_file_attr_cbk, subvol, - subvol->fops->stat, loc); + subvol->fops->stat, loc, xdata); return 0; } - local->call_cnt = layout->cnt; + local->call_cnt = call_cnt = layout->cnt; - for (i = 0; i < layout->cnt; i++) { + for (i = 0; i < call_cnt; i++) { subvol = layout->list[i].xlator; 
STACK_WIND (frame, dht_attr_cbk, subvol, subvol->fops->stat, - loc); + loc, xdata); } return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (stat, frame, -1, op_errno, NULL); + DHT_STACK_UNWIND (stat, frame, -1, op_errno, NULL, NULL); return 0; } int -dht_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd) +dht_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { xlator_t *subvol = NULL; int op_errno = -1; dht_local_t *local = NULL; dht_layout_t *layout = NULL; int i = 0; + int call_cnt = 0; VALIDATE_OR_GOTO (frame, err); @@ -359,25 +353,25 @@ dht_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd) subvol = local->cached_subvol; STACK_WIND (frame, dht_file_attr_cbk, subvol, - subvol->fops->fstat, fd); + subvol->fops->fstat, fd, xdata); return 0; } - local->call_cnt = layout->cnt; + local->call_cnt = call_cnt = layout->cnt; - for (i = 0; i < layout->cnt; i++) { + for (i = 0; i < call_cnt; i++) { subvol = layout->list[i].xlator; STACK_WIND (frame, dht_attr_cbk, subvol, subvol->fops->fstat, - fd); + fd, xdata); } return 0; err: op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (fstat, frame, -1, op_errno, NULL); + DHT_STACK_UNWIND (fstat, frame, -1, op_errno, NULL, NULL); return 0; } @@ -386,10 +380,12 @@ int dht_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, struct iovec *vector, int count, struct iatt *stbuf, - struct iobref *iobref) + struct iobref *iobref, dict_t *xdata) { dht_local_t *local = NULL; int ret = 0; + inode_t *inode = NULL; + xlator_t *subvol = 0; local = frame->local; if (!local) { @@ -405,25 +401,27 @@ dht_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if ((op_ret == -1) && (op_errno != ENOENT)) goto out; - if ((op_ret == -1) || ((st_mode_from_ia (stbuf->ia_prot, stbuf->ia_type) & - ~S_IFMT) == DHT_LINKFILE_MODE)) { + local->op_errno = op_errno; + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (stbuf)) { /* File would be migrated to other node */ - ret = fd_ctx_get (local->fd, this, NULL); - if (ret) { + ret = dht_inode_ctx_get1 (this, inode, &subvol); + if (!subvol) { local->rebalance.target_op_fn = dht_readv2; ret = dht_rebalance_complete_check (this, frame); + if (!ret) + return 0; } else { /* value is already set in fd_ctx, that means no need to check for whether its complete or not. 
*/ dht_readv2 (this, frame, 0); - } - if (!ret) return 0; + } } out: + DHT_STRIP_PHASE1_FLAGS (stbuf); DHT_STACK_UNWIND (readv, frame, op_ret, op_errno, vector, count, stbuf, - iobref); + iobref, xdata); return 0; } @@ -447,18 +445,19 @@ dht_readv2 (xlator_t *this, call_frame_t *frame, int op_ret) subvol = local->cached_subvol; STACK_WIND (frame, dht_readv_cbk, subvol, subvol->fops->readv, - local->fd, local->rebalance.size, local->rebalance.offset); + local->fd, local->rebalance.size, local->rebalance.offset, + local->rebalance.flags, NULL); return 0; out: - DHT_STACK_UNWIND (readv, frame, -1, op_errno, NULL, 0, NULL, NULL); + DHT_STACK_UNWIND (readv, frame, -1, op_errno, NULL, 0, NULL, NULL, NULL); return 0; } int dht_readv (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t off) + fd_t *fd, size_t size, off_t off, uint32_t flags, dict_t *xdata) { xlator_t *subvol = NULL; int op_errno = -1; @@ -484,35 +483,57 @@ dht_readv (call_frame_t *frame, xlator_t *this, local->rebalance.offset = off; local->rebalance.size = size; + local->rebalance.flags = flags; local->call_cnt = 1; STACK_WIND (frame, dht_readv_cbk, subvol, subvol->fops->readv, - fd, size, off); + fd, size, off, flags, xdata); return 0; err: op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (readv, frame, -1, op_errno, NULL, 0, NULL, NULL); + DHT_STACK_UNWIND (readv, frame, -1, op_errno, NULL, 0, NULL, NULL, NULL); return 0; } int dht_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno) + int op_ret, int op_errno, dict_t *xdata) { int ret = -1; dht_local_t *local = NULL; + xlator_t *subvol = NULL; + call_frame_t *prev = NULL; local = frame->local; + prev = cookie; + if (!prev || !prev->this) + goto out; if (local->call_cnt != 1) goto out; + if ((op_ret == -1) && (op_errno == ENOTCONN) && + IA_ISDIR(local->loc.inode->ia_type)) { + + subvol = dht_subvol_next_available (this, prev->this); + if (!subvol) + goto out; + + /* check if we are done with visiting every node */ + if (subvol == local->cached_subvol) { + goto out; + } + STACK_WIND (frame, dht_access_cbk, subvol, subvol->fops->access, + &local->loc, local->rebalance.flags, NULL); + return 0; + } if ((op_ret == -1) && (op_errno == ENOENT)) { /* File would be migrated to other node */ + local->op_errno = op_errno; local->rebalance.target_op_fn = dht_access2; ret = dht_rebalance_complete_check (frame->this, frame); if (!ret) @@ -520,7 +541,7 @@ dht_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } out: - DHT_STACK_UNWIND (access, frame, op_ret, op_errno); + DHT_STACK_UNWIND (access, frame, op_ret, op_errno, xdata); return 0; } @@ -543,18 +564,19 @@ dht_access2 (xlator_t *this, call_frame_t *frame, int op_ret) subvol = local->cached_subvol; STACK_WIND (frame, dht_access_cbk, subvol, subvol->fops->access, - &local->loc, local->rebalance.flags); + &local->loc, local->rebalance.flags, NULL); return 0; out: - DHT_STACK_UNWIND (access, frame, -1, op_errno); + DHT_STACK_UNWIND (access, frame, -1, op_errno, NULL); return 0; } int -dht_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask) +dht_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask, + dict_t *xdata) { xlator_t *subvol = 
NULL; int op_errno = -1; @@ -583,13 +605,13 @@ dht_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask) } STACK_WIND (frame, dht_access_cbk, subvol, subvol->fops->access, - loc, mask); + loc, mask, xdata); return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (access, frame, -1, op_errno); + DHT_STACK_UNWIND (access, frame, -1, op_errno, NULL); return 0; } @@ -597,10 +619,11 @@ err: int dht_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno) + int op_ret, int op_errno, dict_t *xdata) { - dht_local_t *local = NULL; - int ret = -1; + dht_local_t *local = NULL; + inode_t *inode = NULL; + xlator_t *subvol = 0; local = frame->local; @@ -610,14 +633,14 @@ dht_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto out; /* If context is set, then send flush() it to the destination */ - ret = fd_ctx_get (local->fd, this, NULL); - if (!ret) { + dht_inode_ctx_get1 (this, inode, &subvol); + if (subvol) { dht_flush2 (this, frame, 0); return 0; } out: - DHT_STACK_UNWIND (flush, frame, op_ret, op_errno); + DHT_STACK_UNWIND (flush, frame, op_ret, op_errno, xdata); return 0; } @@ -627,14 +650,10 @@ dht_flush2 (xlator_t *this, call_frame_t *frame, int op_ret) { dht_local_t *local = NULL; xlator_t *subvol = NULL; - uint64_t tmp_subvol = 0; - int ret = -1; local = frame->local; - ret = fd_ctx_get (local->fd, this, &tmp_subvol); - if (!ret) - subvol = (xlator_t *)(long)tmp_subvol; + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); if (!subvol) subvol = local->cached_subvol; @@ -642,20 +661,19 @@ dht_flush2 (xlator_t *this, call_frame_t *frame, int op_ret) local->call_cnt = 2; /* This is the second attempt */ STACK_WIND (frame, dht_flush_cbk, - subvol, subvol->fops->flush, local->fd); + subvol, subvol->fops->flush, local->fd, NULL); return 0; } int -dht_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) +dht_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { 
xlator_t *subvol = NULL; int op_errno = -1; dht_local_t *local = NULL; - VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (fd, err); @@ -677,13 +695,13 @@ dht_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) local->call_cnt = 1; STACK_WIND (frame, dht_flush_cbk, - subvol, subvol->fops->flush, fd); + subvol, subvol->fops->flush, fd, xdata); return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (flush, frame, -1, op_errno); + DHT_STACK_UNWIND (flush, frame, -1, op_errno, NULL); return 0; } @@ -691,53 +709,63 @@ err: int dht_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, - int op_errno, struct iatt *prebuf, struct iatt *postbuf) + int op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) { dht_local_t *local = NULL; call_frame_t *prev = NULL; int ret = -1; + inode_t *inode = NULL; + xlator_t *subvol = 0; local = frame->local; prev = cookie; local->op_errno = op_errno; - if (op_ret == -1) { + if (op_ret == -1 && (op_errno != ENOENT)) { gf_log (this->name, GF_LOG_DEBUG, "subvolume %s returned -1 (%s)", prev->this->name, strerror (op_errno)); goto out; } - if (local->call_cnt != 1) + if (local->call_cnt != 1) { + if (local->stbuf.ia_blocks) { + dht_iatt_merge (this, postbuf, &local->stbuf, NULL); + dht_iatt_merge (this, prebuf, &local->prebuf, NULL); + } goto out; + } - ret = fd_ctx_get (local->fd, this, NULL); - if (ret) { + local->op_errno = op_errno; + dht_inode_ctx_get1 (this, inode, &subvol); + if (!subvol) { local->rebalance.target_op_fn = dht_fsync2; /* Check if the rebalance phase1 is true */ - if (IA_ISREG (postbuf->ia_type) && - (postbuf->ia_prot.sticky == 1) && - (postbuf->ia_prot.sgid == 1)) { + if (IS_DHT_MIGRATION_PHASE1 (postbuf)) { + dht_iatt_merge (this, &local->stbuf, postbuf, NULL); + dht_iatt_merge (this, &local->prebuf, prebuf, NULL); + ret = dht_rebalance_in_progress_check (this, frame); } /* Check if the rebalance phase2 is true */ - if 
(IA_ISREG (postbuf->ia_type) && - ((st_mode_from_ia (postbuf->ia_prot, postbuf->ia_type) & - ~S_IFMT) == DHT_LINKFILE_MODE)) { + if (IS_DHT_MIGRATION_PHASE2 (postbuf)) { ret = dht_rebalance_complete_check (this, frame); } + if (!ret) + return 0; } else { dht_fsync2 (this, frame, 0); - ret = 0; - } - if (!ret) return 0; + } out: + DHT_STRIP_PHASE1_FLAGS (postbuf); + DHT_STRIP_PHASE1_FLAGS (prebuf); DHT_STACK_UNWIND (fsync, frame, op_ret, op_errno, - prebuf, postbuf); + prebuf, postbuf, xdata); return 0; } @@ -747,34 +775,29 @@ dht_fsync2 (xlator_t *this, call_frame_t *frame, int op_ret) { dht_local_t *local = NULL; xlator_t *subvol = NULL; - uint64_t tmp_subvol = 0; - int ret = -1; local = frame->local; - ret = fd_ctx_get (local->fd, this, &tmp_subvol); - if (!ret) - subvol = (xlator_t *)(long)tmp_subvol; - + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); if (!subvol) subvol = local->cached_subvol; local->call_cnt = 2; /* This is the second attempt */ STACK_WIND (frame, dht_fsync_cbk, subvol, subvol->fops->fsync, - local->fd, local->rebalance.flags); + local->fd, local->rebalance.flags, NULL); return 0; } int -dht_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync) +dht_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync, + dict_t *xdata) { xlator_t *subvol = NULL; int op_errno = -1; dht_local_t *local = NULL; - VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (fd, err); @@ -792,13 +815,13 @@ dht_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync) subvol = local->cached_subvol; STACK_WIND (frame, dht_fsync_cbk, subvol, subvol->fops->fsync, - fd, datasync); + fd, datasync, xdata); return 0; err: op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL); + DHT_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL, NULL); return 0; } @@ -809,9 +832,9 @@ err: phase 2 of migration */ int dht_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct gf_flock *flock) + int op_ret, int op_errno, struct gf_flock *flock, dict_t *xdata) { - DHT_STACK_UNWIND (lk, frame, op_ret, op_errno, flock); + DHT_STACK_UNWIND (lk, frame, op_ret, op_errno, flock, xdata); return 0; } @@ -819,7 +842,7 @@ dht_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int dht_lk (call_frame_t *frame, xlator_t *this, - fd_t *fd, int cmd, struct gf_flock *flock) + fd_t *fd, int cmd, struct gf_flock *flock, dict_t *xdata) { xlator_t *subvol = NULL; int op_errno = -1; @@ -839,13 +862,13 @@ dht_lk (call_frame_t *frame, xlator_t *this, /* TODO: for rebalance, we need to preserve the fop arguments */ STACK_WIND (frame, dht_lk_cbk, subvol, subvol->fops->lk, fd, - cmd, flock); + cmd, flock, xdata); return 0; err: op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (lk, frame, -1, op_errno, NULL); + DHT_STACK_UNWIND (lk, frame, -1, op_errno, NULL, NULL); return 0; } @@ -853,7 +876,8 @@ err: /* Symlinks are currently not migrated, so no need for any check here */ int dht_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, const char *path, struct iatt *sbuf) + int op_ret, int op_errno, const char *path, + struct iatt *stbuf, dict_t *xdata) { dht_local_t *local = NULL; @@ -867,14 +891,16 @@ dht_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } err: - DHT_STACK_UNWIND (readlink, frame, op_ret, op_errno, path, sbuf); + DHT_STRIP_PHASE1_FLAGS (stbuf); + DHT_STACK_UNWIND (readlink, frame, op_ret, op_errno, path, stbuf, xdata); return 0; } int -dht_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size) +dht_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size, + dict_t *xdata) { xlator_t *subvol = NULL; int op_errno = -1; @@ -902,13 +928,13 @@ dht_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size) STACK_WIND (frame, dht_readlink_cbk, subvol, subvol->fops->readlink, - loc, size); + loc, size, xdata); return 0; err: op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (readlink, frame, -1, op_errno, NULL, NULL); + DHT_STACK_UNWIND (readlink, frame, -1, op_errno, NULL, NULL, NULL); return 0; } @@ -919,16 +945,16 @@ err: int dht_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *dict) + int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) { - DHT_STACK_UNWIND (xattrop, frame, op_ret, op_errno, dict); + DHT_STACK_UNWIND (xattrop, frame, op_ret, op_errno, dict, xdata); return 0; } int dht_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, - gf_xattrop_flags_t flags, dict_t *dict) + gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata) { xlator_t *subvol = NULL; int op_errno = -1; @@ -959,13 +985,13 @@ dht_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, STACK_WIND (frame, dht_xattrop_cbk, subvol, subvol->fops->xattrop, - loc, flags, dict); + loc, flags, dict, xdata); return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (xattrop, frame, -1, op_errno, NULL); + DHT_STACK_UNWIND (xattrop, frame, -1, op_errno, NULL, NULL); return 0; } @@ -973,16 +999,16 @@ err: int dht_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *dict) + int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) { - DHT_STACK_UNWIND (fxattrop, frame, op_ret, op_errno, dict); + DHT_STACK_UNWIND (fxattrop, frame, op_ret, op_errno, dict, xdata); return 0; } int dht_fxattrop (call_frame_t *frame, xlator_t *this, - fd_t *fd, gf_xattrop_flags_t flags, dict_t *dict) + fd_t *fd, gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata) { xlator_t *subvol = NULL; int op_errno = -1; @@ -1002,13 +1028,13 @@ dht_fxattrop (call_frame_t *frame, xlator_t *this, STACK_WIND (frame, dht_fxattrop_cbk, subvol, subvol->fops->fxattrop, - fd, flags, dict); + fd, flags, dict, xdata); return 0; err: op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (fxattrop, frame, -1, op_errno, NULL); + DHT_STACK_UNWIND (fxattrop, frame, -1, op_errno, NULL, NULL); return 0; } @@ -1016,17 +1042,17 @@ err: int dht_inodelk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) + xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - DHT_STACK_UNWIND (inodelk, frame, op_ret, op_errno); + DHT_STACK_UNWIND (inodelk, frame, op_ret, op_errno, xdata); return 0; } int32_t -dht_inodelk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, int32_t cmd, struct gf_flock *lock) +dht_inodelk (call_frame_t *frame, xlator_t *this, const char *volume, + loc_t *loc, int32_t cmd, struct gf_flock *lock, dict_t *xdata) { xlator_t *subvol = NULL; int op_errno = -1; @@ -1058,31 +1084,31 @@ dht_inodelk (call_frame_t *frame, xlator_t *this, STACK_WIND (frame, dht_inodelk_cbk, subvol, subvol->fops->inodelk, - volume, loc, cmd, lock); + volume, loc, cmd, lock, xdata); return 0; err: op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (inodelk, frame, -1, op_errno); + DHT_STACK_UNWIND (inodelk, frame, -1, op_errno, NULL); return 0; } int -dht_finodelk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) +dht_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - DHT_STACK_UNWIND (finodelk, frame, op_ret, op_errno); + DHT_STACK_UNWIND (finodelk, frame, op_ret, op_errno, xdata); return 0; } int -dht_finodelk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *lock) +dht_finodelk (call_frame_t *frame, xlator_t *this, const char *volume, + fd_t *fd, int32_t cmd, struct gf_flock *lock, dict_t *xdata) { xlator_t *subvol = NULL; int op_errno = -1; @@ -1100,16 +1126,14 @@ dht_finodelk (call_frame_t *frame, xlator_t *this, } - STACK_WIND (frame, - dht_finodelk_cbk, - subvol, subvol->fops->finodelk, - volume, fd, cmd, lock); + STACK_WIND (frame, dht_finodelk_cbk, subvol, subvol->fops->finodelk, + volume, fd, cmd, lock, xdata); return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (finodelk, frame, -1, op_errno); + DHT_STACK_UNWIND (finodelk, frame, -1, op_errno, NULL); return 0; } diff --git a/xlators/cluster/dht/src/dht-inode-write.c b/xlators/cluster/dht/src/dht-inode-write.c index 21eca6117..4b3f3a049 100644 --- a/xlators/cluster/dht/src/dht-inode-write.c +++ b/xlators/cluster/dht/src/dht-inode-write.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. 
- - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ @@ -28,16 +19,20 @@ int dht_writev2 (xlator_t *this, call_frame_t *frame, int ret); int dht_truncate2 (xlator_t *this, call_frame_t *frame, int ret); int dht_setattr2 (xlator_t *this, call_frame_t *frame, int ret); +int dht_fallocate2(xlator_t *this, call_frame_t *frame, int op_ret); +int dht_discard2(xlator_t *this, call_frame_t *frame, int op_ret); +int dht_zerofill2(xlator_t *this, call_frame_t *frame, int op_ret); int dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, struct iatt *prebuf, - struct iatt *postbuf) + struct iatt *postbuf, dict_t *xdata) { dht_local_t *local = NULL; int ret = -1; + xlator_t *subvol = NULL; - if (op_ret == -1) { + if (op_ret == -1 && (op_errno != ENOENT)) { goto out; } @@ -48,25 +43,32 @@ dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto out; } - if (local->call_cnt != 1) + if (local->call_cnt != 1) { + /* preserve the modes of source */ + if (local->stbuf.ia_blocks) { + dht_iatt_merge (this, postbuf, &local->stbuf, NULL); + dht_iatt_merge (this, prebuf, &local->prebuf, NULL); + } goto out; + } local->rebalance.target_op_fn = dht_writev2; + local->op_errno = op_errno; /* Phase 2 of migration */ - if (IA_ISREG (postbuf->ia_type) && - ((st_mode_from_ia (postbuf->ia_prot, postbuf->ia_type) & - ~S_IFMT) == DHT_LINKFILE_MODE)) { + if 
(IS_DHT_MIGRATION_PHASE2 (postbuf)) { ret = dht_rebalance_complete_check (this, frame); if (!ret) return 0; } /* Check if the rebalance phase1 is true */ - if (IA_ISREG (postbuf->ia_type) && (postbuf->ia_prot.sticky == 1) && - (postbuf->ia_prot.sgid == 1)) { - ret = fd_ctx_get (local->fd, this, NULL); - if (!ret) { + if (IS_DHT_MIGRATION_PHASE1 (postbuf)) { + dht_iatt_merge (this, &local->stbuf, postbuf, NULL); + dht_iatt_merge (this, &local->prebuf, prebuf, NULL); + + ret = dht_inode_ctx_get1 (this, local->fd->inode, &subvol); + if (subvol) { dht_writev2 (this, frame, 0); return 0; } @@ -76,7 +78,11 @@ dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } out: - DHT_STACK_UNWIND (writev, frame, op_ret, op_errno, prebuf, postbuf); + DHT_STRIP_PHASE1_FLAGS (postbuf); + DHT_STRIP_PHASE1_FLAGS (prebuf); + + DHT_STACK_UNWIND (writev, frame, op_ret, op_errno, prebuf, postbuf, + xdata); return 0; } @@ -86,14 +92,10 @@ dht_writev2 (xlator_t *this, call_frame_t *frame, int op_ret) { dht_local_t *local = NULL; xlator_t *subvol = NULL; - uint64_t tmp_subvol = 0; - int ret = -1; local = frame->local; - ret = fd_ctx_get (local->fd, this, &tmp_subvol); - if (!ret) - subvol = (xlator_t *)(long)tmp_subvol; + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); if (!subvol) subvol = local->cached_subvol; @@ -103,15 +105,16 @@ dht_writev2 (xlator_t *this, call_frame_t *frame, int op_ret) STACK_WIND (frame, dht_writev_cbk, subvol, subvol->fops->writev, local->fd, local->rebalance.vector, local->rebalance.count, - local->rebalance.offset, local->rebalance.iobref); + local->rebalance.offset, local->rebalance.flags, + local->rebalance.iobref, NULL); return 0; } int -dht_writev (call_frame_t *frame, xlator_t *this, - fd_t *fd, struct iovec *vector, int count, off_t off, - struct iobref *iobref) +dht_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int count, off_t off, uint32_t flags, + struct iobref *iobref, dict_t *xdata) { xlator_t 
*subvol = NULL; int op_errno = -1; @@ -140,18 +143,19 @@ dht_writev (call_frame_t *frame, xlator_t *this, local->rebalance.vector = iov_dup (vector, count); local->rebalance.offset = off; local->rebalance.count = count; + local->rebalance.flags = flags; local->rebalance.iobref = iobref_ref (iobref); local->call_cnt = 1; STACK_WIND (frame, dht_writev_cbk, subvol, subvol->fops->writev, - fd, vector, count, off, iobref); + fd, vector, count, off, flags, iobref, xdata); return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL); + DHT_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL); return 0; } @@ -161,11 +165,13 @@ err: int dht_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, struct iatt *prebuf, - struct iatt *postbuf) + struct iatt *postbuf, dict_t *xdata) { dht_local_t *local = NULL; call_frame_t *prev = NULL; int ret = -1; + xlator_t *subvol = NULL; + inode_t *inode = NULL; GF_VALIDATE_OR_GOTO ("dht", frame, err); GF_VALIDATE_OR_GOTO ("dht", this, out); @@ -185,36 +191,44 @@ dht_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto out; } - if (local->call_cnt != 1) + if (local->call_cnt != 1) { + if (local->stbuf.ia_blocks) { + dht_iatt_merge (this, postbuf, &local->stbuf, NULL); + dht_iatt_merge (this, prebuf, &local->prebuf, NULL); + } goto out; + } local->rebalance.target_op_fn = dht_truncate2; + local->op_errno = op_errno; /* Phase 2 of migration */ - if ((op_ret == -1) || (IA_ISREG (postbuf->ia_type) && - ((st_mode_from_ia (postbuf->ia_prot, postbuf->ia_type) & - ~S_IFMT) == DHT_LINKFILE_MODE))) { + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) { ret = dht_rebalance_complete_check (this, frame); if (!ret) - goto err; + return 0; } /* Check if the rebalance phase1 is true */ - if (IA_ISREG (postbuf->ia_type) && (postbuf->ia_prot.sticky == 1) && - (postbuf->ia_prot.sgid == 1)) { - ret = fd_ctx_get (local->fd, this, 
NULL); - if (!ret) { + if (IS_DHT_MIGRATION_PHASE1 (postbuf)) { + dht_iatt_merge (this, &local->stbuf, postbuf, NULL); + dht_iatt_merge (this, &local->prebuf, prebuf, NULL); + inode = (local->fd) ? local->fd->inode : local->loc.inode; + dht_inode_ctx_get1 (this, inode, &subvol); + if (subvol) { dht_truncate2 (this, frame, 0); - goto err; + return 0; } ret = dht_rebalance_in_progress_check (this, frame); if (!ret) - goto err; + return 0; } out: + DHT_STRIP_PHASE1_FLAGS (postbuf); + DHT_STRIP_PHASE1_FLAGS (prebuf); DHT_STACK_UNWIND (truncate, frame, op_ret, op_errno, - prebuf, postbuf); + prebuf, postbuf, xdata); err: return 0; } @@ -225,16 +239,13 @@ dht_truncate2 (xlator_t *this, call_frame_t *frame, int op_ret) { dht_local_t *local = NULL; xlator_t *subvol = NULL; - uint64_t tmp_subvol = 0; - int ret = -1; + inode_t *inode = NULL; local = frame->local; - if (local->fd) - ret = fd_ctx_get (local->fd, this, &tmp_subvol); - if (!ret) - subvol = (xlator_t *)(long)tmp_subvol; + inode = local->fd ? 
local->fd->inode : local->loc.inode; + dht_inode_ctx_get1 (this, inode, &subvol); if (!subvol) subvol = local->cached_subvol; @@ -243,18 +254,19 @@ dht_truncate2 (xlator_t *this, call_frame_t *frame, int op_ret) if (local->fop == GF_FOP_TRUNCATE) { STACK_WIND (frame, dht_truncate_cbk, subvol, subvol->fops->truncate, &local->loc, - local->rebalance.offset); + local->rebalance.offset, NULL); } else { STACK_WIND (frame, dht_truncate_cbk, subvol, subvol->fops->ftruncate, local->fd, - local->rebalance.offset); + local->rebalance.offset, NULL); } return 0; } int -dht_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset) +dht_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) { xlator_t *subvol = NULL; int op_errno = -1; @@ -284,19 +296,20 @@ dht_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset) STACK_WIND (frame, dht_truncate_cbk, subvol, subvol->fops->truncate, - loc, offset); + loc, offset, xdata); return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL); + DHT_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL, NULL); return 0; } int -dht_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset) +dht_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) { xlator_t *subvol = NULL; int op_errno = -1; @@ -324,22 +337,423 @@ dht_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset) STACK_WIND (frame, dht_truncate_cbk, subvol, subvol->fops->ftruncate, - fd, offset); + fd, offset, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} + + +int +dht_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + int ret = -1; + xlator_t *subvol = NULL; + + GF_VALIDATE_OR_GOTO ("dht", frame, err); + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO ("dht", frame->local, out); + GF_VALIDATE_OR_GOTO ("dht", cookie, out); + + local = frame->local; + prev = cookie; + + if ((op_ret == -1) && (op_errno != ENOENT)) { + local->op_errno = op_errno; + local->op_ret = -1; + gf_log (this->name, GF_LOG_DEBUG, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + + goto out; + } + + if (local->call_cnt != 1) { + if (local->stbuf.ia_blocks) { + dht_iatt_merge (this, postbuf, &local->stbuf, NULL); + dht_iatt_merge (this, prebuf, &local->prebuf, NULL); + } + goto out; + } + local->rebalance.target_op_fn = dht_fallocate2; + + /* Phase 2 of migration */ + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) { + ret = dht_rebalance_complete_check (this, frame); + if (!ret) + return 0; + } + + /* Check if the rebalance phase1 is true */ + if (IS_DHT_MIGRATION_PHASE1 (postbuf)) { + dht_iatt_merge (this, &local->stbuf, postbuf, NULL); + dht_iatt_merge (this, &local->prebuf, prebuf, NULL); + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); + if (subvol) { + dht_fallocate2 (this, frame, 0); + return 0; + } + ret = dht_rebalance_in_progress_check (this, frame); + if (!ret) + return 0; + } + +out: + DHT_STRIP_PHASE1_FLAGS (postbuf); + DHT_STRIP_PHASE1_FLAGS (prebuf); + DHT_STACK_UNWIND (fallocate, frame, op_ret, op_errno, + prebuf, postbuf, xdata); +err: + return 0; +} + +int +dht_fallocate2(xlator_t *this, call_frame_t *frame, int op_ret) +{ + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + + local = frame->local; + 
+ dht_inode_ctx_get1 (this, local->fd->inode, &subvol); + + if (!subvol) + subvol = local->cached_subvol; + + local->call_cnt = 2; /* This is the second attempt */ + + STACK_WIND(frame, dht_fallocate_cbk, subvol, subvol->fops->fallocate, + local->fd, local->rebalance.flags, local->rebalance.offset, + local->rebalance.size, NULL); + + return 0; +} + +int +dht_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + local = dht_local_init (frame, NULL, fd, GF_FOP_FALLOCATE); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->rebalance.flags = mode; + local->rebalance.offset = offset; + local->rebalance.size = len; + + local->call_cnt = 1; + subvol = local->cached_subvol; + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, dht_fallocate_cbk, + subvol, subvol->fops->fallocate, + fd, mode, offset, len, xdata); return 0; err: op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL); + DHT_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL); return 0; } + +int +dht_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + int ret = -1; + xlator_t *subvol = NULL; + + GF_VALIDATE_OR_GOTO ("dht", frame, err); + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO ("dht", frame->local, out); + GF_VALIDATE_OR_GOTO ("dht", cookie, out); + + local = frame->local; + prev = cookie; + + if ((op_ret == -1) && (op_errno != ENOENT)) { + local->op_errno = op_errno; + local->op_ret = -1; + gf_log (this->name, GF_LOG_DEBUG, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + + goto out; + } + + if (local->call_cnt != 1) { + if (local->stbuf.ia_blocks) { + dht_iatt_merge (this, postbuf, &local->stbuf, NULL); + dht_iatt_merge (this, prebuf, &local->prebuf, NULL); + } + goto out; + } + local->rebalance.target_op_fn = dht_discard2; + + /* Phase 2 of migration */ + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) { + ret = dht_rebalance_complete_check (this, frame); + if (!ret) + return 0; + } + + /* Check if the rebalance phase1 is true */ + if (IS_DHT_MIGRATION_PHASE1 (postbuf)) { + dht_iatt_merge (this, &local->stbuf, postbuf, NULL); + dht_iatt_merge (this, &local->prebuf, prebuf, NULL); + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); + if (subvol) { + dht_discard2 (this, frame, 0); + return 0; + } + ret = dht_rebalance_in_progress_check (this, frame); + if (!ret) + return 0; + } + +out: + DHT_STRIP_PHASE1_FLAGS (postbuf); + DHT_STRIP_PHASE1_FLAGS (prebuf); + DHT_STACK_UNWIND (discard, frame, op_ret, op_errno, + prebuf, postbuf, xdata); +err: + return 0; +} + +int +dht_discard2(xlator_t *this, call_frame_t *frame, int op_ret) +{ + dht_local_t *local = NULL; + 
xlator_t *subvol = NULL; + + local = frame->local; + + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); + + if (!subvol) + subvol = local->cached_subvol; + + local->call_cnt = 2; /* This is the second attempt */ + + STACK_WIND(frame, dht_discard_cbk, subvol, subvol->fops->discard, + local->fd, local->rebalance.offset, local->rebalance.size, + NULL); + + return 0; +} + +int +dht_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + local = dht_local_init (frame, NULL, fd, GF_FOP_DISCARD); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->rebalance.offset = offset; + local->rebalance.size = len; + + local->call_cnt = 1; + subvol = local->cached_subvol; + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, dht_discard_cbk, subvol, subvol->fops->discard, + fd, offset, len, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} + +int +dht_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + int ret = -1; + + GF_VALIDATE_OR_GOTO ("dht", frame, err); + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO ("dht", frame->local, out); + GF_VALIDATE_OR_GOTO ("dht", cookie, out); + + local = frame->local; + prev = cookie; + + if ((op_ret == -1) && (op_errno != ENOENT)) { + local->op_errno = op_errno; + local->op_ret = -1; + gf_log (this->name, GF_LOG_DEBUG, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + goto out; + } + + if (local->call_cnt != 1) { + if (local->stbuf.ia_blocks) { + dht_iatt_merge (this, postbuf, &local->stbuf, NULL); + dht_iatt_merge (this, prebuf, &local->prebuf, NULL); + } + goto out; + } + local->rebalance.target_op_fn = dht_zerofill2; + /* Phase 2 of migration */ + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) { + ret = dht_rebalance_complete_check (this, frame); + if (!ret) + return 0; + } + + /* Check if the rebalance phase1 is true */ + if (IS_DHT_MIGRATION_PHASE1 (postbuf)) { + dht_iatt_merge (this, &local->stbuf, postbuf, NULL); + dht_iatt_merge (this, &local->prebuf, prebuf, NULL); + ret = fd_ctx_get (local->fd, this, NULL); + if (!ret) { + dht_zerofill2 (this, frame, 0); + return 0; + } + ret = dht_rebalance_in_progress_check (this, frame); + if (!ret) + return 0; + } + +out: + DHT_STRIP_PHASE1_FLAGS (postbuf); + DHT_STRIP_PHASE1_FLAGS (prebuf); + DHT_STACK_UNWIND (zerofill, frame, op_ret, op_errno, + prebuf, postbuf, xdata); +err: + return 0; +} + +int +dht_zerofill2(xlator_t *this, call_frame_t *frame, int op_ret) +{ + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + uint64_t tmp_subvol = 0; + int ret = -1; + + local = frame->local; + + if 
(local->fd) + ret = fd_ctx_get (local->fd, this, &tmp_subvol); + if (!ret) + subvol = (xlator_t *)(long)tmp_subvol; + + if (!subvol) + subvol = local->cached_subvol; + + local->call_cnt = 2; /* This is the second attempt */ + + STACK_WIND(frame, dht_zerofill_cbk, subvol, subvol->fops->zerofill, + local->fd, local->rebalance.offset, local->rebalance.size, + NULL); + + return 0; +} + +int +dht_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + local = dht_local_init (frame, NULL, fd, GF_FOP_ZEROFILL); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->rebalance.offset = offset; + local->rebalance.size = len; + + local->call_cnt = 1; + subvol = local->cached_subvol; + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, dht_zerofill_cbk, subvol, subvol->fops->zerofill, + fd, offset, len, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND (zerofill, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} + + + /* handle cases of migration here for 'setattr()' calls */ int dht_file_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, struct iatt *prebuf, - struct iatt *postbuf) + struct iatt *postbuf, dict_t *xdata) { dht_local_t *local = NULL; call_frame_t *prev = NULL; @@ -362,21 +776,21 @@ dht_file_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->rebalance.target_op_fn = dht_setattr2; /* Phase 2 of migration */ - if ((op_ret == -1) || (IA_ISREG (postbuf->ia_type) && - ((st_mode_from_ia (postbuf->ia_prot, postbuf->ia_type) & - ~S_IFMT) == DHT_LINKFILE_MODE))) { + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) { ret = dht_rebalance_complete_check (this, frame); if (!ret) - goto out; + return 0; } /* At the end of the migration process, whatever 'attr' we have on source file will be migrated to destination file in one shot, hence we don't need to check for in progress - state here */ + state here (ie, PHASE1) */ out: + DHT_STRIP_PHASE1_FLAGS (postbuf); + DHT_STRIP_PHASE1_FLAGS (prebuf); DHT_STACK_UNWIND (setattr, frame, op_ret, op_errno, - prebuf, postbuf); + prebuf, postbuf, xdata); return 0; } @@ -386,15 +800,13 @@ dht_setattr2 (xlator_t *this, call_frame_t *frame, int op_ret) { dht_local_t *local = NULL; xlator_t *subvol = NULL; - uint64_t tmp_subvol = 0; - int ret = -1; + inode_t *inode = NULL; local = frame->local; - if (local->fd) - ret = fd_ctx_get (local->fd, this, &tmp_subvol); - if (!ret) - subvol = (xlator_t *)(long)tmp_subvol; + inode = (local->fd) ? 
local->fd->inode : local->loc.inode; + + dht_inode_ctx_get1 (this, inode, &subvol); if (!subvol) subvol = local->cached_subvol; @@ -404,11 +816,13 @@ dht_setattr2 (xlator_t *this, call_frame_t *frame, int op_ret) if (local->fop == GF_FOP_SETATTR) { STACK_WIND (frame, dht_file_setattr_cbk, subvol, subvol->fops->setattr, &local->loc, - &local->rebalance.stbuf, local->rebalance.flags); + &local->rebalance.stbuf, local->rebalance.flags, + NULL); } else { STACK_WIND (frame, dht_file_setattr_cbk, subvol, subvol->fops->fsetattr, local->fd, - &local->rebalance.stbuf, local->rebalance.flags); + &local->rebalance.stbuf, local->rebalance.flags, + NULL); } return 0; @@ -419,7 +833,7 @@ dht_setattr2 (xlator_t *this, call_frame_t *frame, int op_ret) int dht_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, struct iatt *statpre, - struct iatt *statpost) + struct iatt *statpost, dict_t *xdata) { dht_local_t *local = NULL; int this_call_cnt = 0; @@ -450,7 +864,7 @@ unlock: this_call_cnt = dht_frame_return (frame); if (is_last_call (this_call_cnt)) DHT_STACK_UNWIND (setattr, frame, local->op_ret, local->op_errno, - &local->prebuf, &local->stbuf); + &local->prebuf, &local->stbuf, xdata); return 0; } @@ -458,13 +872,14 @@ unlock: int dht_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - struct iatt *stbuf, int32_t valid) + struct iatt *stbuf, int32_t valid, dict_t *xdata) { xlator_t *subvol = NULL; dht_layout_t *layout = NULL; dht_local_t *local = NULL; int op_errno = -1; int i = -1; + int call_cnt = 0; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); @@ -503,25 +918,25 @@ dht_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, STACK_WIND (frame, dht_file_setattr_cbk, subvol, subvol->fops->setattr, - loc, stbuf, valid); + loc, stbuf, valid, xdata); return 0; } - local->call_cnt = layout->cnt; + local->call_cnt = call_cnt = layout->cnt; - for (i = 0; i < layout->cnt; i++) { + for (i = 0; i < call_cnt; i++) { STACK_WIND 
(frame, dht_setattr_cbk, layout->list[i].xlator, layout->list[i].xlator->fops->setattr, - loc, stbuf, valid); + loc, stbuf, valid, xdata); } return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (setattr, frame, -1, op_errno, NULL, NULL); + DHT_STACK_UNWIND (setattr, frame, -1, op_errno, NULL, NULL, NULL); return 0; } @@ -529,13 +944,14 @@ err: int dht_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf, - int32_t valid) + int32_t valid, dict_t *xdata) { xlator_t *subvol = NULL; dht_layout_t *layout = NULL; dht_local_t *local = NULL; int op_errno = -1; int i = -1; + int call_cnt = 0; VALIDATE_OR_GOTO (frame, err); @@ -573,25 +989,25 @@ dht_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf, STACK_WIND (frame, dht_file_setattr_cbk, subvol, subvol->fops->fsetattr, - fd, stbuf, valid); + fd, stbuf, valid, xdata); return 0; } - local->call_cnt = layout->cnt; + local->call_cnt = call_cnt = layout->cnt; - for (i = 0; i < layout->cnt; i++) { + for (i = 0; i < call_cnt; i++) { STACK_WIND (frame, dht_setattr_cbk, layout->list[i].xlator, layout->list[i].xlator->fops->fsetattr, - fd, stbuf, valid); + fd, stbuf, valid, xdata); } return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (fsetattr, frame, -1, op_errno, NULL, NULL); + DHT_STACK_UNWIND (fsetattr, frame, -1, op_errno, NULL, NULL, NULL); return 0; } diff --git a/xlators/cluster/dht/src/dht-layout.c b/xlators/cluster/dht/src/dht-layout.c index 7e1f7afda..38e9970a7 100644 --- a/xlators/cluster/dht/src/dht-layout.c +++ b/xlators/cluster/dht/src/dht-layout.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. 
- GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef _CONFIG_H @@ -68,9 +59,7 @@ dht_layout_t * dht_layout_get (xlator_t *this, inode_t *inode) { dht_conf_t *conf = NULL; - uint64_t layout_int = 0; dht_layout_t *layout = NULL; - int ret = -1; conf = this->private; if (!conf) @@ -78,9 +67,8 @@ dht_layout_get (xlator_t *this, inode_t *inode) LOCK (&conf->layout_lock); { - ret = inode_ctx_get (inode, this, &layout_int); - if (ret == 0) { - layout = (dht_layout_t *) (unsigned long) layout_int; + dht_inode_ctx_layout_get (inode, this, &layout); + if (layout) { layout->ref++; } } @@ -98,7 +86,6 @@ dht_layout_set (xlator_t *this, inode_t *inode, dht_layout_t *layout) int oldret = -1; int ret = 0; dht_layout_t *old_layout; - uint64_t old_layout_int; conf = this->private; if (!conf) @@ -106,16 +93,13 @@ dht_layout_set (xlator_t *this, inode_t *inode, dht_layout_t *layout) LOCK (&conf->layout_lock); { - oldret = inode_ctx_get (inode, this, &old_layout_int); - + oldret = dht_inode_ctx_layout_get (inode, this, &old_layout); layout->ref++; - ret = inode_ctx_put (inode, this, (uint64_t) (unsigned long) - layout); + dht_inode_ctx_layout_set (inode, 
this, layout); } UNLOCK (&conf->layout_lock); - if (oldret == 0) { - old_layout = (dht_layout_t *) (unsigned long) old_layout_int; + if (!oldret) { dht_layout_unref (this, old_layout); } @@ -130,7 +114,7 @@ dht_layout_unref (xlator_t *this, dht_layout_t *layout) dht_conf_t *conf = NULL; int ref = 0; - if (layout->preset || !this->private) + if (!layout || layout->preset || !this->private) return; conf = this->private; @@ -174,9 +158,9 @@ dht_layout_search (xlator_t *this, dht_layout_t *layout, const char *name) int ret = 0; - ret = dht_hash_compute (layout->type, name, &hash); + ret = dht_hash_compute (this, layout->type, name, &hash); if (ret != 0) { - gf_log (this->name, GF_LOG_INFO, + gf_log (this->name, GF_LOG_WARNING, "hash computation failed for type=%d name=%s", layout->type, name); goto out; @@ -191,7 +175,7 @@ dht_layout_search (xlator_t *this, dht_layout_t *layout, const char *name) } if (!subvol) { - gf_log (this->name, GF_LOG_INFO, + gf_log (this->name, GF_LOG_WARNING, "no subvolume for hash (value) = %u", hash); } @@ -280,6 +264,9 @@ dht_disk_layout_extract (xlator_t *this, dht_layout_t *layout, if (disk_layout_p) *disk_layout_p = disk_layout; + else + GF_FREE (disk_layout); + ret = 0; out: @@ -289,7 +276,7 @@ out: int dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout, - int pos, void *disk_layout_raw) + int pos, void *disk_layout_raw, int disk_layout_len) { int cnt = 0; int type = 0; @@ -297,19 +284,38 @@ dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout, int stop_off = 0; int disk_layout[4]; - /* TODO: assert disk_layout_ptr is of required length */ + if (!disk_layout_raw) { + gf_log (this->name, GF_LOG_CRITICAL, + "error no layout on disk for merge"); + return -1; + } - memcpy (disk_layout, disk_layout_raw, sizeof (disk_layout)); + GF_ASSERT (disk_layout_len == sizeof (disk_layout)); + + memcpy (disk_layout, disk_layout_raw, disk_layout_len); cnt = ntoh32 (disk_layout[0]); if (cnt != 1) { - gf_log (this->name, GF_LOG_INFO, + 
gf_log (this->name, GF_LOG_ERROR, "disk layout has invalid count %d", cnt); return -1; } - /* TODO: assert type is compatible */ - type = ntoh32 (disk_layout[1]); + type = ntoh32 (disk_layout[1]); + switch (type) { + case DHT_HASH_TYPE_DM_USER: + gf_log (this->name, GF_LOG_DEBUG, "found user-set layout"); + layout->type = type; + /* Fall through. */ + case DHT_HASH_TYPE_DM: + break; + default: + gf_log (this->name, GF_LOG_CRITICAL, + "Catastrophic error layout with unknown type found %d", + disk_layout[1]); + return -1; + } + start_off = ntoh32 (disk_layout[2]); stop_off = ntoh32 (disk_layout[3]); @@ -329,11 +335,12 @@ int dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, int op_ret, int op_errno, dict_t *xattr) { - int i = 0; - int ret = -1; - int err = -1; - void *disk_layout_raw = NULL; - + int i = 0; + int ret = -1; + int err = -1; + void *disk_layout_raw = NULL; + int disk_layout_len = 0; + dht_conf_t *conf = this->private; if (op_ret != 0) { err = op_errno; @@ -354,12 +361,12 @@ dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, if (xattr) { /* during lookup and not mkdir */ - ret = dict_get_ptr (xattr, "trusted.glusterfs.dht", - &disk_layout_raw); + ret = dict_get_ptr_and_len (xattr, conf->xattr_name, + &disk_layout_raw, &disk_layout_len); } if (ret != 0) { - layout->list[i].err = -1; + layout->list[i].err = 0; gf_log (this->name, GF_LOG_TRACE, "missing disk layout on %s. 
err = %d", subvol->name, err); @@ -367,9 +374,10 @@ dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, goto out; } - ret = dht_disk_layout_merge (this, layout, i, disk_layout_raw); + ret = dht_disk_layout_merge (this, layout, i, disk_layout_raw, + disk_layout_len); if (ret != 0) { - gf_log (this->name, GF_LOG_DEBUG, + gf_log (this->name, GF_LOG_WARNING, "layout merge from subvolume %s failed", subvol->name); goto out; @@ -405,6 +413,22 @@ dht_layout_entry_swap (dht_layout_t *layout, int i, int j) layout->list[j].err = err_swap; } +void +dht_layout_range_swap (dht_layout_t *layout, int i, int j) +{ + uint32_t start_swap = 0; + uint32_t stop_swap = 0; + + start_swap = layout->list[i].start; + stop_swap = layout->list[i].stop; + + layout->list[i].start = layout->list[j].start; + layout->list[i].stop = layout->list[j].stop; + + layout->list[j].start = start_swap; + layout->list[j].stop = stop_swap; +} + int64_t dht_layout_entry_cmp_volname (dht_layout_t *layout, int i, int j) { @@ -412,17 +436,37 @@ dht_layout_entry_cmp_volname (dht_layout_t *layout, int i, int j) layout->list[j].xlator->name)); } + +gf_boolean_t +dht_is_subvol_in_layout (dht_layout_t *layout, xlator_t *xlator) +{ + int i = 0; + + for (i = 0; i < layout->cnt; i++) { + if (!strcmp (layout->list[i].xlator->name, xlator->name)) + return _gf_true; + } + return _gf_false; +} + int64_t dht_layout_entry_cmp (dht_layout_t *layout, int i, int j) { int64_t diff = 0; + /* swap zero'ed out layouts to front, if needed */ + if (!layout->list[j].start && !layout->list[j].stop) { + diff = (int64_t) layout->list[i].stop + - (int64_t) layout->list[j].stop; + goto out; + } if (layout->list[i].err || layout->list[j].err) diff = layout->list[i].err - layout->list[j].err; else diff = (int64_t) layout->list[i].start - (int64_t) layout->list[j].start; +out: return diff; } @@ -471,7 +515,8 @@ dht_layout_sort_volname (dht_layout_t *layout) int dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t 
*layout, uint32_t *holes_p, uint32_t *overlaps_p, - uint32_t *missing_p, uint32_t *down_p, uint32_t *misc_p) + uint32_t *missing_p, uint32_t *down_p, uint32_t *misc_p, + uint32_t *no_space_p) { uint32_t overlaps = 0; uint32_t missing = 0; @@ -484,30 +529,38 @@ dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout, uint32_t prev_stop = 0; uint32_t last_stop = 0; char is_virgin = 1; + uint32_t no_space = 0; - /* TODO: explain WTF is happening */ + /* TODO: explain what is happening */ last_stop = layout->list[0].start - 1; prev_stop = last_stop; for (i = 0; i < layout->cnt; i++) { - if (layout->list[i].err) { - switch (layout->list[i].err) { - case -1: - case ENOENT: - missing++; - break; - case ENOTCONN: - down++; - break; - case ENOSPC: - down++; - break; - default: - misc++; + switch (layout->list[i].err) { + case -1: + case ENOENT: + missing++; + continue; + case ENOTCONN: + down++; + continue; + case ENOSPC: + no_space++; + continue; + case 0: + /* if err == 0 and start == stop, then it is a non misc++; + * participating subvolume(spread-cnt). Then, do not + * check for anomalies. 
If start != stop, then treat it + * as misc err */ + if (layout->list[i].start == layout->list[i].stop) { + continue; } + break; + default: + misc++; continue; - } + } is_virgin = 0; @@ -540,6 +593,9 @@ dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout, if (misc_p) *misc_p = misc; + if (no_space_p) + *no_space_p = no_space; + return ret; } @@ -555,7 +611,6 @@ dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout) uint32_t down = 0; uint32_t misc = 0; - ret = dht_layout_sort (layout); if (ret == -1) { gf_log (this->name, GF_LOG_WARNING, @@ -565,7 +620,7 @@ dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout) ret = dht_layout_anomalies (this, loc, layout, &holes, &overlaps, - &missing, &down, &misc); + &missing, &down, &misc, NULL); if (ret == -1) { gf_log (this->name, GF_LOG_WARNING, "error while finding anomalies in %s -- not good news", @@ -583,43 +638,56 @@ dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout) "found anomalies in %s. holes=%d overlaps=%d", loc->path, holes, overlaps); } - ret = 1; + ret = -1; } for (i = 0; i < layout->cnt; i++) { - /* TODO During DHT selfheal rewrite (almost) find a better place to - * detect this - probably in dht_layout_anomalies() + /* TODO During DHT selfheal rewrite (almost) find a better place + * to detect this - probably in dht_layout_anomalies() */ if (layout->list[i].err > 0) { - gf_log (this->name, GF_LOG_DEBUG, - "path=%s err=%s on subvol=%s", - loc->path, strerror (layout->list[i].err), - (layout->list[i].xlator ? - layout->list[i].xlator->name : "<>")); - if (layout->list[i].err == ENOENT) - ret = 1; + gf_log_callingfn (this->name, GF_LOG_DEBUG, + "path=%s err=%s on subvol=%s", + loc->path, + strerror (layout->list[i].err), + (layout->list[i].xlator ? 
+ layout->list[i].xlator->name + : "<>")); + if ((layout->list[i].err == ENOENT) && (ret >= 0)) { + ret++; + } } } + out: return ret; } +int +dht_dir_has_layout (dict_t *xattr, char *name) +{ + + void *disk_layout_raw = NULL; + + return dict_get_ptr (xattr, name, &disk_layout_raw); +} int dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, loc_t *loc, dict_t *xattr) { - int idx = 0; - int pos = -1; - int ret = 0; - int err = 0; - int dict_ret = 0; - int32_t disk_layout[4]; - void *disk_layout_raw = NULL; - int32_t count = -1; - uint32_t start_off = -1; - uint32_t stop_off = -1; + int idx = 0; + int pos = -1; + int ret = 0; + int err = 0; + int dict_ret = 0; + int32_t disk_layout[4]; + void *disk_layout_raw = NULL; + int32_t count = -1; + uint32_t start_off = -1; + uint32_t stop_off = -1; + dht_conf_t *conf = this->private; for (idx = 0; idx < layout->cnt; idx++) { @@ -649,7 +717,7 @@ dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, goto out; } - dict_ret = dict_get_ptr (xattr, "trusted.glusterfs.dht", + dict_ret = dict_get_ptr (xattr, conf->xattr_name, &disk_layout_raw); if (dict_ret < 0) { @@ -665,7 +733,7 @@ dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, count = ntoh32 (disk_layout[0]); if (count != 1) { - gf_log (this->name, GF_LOG_INFO, + gf_log (this->name, GF_LOG_ERROR, "%s - disk layout has invalid count %d", loc->path, count); ret = -1; @@ -714,7 +782,7 @@ dht_layout_preset (xlator_t *this, xlator_t *subvol, inode_t *inode) LOCK (&conf->layout_lock); { - inode_ctx_put (inode, this, (uint64_t)(long)layout); + dht_inode_ctx_layout_set (inode, this, layout); } UNLOCK (&conf->layout_lock); diff --git a/xlators/cluster/dht/src/dht-linkfile.c b/xlators/cluster/dht/src/dht-linkfile.c index 2186b064a..dbc9d0b3c 100644 --- a/xlators/cluster/dht/src/dht-linkfile.c +++ b/xlators/cluster/dht/src/dht-linkfile.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. 
<http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ #ifndef _CONFIG_H @@ -28,37 +19,106 @@ #include "compat.h" #include "dht-common.h" +int +dht_linkfile_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + inode_t *inode, struct iatt *stbuf, dict_t *xattr, + struct iatt *postparent) +{ + char is_linkfile = 0; + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + local = frame->local; + prev = cookie; + conf = this->private; + + if (op_ret) + goto out; + + is_linkfile = check_is_linkfile (inode, stbuf, xattr, + conf->link_xattr_name); + if (!is_linkfile) + gf_log (this->name, GF_LOG_WARNING, "got non-linkfile %s:%s", + prev->this->name, local->loc.path); +out: + local->linkfile.linkfile_cbk (frame, cookie, this, op_ret, op_errno, + inode, stbuf, postparent, postparent, + xattr); + return 0; +} +#define is_equal(a, b) (a == b) int dht_linkfile_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { dht_local_t *local = NULL; + xlator_t *subvol = NULL; + call_frame_t *prev = NULL; + dict_t *xattrs = NULL; + dht_conf_t *conf = NULL; + int ret = -1; local = frame->local; + if (!op_ret) + local->linked = _gf_true; + + FRAME_SU_UNDO (frame, dht_local_t); + + if (op_ret && (op_errno == EEXIST)) { + conf = this->private; + prev = cookie; + subvol = prev->this; + if (!subvol) + goto out; + xattrs = dict_new (); + if (!xattrs) + goto out; + ret = dict_set_uint32 (xattrs, conf->link_xattr_name, 256); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to set linkto key"); + goto out; + } + + STACK_WIND (frame, dht_linkfile_lookup_cbk, subvol, + subvol->fops->lookup, &local->loc, xattrs); + if (xattrs) + dict_unref (xattrs); + return 0; + } +out: local->linkfile.linkfile_cbk (frame, cookie, this, op_ret, op_errno, - inode, stbuf, preparent, postparent); + inode, stbuf, preparent, 
postparent, + xdata); + if (xattrs) + dict_unref (xattrs); return 0; } int dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk, + xlator_t *this, xlator_t *tovol, xlator_t *fromvol, loc_t *loc) { dht_local_t *local = NULL; dict_t *dict = NULL; int need_unref = 0; int ret = 0; + dht_conf_t *conf = this->private; local = frame->local; local->linkfile.linkfile_cbk = linkfile_cbk; local->linkfile.srcvol = tovol; + local->linked = _gf_false; + dict = local->params; if (!dict) { dict = dict_new (); @@ -74,8 +134,12 @@ dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk, "%s: gfid set failed", loc->path); } - ret = dict_set_str (dict, "trusted.glusterfs.dht.linkto", - tovol->name); + ret = dict_set_str (dict, GLUSTERFS_INTERNAL_FOP_KEY, "yes"); + if (ret) + gf_log ("dht-linkfile", GF_LOG_INFO, + "%s: internal-fop set failed", loc->path); + + ret = dict_set_str (dict, conf->link_xattr_name, tovol->name); if (ret < 0) { gf_log (frame->this->name, GF_LOG_INFO, @@ -84,9 +148,13 @@ dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk, goto out; } + local->link_subvol = fromvol; + /* Always create as root:root. 
dht_linkfile_attr_heal fixes the + * ownsership */ + FRAME_SU_DO (frame, dht_local_t); STACK_WIND (frame, dht_linkfile_create_cbk, fromvol, fromvol->fops->mknod, loc, - S_IFREG | DHT_LINKFILE_MODE, 0, dict); + S_IFREG | DHT_LINKFILE_MODE, 0, 0, dict); if (need_unref && dict) dict_unref (dict); @@ -94,7 +162,7 @@ dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk, return 0; out: local->linkfile.linkfile_cbk (frame, NULL, frame->this, -1, ENOMEM, - loc->inode, NULL, NULL, NULL); + loc->inode, NULL, NULL, NULL, NULL); if (need_unref && dict) dict_unref (dict); @@ -106,7 +174,8 @@ out: int dht_linkfile_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iatt *preparent, struct iatt *postparent) + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { dht_local_t *local = NULL; call_frame_t *prev = NULL; @@ -150,7 +219,7 @@ dht_linkfile_unlink (call_frame_t *frame, xlator_t *this, STACK_WIND (unlink_frame, dht_linkfile_unlink_cbk, subvol, subvol->fops->unlink, - &unlink_local->loc); + &unlink_local->loc, 0, NULL); return 0; err: @@ -175,7 +244,7 @@ dht_linkfile_subvol (xlator_t *this, inode_t *inode, struct iatt *stbuf, if (!xattr) goto out; - ret = dict_get_ptr (xattr, "trusted.glusterfs.dht.linkto", &volname); + ret = dict_get_ptr (xattr, conf->link_xattr_name, &volname); if ((-1 == ret) || !volname) goto out; @@ -190,3 +259,70 @@ dht_linkfile_subvol (xlator_t *this, inode_t *inode, struct iatt *stbuf, out: return subvol; } + +int +dht_linkfile_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *statpre, + struct iatt *statpost, dict_t *xdata) +{ + dht_local_t *local = NULL; + loc_t *loc = NULL; + + local = frame->local; + loc = &local->loc; + + if (op_ret) + gf_log (this->name, GF_LOG_ERROR, "setattr of uid/gid on %s" + " :<gfid:%s> failed (%s)", + (loc->path? 
loc->path: "NULL"), + uuid_utoa(local->gfid), strerror(op_errno)); + + DHT_STACK_DESTROY (frame); + + return 0; +} + +int +dht_linkfile_attr_heal (call_frame_t *frame, xlator_t *this) +{ + int ret = -1; + call_frame_t *copy = NULL; + dht_local_t *local = NULL; + dht_local_t *copy_local = NULL; + xlator_t *subvol = NULL; + struct iatt stbuf = {0,}; + + local = frame->local; + + GF_VALIDATE_OR_GOTO ("dht", local, out); + GF_VALIDATE_OR_GOTO ("dht", local->link_subvol, out); + + if (local->stbuf.ia_type == IA_INVAL) + return 0; + + uuid_copy (local->loc.gfid, local->stbuf.ia_gfid); + + copy = copy_frame (frame); + + if (!copy) + goto out; + + copy_local = dht_local_init (copy, &local->loc, NULL, 0); + + if (!copy_local) + goto out; + + stbuf = local->stbuf; + subvol = local->link_subvol; + + copy->local = copy_local; + + FRAME_SU_DO (copy, dht_local_t); + + STACK_WIND (copy, dht_linkfile_setattr_cbk, subvol, + subvol->fops->setattr, &copy_local->loc, + &stbuf, (GF_SET_ATTR_UID | GF_SET_ATTR_GID), NULL); + ret = 0; +out: + return ret; +} diff --git a/xlators/cluster/dht/src/dht-mem-types.h b/xlators/cluster/dht/src/dht-mem-types.h index 21fb5a7ca..e893eb48f 100644 --- a/xlators/cluster/dht/src/dht-mem-types.h +++ b/xlators/cluster/dht/src/dht-mem-types.h @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. 
- - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ @@ -28,7 +19,6 @@ enum gf_dht_mem_types_ { gf_dht_mt_dht_conf_t, gf_dht_mt_char, gf_dht_mt_int32_t, - gf_dht_mt_dht_local_t, gf_dht_mt_xlator_t, gf_dht_mt_dht_layout_t, gf_switch_mt_dht_conf_t, @@ -37,6 +27,9 @@ enum gf_dht_mem_types_ { gf_switch_mt_switch_struct, gf_dht_mt_subvol_time, gf_dht_mt_loc_t, + gf_defrag_info_mt, + gf_dht_mt_inode_ctx_t, + gf_dht_mt_ctx_stat_time_t, gf_dht_mt_end }; #endif diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c index 7b04e8a2d..bcb19f23e 100644 --- a/xlators/cluster/dht/src/dht-rebalance.c +++ b/xlators/cluster/dht/src/dht-rebalance.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. 
+ This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ @@ -24,11 +15,12 @@ #endif #include "dht-common.h" +#include "xlator.h" +#include <fnmatch.h> #define GF_DISK_SECTOR_SIZE 512 #define DHT_REBALANCE_PID 4242 /* Change it if required */ #define DHT_REBALANCE_BLKSIZE (128 * 1024) -#define DHT_MIGRATE_EVEN_IF_LINK_EXISTS 1 static int dht_write_with_holes (xlator_t *to, fd_t *fd, struct iovec *vec, int count, @@ -60,9 +52,14 @@ dht_write_with_holes (xlator_t *to, fd_t *fd, struct iovec *vec, int count, ret = syncop_write (to, fd, (buf + tmp_offset), (start_idx - tmp_offset), (offset + tmp_offset), - iobref); - if (ret < 0) + iobref, 0); + /* 'path' will be logged in calling function */ + if (ret < 0) { + gf_log (THIS->name, GF_LOG_WARNING, + "failed to write (%s)", + strerror (errno)); goto out; + } write_needed = 0; } @@ -73,9 +70,14 @@ dht_write_with_holes (xlator_t *to, fd_t *fd, struct iovec *vec, int count, /* This means, last chunk is not yet written.. 
write it */ ret = syncop_write (to, fd, (buf + tmp_offset), (buf_len - tmp_offset), - (offset + tmp_offset), iobref); - if (ret < 0) + (offset + tmp_offset), iobref, 0); + if (ret < 0) { + /* 'path' will be logged in calling function */ + gf_log (THIS->name, GF_LOG_WARNING, + "failed to write (%s)", + strerror (errno)); goto out; + } } size_pending = (size - buf_len); @@ -89,25 +91,147 @@ out: } +int32_t +gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, dict_t *xattrs, + struct iatt *stbuf) +{ + int32_t ret = -1; + xlator_t *cached_subvol = NULL; + xlator_t *hashed_subvol = NULL; + xlator_t *linkto_subvol = NULL; + data_t *data = NULL; + struct iatt iatt = {0,}; + int32_t op_errno = 0; + dht_conf_t *conf = NULL; + + GF_VALIDATE_OR_GOTO ("defrag", loc, out); + GF_VALIDATE_OR_GOTO ("defrag", loc->name, out); + GF_VALIDATE_OR_GOTO ("defrag", stbuf, out); + GF_VALIDATE_OR_GOTO ("defrag", this, out); + GF_VALIDATE_OR_GOTO ("defrag", xattrs, out); + GF_VALIDATE_OR_GOTO ("defrag", this->private, out); + + conf = this->private; + + if (uuid_is_null (loc->pargfid)) { + gf_log ("", GF_LOG_ERROR, "loc->pargfid is NULL for " + "%s", loc->path); + goto out; + } + + if (uuid_is_null (loc->gfid)) { + gf_log ("", GF_LOG_ERROR, "loc->gfid is NULL for " + "%s", loc->path); + goto out; + } + + cached_subvol = dht_subvol_get_cached (this, loc->inode); + if (!cached_subvol) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get cached subvol" + " for %s on %s", loc->name, this->name); + goto out; + } + + hashed_subvol = dht_subvol_get_hashed (this, loc); + if (!hashed_subvol) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get hashed subvol" + " for %s on %s", loc->name, this->name); + goto out; + } + + gf_log (this->name, GF_LOG_INFO, "Attempting to migrate hardlink %s " + "with gfid %s from %s -> %s", loc->name, uuid_utoa (loc->gfid), + cached_subvol->name, hashed_subvol->name); + data = dict_get (xattrs, conf->link_xattr_name); + /* set linkto on cached -> hashed if not 
present, else link it */ + if (!data) { + ret = dict_set_str (xattrs, conf->link_xattr_name, + hashed_subvol->name); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to set " + "linkto xattr in dict for %s", loc->name); + goto out; + } + + ret = syncop_setxattr (cached_subvol, loc, xattrs, 0); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Linkto setxattr " + "failed %s -> %s (%s)", cached_subvol->name, + loc->name, strerror (errno)); + goto out; + } + goto out; + } else { + linkto_subvol = dht_linkfile_subvol (this, NULL, NULL, xattrs); + if (!linkto_subvol) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get " + "linkto subvol for %s", loc->name); + } else { + hashed_subvol = linkto_subvol; + } + + ret = syncop_link (hashed_subvol, loc, loc); + if (ret) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, "link of %s -> %s" + " failed on subvol %s (%s)", loc->name, + uuid_utoa(loc->gfid), + hashed_subvol->name, strerror (op_errno)); + if (op_errno != EEXIST) + goto out; + } + } + ret = syncop_lookup (hashed_subvol, loc, NULL, &iatt, NULL, NULL); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed lookup %s on %s (%s)" + , loc->name, hashed_subvol->name, strerror (errno)); + goto out; + } + + if (iatt.ia_nlink == stbuf->ia_nlink) { + ret = dht_migrate_file (this, loc, cached_subvol, hashed_subvol, + GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS); + if (ret) + goto out; + } + ret = 0; +out: + return ret; +} + + static inline int -__is_file_migratable (xlator_t *this, loc_t *loc, dict_t *rsp_dict, - struct iatt *stbuf) +__is_file_migratable (xlator_t *this, loc_t *loc, + struct iatt *stbuf, dict_t *xattrs, int flags) { int ret = -1; - if (!IA_ISREG (stbuf->ia_type)) { + if (IA_ISDIR (stbuf->ia_type)) { gf_log (this->name, GF_LOG_WARNING, - "%s: migrate-file called on non-regular entry (0%o)", - loc->path, stbuf->ia_type); + "%s: migrate-file called on directory", loc->path); ret = -1; goto out; } + if (flags == GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS) { + 
ret = 0; + goto out; + } if (stbuf->ia_nlink > 1) { - /* TODO : support migrating hardlinks */ - gf_log (this->name, GF_LOG_WARNING, "%s: file has hardlinks", - loc->path); - ret = -ENOTSUP; + /* support for decomission */ + if (flags == GF_DHT_MIGRATE_HARDLINK) { + ret = gf_defrag_handle_hardlink (this, loc, + xattrs, stbuf); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to migrate file with link", + loc->path); + } + } else { + gf_log (this->name, GF_LOG_WARNING, + "%s: file has hardlinks", loc->path); + } + ret = ENOTSUP; goto out; } @@ -118,15 +242,17 @@ out: } static inline int -__dht_rebalance_create_dst_file (xlator_t *to, loc_t *loc, struct iatt *stbuf, - dict_t *dict, fd_t **dst_fd) +__dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struct iatt *stbuf, + dict_t *dict, fd_t **dst_fd, dict_t *xattr) { - xlator_t *this = NULL; - int ret = -1; - fd_t *fd = NULL; - struct iatt new_stbuf = {0,}; + xlator_t *this = NULL; + int ret = -1; + fd_t *fd = NULL; + struct iatt new_stbuf = {0,}; + dht_conf_t *conf = NULL; this = THIS; + conf = this->private; ret = dict_set_static_bin (dict, "gfid-req", stbuf->ia_gfid, 16); if (ret) { @@ -135,7 +261,7 @@ __dht_rebalance_create_dst_file (xlator_t *to, loc_t *loc, struct iatt *stbuf, goto out; } - ret = dict_set_str (dict, DHT_LINKFILE_KEY, to->name); + ret = dict_set_str (dict, conf->link_xattr_name, from->name); if (ret) { gf_log (this->name, GF_LOG_ERROR, "%s: failed to set gfid in dict for create", loc->path); @@ -145,7 +271,8 @@ __dht_rebalance_create_dst_file (xlator_t *to, loc_t *loc, struct iatt *stbuf, fd = fd_create (loc->inode, DHT_REBALANCE_PID); if (!fd) { gf_log (this->name, GF_LOG_ERROR, - "%s: fd create failed (destination)", loc->path); + "%s: fd create failed (destination) (%s)", + loc->path, strerror (errno)); ret = -1; goto out; } @@ -161,17 +288,45 @@ __dht_rebalance_create_dst_file (xlator_t *to, loc_t *loc, struct iatt *stbuf, goto out; } } + if ((ret == -1) 
&& (errno != ENOENT)) { + /* File exists in destination, but not accessible */ + gf_log (THIS->name, GF_LOG_WARNING, + "%s: failed to lookup file (%s)", + loc->path, strerror (errno)); + goto out; + } /* Create the destination with LINKFILE mode, and linkto xattr, if the linkfile already exists, it will just open the file */ ret = syncop_create (to, loc, O_RDWR, DHT_LINKFILE_MODE, fd, - dict); + dict, &new_stbuf); if (ret < 0) { gf_log (this->name, GF_LOG_ERROR, - "failed to create %s on %s", loc->path, to->name); + "failed to create %s on %s (%s)", + loc->path, to->name, strerror (errno)); goto out; } + ret = syncop_fsetxattr (to, fd, xattr, 0); + if (ret == -1) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to set xattr on %s (%s)", + loc->path, to->name, strerror (errno)); + + ret = syncop_ftruncate (to, fd, stbuf->ia_size); + if (ret < 0) + gf_log (this->name, GF_LOG_ERROR, + "ftruncate failed for %s on %s (%s)", + loc->path, to->name, strerror (errno)); + + ret = syncop_fsetattr (to, fd, stbuf, + (GF_SET_ATTR_UID | GF_SET_ATTR_GID), + NULL, NULL); + if (ret < 0) + gf_log (this->name, GF_LOG_ERROR, + "chown failed for %s on %s (%s)", + loc->path, to->name, strerror (errno)); + if (dst_fd) *dst_fd = fd; @@ -184,41 +339,75 @@ out: static inline int __dht_check_free_space (xlator_t *to, xlator_t *from, loc_t *loc, - struct iatt *stbuf) + struct iatt *stbuf, int flag) { struct statvfs src_statfs = {0,}; struct statvfs dst_statfs = {0,}; int ret = -1; xlator_t *this = NULL; + uint64_t src_statfs_blocks = 1; + uint64_t dst_statfs_blocks = 1; + this = THIS; ret = syncop_statfs (from, loc, &src_statfs); if (ret) { gf_log (this->name, GF_LOG_ERROR, - "failed to get statfs of %s on %s", - loc->path, from->name); + "failed to get statfs of %s on %s (%s)", + loc->path, from->name, strerror (errno)); goto out; } ret = syncop_statfs (to, loc, &dst_statfs); if (ret) { gf_log (this->name, GF_LOG_ERROR, - "failed to get statfs of %s on %s", - loc->path, to->name); + 
"failed to get statfs of %s on %s (%s)", + loc->path, to->name, strerror (errno)); goto out; } - if (((dst_statfs.f_bavail * - dst_statfs.f_bsize) / GF_DISK_SECTOR_SIZE) > - (((src_statfs.f_bavail * src_statfs.f_bsize) / - GF_DISK_SECTOR_SIZE) - stbuf->ia_blocks)) { - gf_log (this->name, GF_LOG_WARNING, - "data movement attempted from node (%s) with" - " higher disk space to a node (%s) with " - "lesser disk space (%s)", from->name, - to->name, loc->path); - ret = -1; + /* if force option is given, do not check for space @ dst. + * Check only if space is avail for the file */ + if (flag != GF_DHT_MIGRATE_DATA) + goto check_avail_space; + + /* Check: + During rebalance `migrate-data` - Destination subvol experiences + a `reduction` in 'blocks' of free space, at the same time source + subvol gains certain 'blocks' of free space. A valid check is + necessary here to avoid errorneous move to destination where + the space could be scantily available. + */ + if (stbuf) { + dst_statfs_blocks = ((dst_statfs.f_bavail * + dst_statfs.f_bsize) / + GF_DISK_SECTOR_SIZE); + src_statfs_blocks = ((src_statfs.f_bavail * + src_statfs.f_bsize) / + GF_DISK_SECTOR_SIZE); + if ((dst_statfs_blocks - stbuf->ia_blocks) < + (src_statfs_blocks + stbuf->ia_blocks)) { + gf_log (this->name, GF_LOG_WARNING, + "data movement attempted from node (%s) with" + " higher disk space to a node (%s) with " + "lesser disk space (%s)", from->name, + to->name, loc->path); + + /* this is not a 'failure', but we don't want to + consider this as 'success' too :-/ */ + ret = 1; + goto out; + } + } +check_avail_space: + if (((dst_statfs.f_bavail * dst_statfs.f_bsize) / + GF_DISK_SECTOR_SIZE) < stbuf->ia_blocks) { + gf_log (this->name, GF_LOG_ERROR, + "data movement attempted from node (%s) with " + "to node (%s) which does not have required free space" + " for %s", from->name, to->name, loc->path); + ret = 1; goto out; } @@ -228,7 +417,7 @@ out: } static inline int -__dht_rebalane_migrate_data (xlator_t *from, 
xlator_t *to, fd_t *src, fd_t *dst, +__dht_rebalance_migrate_data (xlator_t *from, xlator_t *to, fd_t *src, fd_t *dst, uint64_t ia_size, int hole_exists) { int ret = 0; @@ -244,7 +433,7 @@ __dht_rebalane_migrate_data (xlator_t *from, xlator_t *to, fd_t *src, fd_t *dst, read_size = (((ia_size - total) > DHT_REBALANCE_BLKSIZE) ? DHT_REBALANCE_BLKSIZE : (ia_size - total)); ret = syncop_readv (from, src, read_size, - offset, &vector, &count, &iobref); + offset, 0, &vector, &count, &iobref); if (!ret || (ret < 0)) { break; } @@ -254,15 +443,14 @@ __dht_rebalane_migrate_data (xlator_t *from, xlator_t *to, fd_t *src, fd_t *dst, ret, offset, iobref); else ret = syncop_writev (to, dst, vector, count, - offset, iobref); + offset, iobref, 0); if (ret < 0) { break; } offset += ret; total += ret; - if (vector) - GF_FREE (vector); + GF_FREE (vector); if (iobref) iobref_unref (iobref); iobref = NULL; @@ -270,8 +458,7 @@ __dht_rebalane_migrate_data (xlator_t *from, xlator_t *to, fd_t *src, fd_t *dst, } if (iobref) iobref_unref (iobref); - if (vector) - GF_FREE (vector); + GF_FREE (vector); if (ret >= 0) ret = 0; @@ -289,8 +476,10 @@ __dht_rebalance_open_src_file (xlator_t *from, xlator_t *to, loc_t *loc, dict_t *dict = NULL; xlator_t *this = NULL; struct iatt iatt = {0,}; + dht_conf_t *conf = NULL; this = THIS; + conf = this->private; fd = fd_create (loc->inode, DHT_REBALANCE_PID); if (!fd) { @@ -303,8 +492,8 @@ __dht_rebalance_open_src_file (xlator_t *from, xlator_t *to, loc_t *loc, ret = syncop_open (from, loc, O_RDWR, fd); if (ret == -1) { gf_log (this->name, GF_LOG_ERROR, - "failed to open file %s on %s", - loc->path, from->name); + "failed to open file %s on %s (%s)", + loc->path, from->name, strerror (errno)); goto out; } @@ -313,7 +502,7 @@ __dht_rebalance_open_src_file (xlator_t *from, xlator_t *to, loc_t *loc, if (!dict) goto out; - ret = dict_set_str (dict, DHT_LINKFILE_KEY, to->name); + ret = dict_set_str (dict, conf->link_xattr_name, to->name); if (ret) { gf_log 
(this->name, GF_LOG_ERROR, "failed to set xattr in dict for %s (linkto:%s)", @@ -326,8 +515,8 @@ __dht_rebalance_open_src_file (xlator_t *from, xlator_t *to, loc_t *loc, ret = syncop_setxattr (from, loc, dict, 0); if (ret) { gf_log (this->name, GF_LOG_ERROR, - "failed to set xattr on %s in %s", - loc->path, from->name); + "failed to set xattr on %s in %s (%s)", + loc->path, from->name, strerror (errno)); goto out; } @@ -340,8 +529,8 @@ __dht_rebalance_open_src_file (xlator_t *from, xlator_t *to, loc_t *loc, ret = syncop_setattr (from, loc, &iatt, GF_SET_ATTR_MODE, NULL, NULL); if (ret) { gf_log (this->name, GF_LOG_ERROR, - "failed to set mode on %s in %s", - loc->path, from->name); + "failed to set mode on %s in %s (%s)", + loc->path, from->name, strerror (errno)); goto out; } @@ -358,6 +547,132 @@ out: } int +migrate_special_files (xlator_t *this, xlator_t *from, xlator_t *to, loc_t *loc, + struct iatt *buf) +{ + int ret = -1; + dict_t *rsp_dict = NULL; + dict_t *dict = NULL; + char *link = NULL; + struct iatt stbuf = {0,}; + dht_conf_t *conf = this->private; + + dict = dict_new (); + if (!dict) + goto out; + + ret = dict_set_int32 (dict, conf->link_xattr_name, 256); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "%s: failed to set 'linkto' key in dict", loc->path); + goto out; + } + + /* check in the destination if the file is link file */ + ret = syncop_lookup (to, loc, dict, &stbuf, &rsp_dict, NULL); + if ((ret == -1) && (errno != ENOENT)) { + gf_log (this->name, GF_LOG_WARNING, "%s: lookup failed (%s)", + loc->path, strerror (errno)); + goto out; + } + + /* we no more require this key */ + dict_del (dict, conf->link_xattr_name); + + /* file exists in target node, only if it is 'linkfile' its valid, + otherwise, error out */ + if (!ret) { + if (!check_is_linkfile (loc->inode, &stbuf, rsp_dict, + conf->link_xattr_name)) { + gf_log (this->name, GF_LOG_WARNING, + "%s: file exists in destination", loc->path); + ret = -1; + goto out; + } + + /* as file is 
linkfile, delete it */ + ret = syncop_unlink (to, loc); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to delete the linkfile (%s)", + loc->path, strerror (errno)); + goto out; + } + } + + /* Set the gfid of the source file in dict */ + ret = dict_set_static_bin (dict, "gfid-req", buf->ia_gfid, 16); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "%s: failed to set gfid in dict for create", loc->path); + goto out; + } + + /* Create the file in target */ + if (IA_ISLNK (buf->ia_type)) { + /* Handle symlinks separately */ + ret = syncop_readlink (from, loc, &link, buf->ia_size); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "%s: readlink on symlink failed (%s)", + loc->path, strerror (errno)); + goto out; + } + + ret = syncop_symlink (to, loc, link, dict, 0); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "%s: creating symlink failed (%s)", + loc->path, strerror (errno)); + goto out; + } + + goto done; + } + + ret = syncop_mknod (to, loc, st_mode_from_ia (buf->ia_prot, + buf->ia_type), + makedev (ia_major (buf->ia_rdev), + ia_minor (buf->ia_rdev)), dict, 0); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, "%s: mknod failed (%s)", + loc->path, strerror (errno)); + goto out; + } + +done: + ret = syncop_setattr (to, loc, buf, + (GF_SET_ATTR_UID | GF_SET_ATTR_GID | + GF_SET_ATTR_MODE), NULL, NULL); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to perform setattr on %s (%s)", + loc->path, to->name, strerror (errno)); + } + + ret = syncop_unlink (from, loc); + if (ret) + gf_log (this->name, GF_LOG_WARNING, "%s: unlink failed (%s)", + loc->path, strerror (errno)); + +out: + if (dict) + dict_unref (dict); + + if (rsp_dict) + dict_unref (rsp_dict); + + return ret; +} + +/* + return values: + + -1 : failure + 0 : successfully migrated data + 1 : not a failure, but we can't migrate data as of now +*/ +int dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, int flag) { @@ -365,12 +680,14 @@ 
dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, struct iatt new_stbuf = {0,}; struct iatt stbuf = {0,}; struct iatt empty_iatt = {0,}; + ia_prot_t src_ia_prot = {0,}; fd_t *src_fd = NULL; fd_t *dst_fd = NULL; dict_t *dict = NULL; dict_t *xattr = NULL; - dict_t *rsp_dict = NULL; + dict_t *xattr_rsp = NULL; int file_has_holes = 0; + dht_conf_t *conf = this->private; gf_log (this->name, GF_LOG_INFO, "%s: attempting to move from %s to %s", loc->path, from->name, to->name); @@ -379,35 +696,55 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, if (!dict) goto out; - ret = dict_set_int32 (dict, GLUSTERFS_OPEN_FD_COUNT, 4); - if (ret) - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to set fd-count key in dict, may attempt " - "migration of file which has open fds", loc->path); + ret = dict_set_int32 (dict, conf->link_xattr_name, 256); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "%s: failed to set 'linkto' key in dict", loc->path); + goto out; + } /* Phase 1 - Data migration is in progress from now on */ - ret = syncop_lookup (from, loc, dict, &stbuf, &rsp_dict, NULL); + ret = syncop_lookup (from, loc, dict, &stbuf, &xattr_rsp, NULL); if (ret) { - gf_log (this->name, GF_LOG_ERROR, "failed to lookup %s on %s", - loc->path, from->name); + gf_log (this->name, GF_LOG_ERROR, "%s: lookup failed on %s (%s)", + loc->path, from->name, strerror (errno)); goto out; } + /* we no more require this key */ + dict_del (dict, conf->link_xattr_name); + + /* preserve source mode, so set the same to the destination */ + src_ia_prot = stbuf.ia_prot; + /* Check if file can be migrated */ - ret = __is_file_migratable (this, loc, rsp_dict, &stbuf); + ret = __is_file_migratable (this, loc, &stbuf, xattr_rsp, flag); if (ret) goto out; + /* Take care of the special files */ + if (!IA_ISREG (stbuf.ia_type)) { + /* Special files */ + ret = migrate_special_files (this, from, to, loc, &stbuf); + goto out; + } + + /* TODO: move all xattr 
related operations to fd based operations */ + ret = syncop_listxattr (from, loc, &xattr); + if (ret == -1) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to get xattr from %s (%s)", + loc->path, from->name, strerror (errno)); + /* create the destination, with required modes/xattr */ - ret = __dht_rebalance_create_dst_file (to, loc, &stbuf, dict, &dst_fd); + ret = __dht_rebalance_create_dst_file (to, from, loc, &stbuf, + dict, &dst_fd, xattr); if (ret) goto out; - /* Should happen on all files when 'force' option is not given */ - if (flag != DHT_MIGRATE_EVEN_IF_LINK_EXISTS) { - ret = __dht_check_free_space (to, from, loc, &stbuf); - if (ret) - goto out; + ret = __dht_check_free_space (to, from, loc, &stbuf, flag); + if (ret) { + goto out; } /* Open the source, and also update mode/xattr */ @@ -418,10 +755,11 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, goto out; } + ret = syncop_fstat (from, src_fd, &stbuf); if (ret) { - gf_log (this->name, GF_LOG_ERROR, "failed to lookup %s on %s", - loc->path, from->name); + gf_log (this->name, GF_LOG_ERROR, "failed to lookup %s on %s (%s)", + loc->path, from->name, strerror (errno)); goto out; } @@ -430,31 +768,30 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, file_has_holes = 1; /* All I/O happens in this function */ - ret = __dht_rebalane_migrate_data (from, to, src_fd, dst_fd, - stbuf.ia_size, file_has_holes); + ret = __dht_rebalance_migrate_data (from, to, src_fd, dst_fd, + stbuf.ia_size, file_has_holes); if (ret) { gf_log (this->name, GF_LOG_ERROR, "%s: failed to migrate data", loc->path); + /* reset the destination back to 0 */ + ret = syncop_ftruncate (to, dst_fd, 0); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "%s: failed to reset target size back to 0 (%s)", + loc->path, strerror (errno)); + } + + ret = -1; goto out; } - /* TODO: move all xattr related operations to fd based operations */ - ret = syncop_listxattr (from, loc, &xattr); - if 
(ret == -1) - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to get xattr from %s", loc->path, from->name); - - ret = syncop_setxattr (to, loc, xattr, 0); - if (ret == -1) - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to set xattr on %s", loc->path, to->name); - /* TODO: Sync the locks */ - ret = syncop_fsync (to, dst_fd); + ret = syncop_fsync (to, dst_fd, 0); if (ret) gf_log (this->name, GF_LOG_WARNING, - "%s: failed to fsync on %s", loc->path, to->name); + "%s: failed to fsync on %s (%s)", + loc->path, to->name, strerror (errno)); /* Phase 2 - Data-Migration Complete, Housekeeping updates pending */ @@ -463,15 +800,19 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, if (ret < 0) { /* Failed to get the stat info */ gf_log (this->name, GF_LOG_ERROR, - "failed to fstat file %s on %s", - loc->path, from->name); + "failed to fstat file %s on %s (%s)", + loc->path, from->name, strerror (errno)); goto out; } /* source would have both sticky bit and sgid bit set, reset it to 0, - and set the source permission on destination */ - new_stbuf.ia_prot.sticky = 0; - new_stbuf.ia_prot.sgid = 0; + and set the source permission on destination, if it was not set + prior to setting rebalance-modes in source */ + if (!src_ia_prot.sticky) + new_stbuf.ia_prot.sticky = 0; + + if (!src_ia_prot.sgid) + new_stbuf.ia_prot.sgid = 0; /* TODO: if the source actually had sticky bit, or sgid bit set, we are not handling it */ @@ -481,8 +822,9 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, GF_SET_ATTR_MODE), NULL, NULL); if (ret) { gf_log (this->name, GF_LOG_WARNING, - "%s: failed to perform setattr on %s", - loc->path, to->name); + "%s: failed to perform setattr on %s (%s)", + loc->path, to->name, strerror (errno)); + goto out; } /* Because 'futimes' is not portable */ @@ -491,8 +833,8 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, NULL, NULL); if (ret) { gf_log (this->name, GF_LOG_WARNING, - 
"%s: failed to perform setattr on %s", - loc->path, to->name); + "%s: failed to perform setattr on %s (%s)", + loc->path, to->name, strerror (errno)); } /* Make the source as a linkfile first before deleting it */ @@ -501,50 +843,53 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, GF_SET_ATTR_MODE, NULL, NULL); if (ret) { gf_log (this->name, GF_LOG_WARNING, \ - "%s: failed to perform setattr on %s", - loc->path, from->name); + "%s: failed to perform setattr on %s (%s)", + loc->path, from->name, strerror (errno)); + goto out; + } + + /* Free up the data blocks on the source node, as the whole + file is migrated */ + ret = syncop_ftruncate (from, src_fd, 0); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to perform truncate on %s (%s)", + loc->path, from->name, strerror (errno)); + } + + /* remove the 'linkto' xattr from the destination */ + ret = syncop_fremovexattr (to, dst_fd, conf->link_xattr_name); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to perform removexattr on %s (%s)", + loc->path, to->name, strerror (errno)); } /* Do a stat and check the gfid before unlink */ ret = syncop_stat (from, loc, &empty_iatt); if (ret) { gf_log (this->name, GF_LOG_WARNING, - "%s: failed to do a stat on %s", - loc->path, from->name); + "%s: failed to do a stat on %s (%s)", + loc->path, from->name, strerror (errno)); + goto out; } - if (uuid_compare (empty_iatt.ia_gfid, loc->inode->gfid) == 0) { + if (uuid_compare (empty_iatt.ia_gfid, loc->gfid) == 0) { /* take out the source from namespace */ ret = syncop_unlink (from, loc); if (ret) { gf_log (this->name, GF_LOG_WARNING, - "%s: failed to perform unlink on %s", - loc->path, from->name); + "%s: failed to perform unlink on %s (%s)", + loc->path, from->name, strerror (errno)); + goto out; } } - /* Free up the data blocks on the source node, as the whole - file is migrated */ - ret = syncop_ftruncate (from, src_fd, 0); - if (ret) { - gf_log (this->name, 
GF_LOG_WARNING, - "%s: failed to perform truncate on %s", - loc->path, from->name); - } - - /* remove the 'linkto' xattr from the destination */ - ret = syncop_removexattr (to, loc, DHT_LINKFILE_KEY); - if (ret) { - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to perform removexattr on %s", - loc->path, to->name); - } - ret = syncop_lookup (this, loc, NULL, NULL, NULL, NULL); if (ret) { - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to lookup the file on subvolumes", - loc->path); + gf_log (this->name, GF_LOG_DEBUG, + "%s: failed to lookup the file on subvolumes (%s)", + loc->path, strerror (errno)); } gf_log (this->name, GF_LOG_INFO, @@ -556,6 +901,11 @@ out: if (dict) dict_unref (dict); + if (xattr) + dict_unref (xattr); + if (xattr_rsp) + dict_unref (xattr_rsp); + if (dst_fd) syncop_close (dst_fd); if (src_fd) @@ -591,6 +941,7 @@ rebalance_task_completion (int op_ret, call_frame_t *sync_frame, void *data) dht_layout_t *layout = 0; xlator_t *this = NULL; dht_local_t *local = NULL; + int32_t op_errno = EINVAL; this = THIS; local = sync_frame->local; @@ -611,9 +962,24 @@ rebalance_task_completion (int op_ret, call_frame_t *sync_frame, void *data) "%s: failed to set inode ctx", local->loc.path); } - /* if success, errno is not checked, - if ret is -1, then let errno be 'ENOTSUP' */ - DHT_STACK_UNWIND (setxattr, sync_frame, op_ret, ENOTSUP); + if (op_ret == -1) { + /* Failure of migration process, mostly due to write process. + as we can't preserve the exact errno, lets say there was + no space to migrate-data + */ + op_errno = ENOSPC; + } + + if (op_ret == 1) { + /* migration didn't happen, but is not a failure, let the user + understand that he doesn't have permission to migrate the + file. 
+ */ + op_ret = -1; + op_errno = EPERM; + } + + DHT_STACK_UNWIND (setxattr, sync_frame, op_ret, op_errno, NULL); return 0; } @@ -621,12 +987,829 @@ int dht_start_rebalance_task (xlator_t *this, call_frame_t *frame) { int ret = -1; - dht_conf_t *conf = NULL; - - conf = this->private; - ret = synctask_new (conf->env, rebalance_task, + ret = synctask_new (this->ctx->env, rebalance_task, rebalance_task_completion, frame, frame); return ret; } + +int +gf_listener_stop (xlator_t *this) +{ + glusterfs_ctx_t *ctx = NULL; + cmd_args_t *cmd_args = NULL; + int ret = 0; + + ctx = this->ctx; + GF_ASSERT (ctx); + cmd_args = &ctx->cmd_args; + if (cmd_args->sock_file) { + ret = unlink (cmd_args->sock_file); + if (ret && (ENOENT == errno)) { + ret = 0; + } + } + + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to unlink listener " + "socket %s, error: %s", cmd_args->sock_file, + strerror (errno)); + } + return ret; +} + +void +dht_build_root_inode (xlator_t *this, inode_t **inode) +{ + inode_table_t *itable = NULL; + uuid_t root_gfid = {0, }; + + itable = inode_table_new (0, this); + if (!itable) + return; + + root_gfid[15] = 1; + *inode = inode_find (itable, root_gfid); +} + +void +dht_build_root_loc (inode_t *inode, loc_t *loc) +{ + loc->path = "/"; + loc->inode = inode; + loc->inode->ia_type = IA_IFDIR; + memset (loc->gfid, 0, 16); + loc->gfid[15] = 1; +} + + +/* return values: 1 -> error, bug ignore and continue + 0 -> proceed + -1 -> error, handle it */ +int32_t +gf_defrag_handle_migrate_error (int32_t op_errno, gf_defrag_info_t *defrag) +{ + /* if errno is not ENOSPC or ENOTCONN, we can still continue + with rebalance process */ + if ((errno != ENOSPC) || (errno != ENOTCONN)) + return 1; + + if (errno == ENOTCONN) { + /* Most probably mount point went missing (mostly due + to a brick down), say rebalance failure to user, + let him restart it if everything is fine */ + defrag->defrag_status = GF_DEFRAG_STATUS_FAILED; + return -1; + } + + if (errno == ENOSPC) { + /* 
rebalance process itself failed, may be + remote brick went down, or write failed due to + disk full etc etc.. */ + defrag->defrag_status = GF_DEFRAG_STATUS_FAILED; + return -1; + } + + return 0; +} + +static gf_boolean_t +gf_defrag_pattern_match (gf_defrag_info_t *defrag, char *name, uint64_t size) +{ + gf_defrag_pattern_list_t *trav = NULL; + gf_boolean_t match = _gf_false; + gf_boolean_t ret = _gf_false; + + GF_VALIDATE_OR_GOTO ("dht", defrag, out); + + trav = defrag->defrag_pattern; + while (trav) { + if (!fnmatch (trav->path_pattern, name, FNM_NOESCAPE)) { + match = _gf_true; + break; + } + trav = trav->next; + } + + if ((match == _gf_true) && (size >= trav->size)) + ret = _gf_true; + + out: + return ret; +} + +/* We do a depth first traversal of directories. But before we move into + * subdirs, we complete the data migration of those directories whose layouts + * have been fixed + */ + +int +gf_defrag_migrate_data (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, + dict_t *migrate_data) +{ + int ret = -1; + loc_t entry_loc = {0,}; + fd_t *fd = NULL; + gf_dirent_t entries; + gf_dirent_t *tmp = NULL; + gf_dirent_t *entry = NULL; + gf_boolean_t free_entries = _gf_false; + off_t offset = 0; + dict_t *dict = NULL; + struct iatt iatt = {0,}; + int32_t op_errno = 0; + char *uuid_str = NULL; + uuid_t node_uuid = {0,}; + int readdir_operrno = 0; + struct timeval dir_start = {0,}; + struct timeval end = {0,}; + double elapsed = {0,}; + struct timeval start = {0,}; + int32_t err = 0; + + gf_log (this->name, GF_LOG_INFO, "migrate data called on %s", + loc->path); + gettimeofday (&dir_start, NULL); + + fd = fd_create (loc->inode, defrag->pid); + if (!fd) { + gf_log (this->name, GF_LOG_ERROR, "Failed to create fd"); + goto out; + } + + ret = syncop_opendir (this, loc, fd); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to open dir %s", + loc->path); + goto out; + } + + INIT_LIST_HEAD (&entries.list); + + while ((ret = syncop_readdirp (this, fd, 131072, 
offset, NULL, + &entries)) != 0) { + + if (ret < 0) { + + gf_log (this->name, GF_LOG_ERROR, "Readdir returned %s." + " Aborting migrate-data", + strerror(readdir_operrno)); + goto out; + } + + /* Need to keep track of ENOENT errno, that means, there is no + need to send more readdirp() */ + readdir_operrno = errno; + + if (list_empty (&entries.list)) + break; + + free_entries = _gf_true; + + list_for_each_entry_safe (entry, tmp, &entries.list, list) { + if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) { + ret = 1; + goto out; + } + + offset = entry->d_off; + + if (!strcmp (entry->d_name, ".") || + !strcmp (entry->d_name, "..")) + continue; + + if (IA_ISDIR (entry->d_stat.ia_type)) + continue; + + defrag->num_files_lookedup++; + if (defrag->stats == _gf_true) { + gettimeofday (&start, NULL); + } + if (defrag->defrag_pattern && + (gf_defrag_pattern_match (defrag, entry->d_name, + entry->d_stat.ia_size) + == _gf_false)) { + continue; + } + loc_wipe (&entry_loc); + ret =dht_build_child_loc (this, &entry_loc, loc, + entry->d_name); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Child loc" + " build failed"); + goto out; + } + + if (uuid_is_null (entry->d_stat.ia_gfid)) { + gf_log (this->name, GF_LOG_ERROR, "%s/%s" + " gfid not present", loc->path, + entry->d_name); + continue; + } + + uuid_copy (entry_loc.gfid, entry->d_stat.ia_gfid); + + if (uuid_is_null (loc->gfid)) { + gf_log (this->name, GF_LOG_ERROR, "%s/%s" + " gfid not present", loc->path, + entry->d_name); + continue; + } + + uuid_copy (entry_loc.pargfid, loc->gfid); + + entry_loc.inode->ia_type = entry->d_stat.ia_type; + + ret = syncop_lookup (this, &entry_loc, NULL, &iatt, + NULL, NULL); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "%s" + " lookup failed", entry_loc.path); + continue; + } + + ret = syncop_getxattr (this, &entry_loc, &dict, + GF_XATTR_NODE_UUID_KEY); + if(ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "Failed to " + "get node-uuid for %s", entry_loc.path); + continue; + } + + ret = 
dict_get_str (dict, GF_XATTR_NODE_UUID_KEY, + &uuid_str); + if(ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "Failed to " + "get node-uuid from dict for %s", + entry_loc.path); + continue; + } + + if (uuid_parse (uuid_str, node_uuid)) { + gf_log (this->name, GF_LOG_ERROR, "uuid_parse " + "failed for %s", entry_loc.path); + continue; + } + + /* if file belongs to different node, skip migration + * the other node will take responsibility of migration + */ + if (uuid_compare (node_uuid, defrag->node_uuid)) { + gf_log (this->name, GF_LOG_TRACE, "%s does not" + "belong to this node", entry_loc.path); + continue; + } + + uuid_str = NULL; + + dict_del (dict, GF_XATTR_NODE_UUID_KEY); + + + /* if distribute is present, it will honor this key. + * -1 is returned if distribute is not present or file + * doesn't have a link-file. If file has link-file, the + * path of link-file will be the value, and also that + * guarantees that file has to be mostly migrated */ + + ret = syncop_getxattr (this, &entry_loc, &dict, + GF_XATTR_LINKINFO_KEY); + if (ret < 0) { + gf_log (this->name, GF_LOG_TRACE, "failed to " + "get link-to key for %s", + entry_loc.path); + continue; + } + + ret = syncop_setxattr (this, &entry_loc, migrate_data, + 0); + if (ret) { + err = op_errno; + /* errno is overloaded. 
See + * rebalance_task_completion () */ + if (err != ENOSPC) { + gf_log (this->name, GF_LOG_DEBUG, + "migrate-data skipped for %s" + " due to space constraints", + entry_loc.path); + defrag->skipped +=1; + } else{ + gf_log (this->name, GF_LOG_ERROR, + "migrate-data failed for %s", + entry_loc.path); + defrag->total_failures +=1; + } + } + + if (ret == -1) { + op_errno = errno; + ret = gf_defrag_handle_migrate_error (op_errno, + defrag); + + if (!ret) + gf_log (this->name, GF_LOG_DEBUG, + "migrate-data on %s failed: %s", + entry_loc.path, + strerror (op_errno)); + else if (ret == 1) + continue; + else if (ret == -1) + goto out; + } + + LOCK (&defrag->lock); + { + defrag->total_files += 1; + defrag->total_data += iatt.ia_size; + } + UNLOCK (&defrag->lock); + if (defrag->stats == _gf_true) { + gettimeofday (&end, NULL); + elapsed = (end.tv_sec - start.tv_sec) * 1e6 + + (end.tv_usec - start.tv_usec); + gf_log (this->name, GF_LOG_INFO, "Migration of " + "file:%s size:%"PRIu64" bytes took %.2f" + "secs", entry_loc.path, iatt.ia_size, + elapsed/1e6); + } + } + + gf_dirent_free (&entries); + free_entries = _gf_false; + INIT_LIST_HEAD (&entries.list); + + if (readdir_operrno == ENOENT) + break; + } + + gettimeofday (&end, NULL); + elapsed = (end.tv_sec - dir_start.tv_sec) * 1e6 + + (end.tv_usec - dir_start.tv_usec); + gf_log (this->name, GF_LOG_INFO, "Migration operation on dir %s took " + "%.2f secs", loc->path, elapsed/1e6); + ret = 0; +out: + if (free_entries) + gf_dirent_free (&entries); + + loc_wipe (&entry_loc); + + if (dict) + dict_unref(dict); + + if (fd) + fd_unref (fd); + return ret; + +} + + +int +gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, + dict_t *fix_layout, dict_t *migrate_data) +{ + int ret = -1; + loc_t entry_loc = {0,}; + fd_t *fd = NULL; + gf_dirent_t entries; + gf_dirent_t *tmp = NULL; + gf_dirent_t *entry = NULL; + gf_boolean_t free_entries = _gf_false; + dict_t *dict = NULL; + off_t offset = 0; + struct iatt iatt = 
{0,}; + int readdirp_errno = 0; + + ret = syncop_lookup (this, loc, NULL, &iatt, NULL, NULL); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Lookup failed on %s", + loc->path); + goto out; + } + + if (defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX) { + ret = gf_defrag_migrate_data (this, defrag, loc, migrate_data); + if (ret) + goto out; + } + + gf_log (this->name, GF_LOG_TRACE, "fix layout called on %s", loc->path); + + fd = fd_create (loc->inode, defrag->pid); + if (!fd) { + gf_log (this->name, GF_LOG_ERROR, "Failed to create fd"); + ret = -1; + goto out; + } + + ret = syncop_opendir (this, loc, fd); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to open dir %s", + loc->path); + ret = -1; + goto out; + } + + INIT_LIST_HEAD (&entries.list); + while ((ret = syncop_readdirp (this, fd, 131072, offset, NULL, + &entries)) != 0) + { + + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "Readdir returned %s" + ". Aborting fix-layout",strerror(errno)); + goto out; + } + + /* Need to keep track of ENOENT errno, that means, there is no + need to send more readdirp() */ + readdirp_errno = errno; + + if (list_empty (&entries.list)) + break; + + free_entries = _gf_true; + + list_for_each_entry_safe (entry, tmp, &entries.list, list) { + if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) { + ret = 1; + goto out; + } + + offset = entry->d_off; + + if (!strcmp (entry->d_name, ".") || + !strcmp (entry->d_name, "..")) + continue; + + if (!IA_ISDIR (entry->d_stat.ia_type)) + continue; + + loc_wipe (&entry_loc); + ret =dht_build_child_loc (this, &entry_loc, loc, + entry->d_name); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Child loc" + " build failed"); + goto out; + } + + if (uuid_is_null (entry->d_stat.ia_gfid)) { + gf_log (this->name, GF_LOG_ERROR, "%s/%s" + " gfid not present", loc->path, + entry->d_name); + continue; + } + + entry_loc.inode->ia_type = entry->d_stat.ia_type; + + uuid_copy (entry_loc.gfid, entry->d_stat.ia_gfid); + if (uuid_is_null 
(loc->gfid)) { + gf_log (this->name, GF_LOG_ERROR, "%s/%s" + " gfid not present", loc->path, + entry->d_name); + continue; + } + + uuid_copy (entry_loc.pargfid, loc->gfid); + + ret = syncop_lookup (this, &entry_loc, NULL, &iatt, + NULL, NULL); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "%s" + " lookup failed", entry_loc.path); + continue; + } + + ret = syncop_setxattr (this, &entry_loc, fix_layout, + 0); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Setxattr " + "failed for %s", entry_loc.path); + defrag->defrag_status = + GF_DEFRAG_STATUS_FAILED; + defrag->total_failures ++; + goto out; + } + ret = gf_defrag_fix_layout (this, defrag, &entry_loc, + fix_layout, migrate_data); + + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Fix layout " + "failed for %s", entry_loc.path); + defrag->total_failures++; + goto out; + } + + } + gf_dirent_free (&entries); + free_entries = _gf_false; + INIT_LIST_HEAD (&entries.list); + if (readdirp_errno == ENOENT) + break; + } + + ret = 0; +out: + if (free_entries) + gf_dirent_free (&entries); + + loc_wipe (&entry_loc); + + if (dict) + dict_unref(dict); + + if (fd) + fd_unref (fd); + + return ret; + +} + + +int +gf_defrag_start_crawl (void *data) +{ + xlator_t *this = NULL; + dht_conf_t *conf = NULL; + gf_defrag_info_t *defrag = NULL; + int ret = -1; + loc_t loc = {0,}; + struct iatt iatt = {0,}; + struct iatt parent = {0,}; + dict_t *fix_layout = NULL; + dict_t *migrate_data = NULL; + dict_t *status = NULL; + glusterfs_ctx_t *ctx = NULL; + + this = data; + if (!this) + goto out; + + ctx = this->ctx; + if (!ctx) + goto out; + + conf = this->private; + if (!conf) + goto out; + + defrag = conf->defrag; + if (!defrag) + goto out; + + gettimeofday (&defrag->start_time, NULL); + dht_build_root_inode (this, &defrag->root_inode); + if (!defrag->root_inode) + goto out; + + dht_build_root_loc (defrag->root_inode, &loc); + + /* fix-layout on '/' first */ + + ret = syncop_lookup (this, &loc, NULL, &iatt, NULL, &parent); + + if (ret) { + 
gf_log (this->name, GF_LOG_ERROR, "look up on / failed"); + goto out; + } + + fix_layout = dict_new (); + if (!fix_layout) { + ret = -1; + goto out; + } + + ret = dict_set_str (fix_layout, GF_XATTR_FIX_LAYOUT_KEY, "yes"); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to set dict str"); + goto out; + } + + ret = syncop_setxattr (this, &loc, fix_layout, 0); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "fix layout on %s failed", + loc.path); + defrag->total_failures++; + goto out; + } + + if (defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX) { + migrate_data = dict_new (); + if (!migrate_data) { + ret = -1; + goto out; + } + if (defrag->cmd == GF_DEFRAG_CMD_START_FORCE) + ret = dict_set_str (migrate_data, + "distribute.migrate-data", "force"); + else + ret = dict_set_str (migrate_data, + "distribute.migrate-data", + "non-force"); + if (ret) + goto out; + } + ret = gf_defrag_fix_layout (this, defrag, &loc, fix_layout, + migrate_data); + if ((defrag->defrag_status != GF_DEFRAG_STATUS_STOPPED) && + (defrag->defrag_status != GF_DEFRAG_STATUS_FAILED)) { + defrag->defrag_status = GF_DEFRAG_STATUS_COMPLETE; + } + + + +out: + LOCK (&defrag->lock); + { + status = dict_new (); + gf_defrag_status_get (defrag, status); + if (ctx->notify) + ctx->notify (GF_EN_DEFRAG_STATUS, status); + if (status) + dict_unref (status); + defrag->is_exiting = 1; + } + UNLOCK (&defrag->lock); + + if (defrag) { + GF_FREE (defrag); + conf->defrag = NULL; + } + + return ret; +} + + +static int +gf_defrag_done (int ret, call_frame_t *sync_frame, void *data) +{ + gf_listener_stop (sync_frame->this); + + STACK_DESTROY (sync_frame->root); + kill (getpid(), SIGTERM); + return 0; +} + +void * +gf_defrag_start (void *data) +{ + int ret = -1; + call_frame_t *frame = NULL; + dht_conf_t *conf = NULL; + gf_defrag_info_t *defrag = NULL; + xlator_t *this = NULL; + + this = data; + conf = this->private; + if (!conf) + goto out; + + defrag = conf->defrag; + if (!defrag) + goto out; + + frame = 
create_frame (this, this->ctx->pool); + if (!frame) + goto out; + + frame->root->pid = GF_CLIENT_PID_DEFRAG; + + defrag->pid = frame->root->pid; + + defrag->defrag_status = GF_DEFRAG_STATUS_STARTED; + + ret = synctask_new (this->ctx->env, gf_defrag_start_crawl, + gf_defrag_done, frame, this); + + if (ret) + gf_log (this->name, GF_LOG_ERROR, "Could not create" + " task for rebalance"); +out: + return NULL; +} + +int +gf_defrag_status_get (gf_defrag_info_t *defrag, dict_t *dict) +{ + int ret = 0; + uint64_t files = 0; + uint64_t size = 0; + uint64_t lookup = 0; + uint64_t failures = 0; + uint64_t skipped = 0; + char *status = ""; + double elapsed = 0; + struct timeval end = {0,}; + + + if (!defrag) + goto out; + + ret = 0; + if (defrag->defrag_status == GF_DEFRAG_STATUS_NOT_STARTED) + goto out; + + files = defrag->total_files; + size = defrag->total_data; + lookup = defrag->num_files_lookedup; + failures = defrag->total_failures; + skipped = defrag->skipped; + + gettimeofday (&end, NULL); + + elapsed = end.tv_sec - defrag->start_time.tv_sec; + + if (!dict) + goto log; + + ret = dict_set_uint64 (dict, "files", files); + if (ret) + gf_log (THIS->name, GF_LOG_WARNING, + "failed to set file count"); + + ret = dict_set_uint64 (dict, "size", size); + if (ret) + gf_log (THIS->name, GF_LOG_WARNING, + "failed to set size of xfer"); + + ret = dict_set_uint64 (dict, "lookups", lookup); + if (ret) + gf_log (THIS->name, GF_LOG_WARNING, + "failed to set lookedup file count"); + + + ret = dict_set_int32 (dict, "status", defrag->defrag_status); + if (ret) + gf_log (THIS->name, GF_LOG_WARNING, + "failed to set status"); + if (elapsed) { + ret = dict_set_double (dict, "run-time", elapsed); + if (ret) + gf_log (THIS->name, GF_LOG_WARNING, + "failed to set run-time"); + } + + ret = dict_set_uint64 (dict, "failures", failures); + if (ret) + gf_log (THIS->name, GF_LOG_WARNING, + "failed to set failure count"); + + ret = dict_set_uint64 (dict, "skipped", skipped); + if (ret) + gf_log 
(THIS->name, GF_LOG_WARNING, + "failed to set skipped file count"); +log: + switch (defrag->defrag_status) { + case GF_DEFRAG_STATUS_NOT_STARTED: + status = "not started"; + break; + case GF_DEFRAG_STATUS_STARTED: + status = "in progress"; + break; + case GF_DEFRAG_STATUS_STOPPED: + status = "stopped"; + break; + case GF_DEFRAG_STATUS_COMPLETE: + status = "completed"; + break; + case GF_DEFRAG_STATUS_FAILED: + status = "failed"; + break; + default: + break; + } + + gf_log (THIS->name, GF_LOG_INFO, "Rebalance is %s. Time taken is %.2f " + "secs", status, elapsed); + gf_log (THIS->name, GF_LOG_INFO, "Files migrated: %"PRIu64", size: %" + PRIu64", lookups: %"PRIu64", failures: %"PRIu64", skipped: " + "%"PRIu64, files, size, lookup, failures, skipped); + + +out: + return 0; +} + +int +gf_defrag_stop (gf_defrag_info_t *defrag, dict_t *output) +{ + /* TODO: set a variable 'stop_defrag' here, it should be checked + in defrag loop */ + int ret = -1; + GF_ASSERT (defrag); + + if (defrag->defrag_status == GF_DEFRAG_STATUS_NOT_STARTED) { + goto out; + } + + gf_log ("", GF_LOG_INFO, "Received stop command on rebalance"); + defrag->defrag_status = GF_DEFRAG_STATUS_STOPPED; + + if (output) + gf_defrag_status_get (defrag, output); + ret = 0; +out: + gf_log ("", GF_LOG_DEBUG, "Returning %d", ret); + return ret; +} diff --git a/xlators/cluster/dht/src/dht-rename.c b/xlators/cluster/dht/src/dht-rename.c index 226ce280d..5d6f4f232 100644 --- a/xlators/cluster/dht/src/dht-rename.c +++ b/xlators/cluster/dht/src/dht-rename.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. 
- - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ /* TODO: link(oldpath, newpath) fails if newpath already exists. DHT should @@ -35,7 +26,8 @@ int dht_rename_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *stbuf, struct iatt *preoldparent, struct iatt *postoldparent, - struct iatt *prenewparent, struct iatt *postnewparent) + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) { dht_local_t *local = NULL; int this_call_cnt = 0; @@ -80,10 +72,11 @@ unwind: WIPE (&local->preparent); WIPE (&local->postparent); + DHT_STRIP_PHASE1_FLAGS (&local->stbuf); DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno, &local->stbuf, &local->preoldparent, &local->postoldparent, - &local->preparent, &local->postparent); + &local->preparent, &local->postparent, xdata); } return 0; @@ -96,7 +89,7 @@ dht_rename_hashed_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *preoldparent, struct iatt *postoldparent, struct iatt *prenewparent, - struct iatt *postnewparent) + struct iatt *postnewparent, dict_t *xdata) { dht_conf_t *conf = NULL; dht_local_t *local = NULL; @@ -146,7 +139,7 @@ dht_rename_hashed_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, STACK_WIND (frame, dht_rename_dir_cbk, conf->subvolumes[i], conf->subvolumes[i]->fops->rename, - &local->loc, &local->loc2); + 
&local->loc, &local->loc2, NULL); if (!--call_cnt) break; } @@ -159,10 +152,11 @@ unwind: WIPE (&local->preparent); WIPE (&local->postparent); + DHT_STRIP_PHASE1_FLAGS (&local->stbuf); DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno, &local->stbuf, &local->preoldparent, &local->postoldparent, - &local->preparent, &local->postparent); + &local->preparent, &local->postparent, NULL); return 0; } @@ -183,19 +177,20 @@ dht_rename_dir_do (call_frame_t *frame, xlator_t *this) STACK_WIND (frame, dht_rename_hashed_dir_cbk, local->dst_hashed, local->dst_hashed->fops->rename, - &local->loc, &local->loc2); + &local->loc, &local->loc2, NULL); return 0; err: DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno, NULL, NULL, - NULL, NULL, NULL); + NULL, NULL, NULL, NULL); return 0; } int dht_rename_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, gf_dirent_t *entries) + int op_ret, int op_errno, gf_dirent_t *entries, + dict_t *xdata) { dht_local_t *local = NULL; int this_call_cnt = -1; @@ -224,7 +219,7 @@ dht_rename_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int dht_rename_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, fd_t *fd) + int op_ret, int op_errno, fd_t *fd, dict_t *xdata) { dht_local_t *local = NULL; int this_call_cnt = -1; @@ -244,7 +239,7 @@ dht_rename_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, STACK_WIND (frame, dht_rename_readdir_cbk, prev->this, prev->this->fops->readdir, - local->fd, 4096, 0); + local->fd, 4096, 0, NULL); return 0; @@ -300,22 +295,54 @@ dht_rename_dir (call_frame_t *frame, xlator_t *this) STACK_WIND (frame, dht_rename_opendir_cbk, conf->subvolumes[i], conf->subvolumes[i]->fops->opendir, - &local->loc2, local->fd); + &local->loc2, local->fd, NULL); } return 0; err: op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); + DHT_STACK_UNWIND (rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL, NULL); return 0; } +#define DHT_MARK_FOP_INTERNAL(xattr) do { \ + int tmp = -1; \ + if (!xattr) { \ + xattr = dict_new (); \ + if (!xattr) \ + break; \ + } \ + tmp = dict_set_str (xattr, GLUSTERFS_INTERNAL_FOP_KEY, "yes"); \ + if (tmp) { \ + gf_log (this->name, GF_LOG_ERROR, "Failed to set" \ + " internal dict key for %s", local->loc.path); \ + } \ + }while (0) +int +dht_rename_done (call_frame_t *frame, xlator_t *this) +{ + dht_local_t *local = NULL; + local = frame->local; + + if (local->linked == _gf_true) { + local->linked = _gf_false; + dht_linkfile_attr_heal (frame, this); + } + DHT_STRIP_PHASE1_FLAGS (&local->stbuf); + DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno, + &local->stbuf, &local->preoldparent, + &local->postoldparent, &local->preparent, + &local->postparent, NULL); + + return 0; +} int dht_rename_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { dht_local_t *local = NULL; call_frame_t *prev = NULL; @@ -344,10 +371,7 @@ dht_rename_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, WIPE (&local->postparent); if (is_last_call (this_call_cnt)) { - DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno, - &local->stbuf, &local->preoldparent, - &local->postoldparent, &local->preparent, - &local->postparent); + dht_rename_done (frame, this); } out: @@ -365,7 +389,7 @@ dht_rename_cleanup (call_frame_t *frame) xlator_t *dst_hashed = NULL; xlator_t *dst_cached = NULL; int call_cnt = 0; - + dict_t *xattr = NULL; local = frame->local; this = frame->this; @@ -389,13 +413,15 @@ dht_rename_cleanup (call_frame_t *frame) if (!call_cnt) goto nolinks; + DHT_MARK_FOP_INTERNAL (xattr); + if (dst_hashed != src_hashed 
&& dst_hashed != src_cached) { gf_log (this->name, GF_LOG_TRACE, "unlinking linkfile %s @ %s => %s", local->loc.path, dst_hashed->name, src_cached->name); STACK_WIND (frame, dht_rename_unlink_cbk, dst_hashed, dst_hashed->fops->unlink, - &local->loc); + &local->loc, 0, xattr); } if (src_cached != dst_hashed) { @@ -404,9 +430,12 @@ dht_rename_cleanup (call_frame_t *frame) local->loc2.path, src_cached->name); STACK_WIND (frame, dht_rename_unlink_cbk, src_cached, src_cached->fops->unlink, - &local->loc2); + &local->loc2, 0, xattr); } + if (xattr) + dict_unref (xattr); + return 0; nolinks: @@ -415,10 +444,11 @@ nolinks: WIPE (&local->preparent); WIPE (&local->postparent); + DHT_STRIP_PHASE1_FLAGS (&local->stbuf); DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno, &local->stbuf, &local->preoldparent, &local->postoldparent, &local->preparent, - &local->postparent); + &local->postparent, NULL); return 0; } @@ -426,9 +456,10 @@ nolinks: int dht_rename_links_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent) + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct iatt *stbuf, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { call_frame_t *prev = NULL; dht_local_t *local = NULL; @@ -442,6 +473,10 @@ dht_rename_links_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->loc.path, prev->this->name, strerror (op_errno)); } + if (local->linked == _gf_true) { + local->linked = _gf_false; + dht_linkfile_attr_heal (frame, this); + } DHT_STACK_DESTROY (frame); return 0; @@ -452,7 +487,8 @@ int dht_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *stbuf, struct iatt *preoldparent, struct iatt *postoldparent, - struct iatt *prenewparent, struct iatt *postnewparent) + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) 
{ dht_local_t *local = NULL; call_frame_t *prev = NULL; @@ -463,6 +499,7 @@ dht_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, xlator_t *rename_subvol = NULL; call_frame_t *link_frame = NULL; dht_local_t *link_local = NULL; + dict_t *xattr = NULL; local = frame->local; prev = cookie; @@ -472,6 +509,8 @@ dht_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, dst_hashed = local->dst_hashed; dst_cached = local->dst_cached; + if (local->linked == _gf_true) + FRAME_SU_UNDO (frame, dht_local_t); if (op_ret == -1) { gf_log (this->name, GF_LOG_WARNING, "%s: rename on %s failed (%s)", local->loc.path, @@ -501,15 +540,25 @@ dht_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, uuid_copy (link_local->gfid, local->loc.inode->gfid); dht_linkfile_create (link_frame, dht_rename_links_create_cbk, - src_cached, dst_hashed, &link_local->loc); + this, src_cached, dst_hashed, + &link_local->loc); } err: - dht_iatt_merge (this, &local->stbuf, stbuf, prev->this); - dht_iatt_merge (this, &local->preoldparent, preoldparent, prev->this); - dht_iatt_merge (this, &local->postoldparent, postoldparent, prev->this); - dht_iatt_merge (this, &local->preparent, prenewparent, prev->this); - dht_iatt_merge (this, &local->postparent, postnewparent, prev->this); + /* Merge attrs only from src_cached. In case there of src_cached != + * dst_hashed, this ignores linkfile attrs. */ + if (prev->this == src_cached) { + dht_iatt_merge (this, &local->stbuf, stbuf, prev->this); + dht_iatt_merge (this, &local->preoldparent, preoldparent, + prev->this); + dht_iatt_merge (this, &local->postoldparent, postoldparent, + prev->this); + dht_iatt_merge (this, &local->preparent, prenewparent, + prev->this); + dht_iatt_merge (this, &local->postparent, postnewparent, + prev->this); + } + /* NOTE: rename_subvol is the same subvolume from which dht_rename_cbk * is called. 
since rename has already happened on rename_subvol, @@ -534,6 +583,8 @@ err: if (local->call_cnt == 0) goto unwind; + DHT_MARK_FOP_INTERNAL (xattr); + if (src_cached != dst_hashed && src_cached != dst_cached) { gf_log (this->name, GF_LOG_TRACE, "deleting old src datafile %s @ %s", @@ -541,7 +592,7 @@ err: STACK_WIND (frame, dht_rename_unlink_cbk, src_cached, src_cached->fops->unlink, - &local->loc); + &local->loc, 0, xattr); } if (src_hashed != rename_subvol && src_hashed != src_cached) { @@ -551,7 +602,7 @@ err: STACK_WIND (frame, dht_rename_unlink_cbk, src_hashed, src_hashed->fops->unlink, - &local->loc); + &local->loc, 0, xattr); } if (dst_cached @@ -563,8 +614,10 @@ err: STACK_WIND (frame, dht_rename_unlink_cbk, dst_cached, dst_cached->fops->unlink, - &local->loc2); + &local->loc2, 0, xattr); } + if (xattr) + dict_unref (xattr); return 0; unwind: @@ -572,15 +625,16 @@ unwind: WIPE (&local->postoldparent); WIPE (&local->preparent); WIPE (&local->postparent); + if (xattr) + dict_unref (xattr); - DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno, - &local->stbuf, &local->preoldparent, - &local->postoldparent, &local->preparent, - &local->postparent); + dht_rename_done (frame, this); return 0; cleanup: + if (xattr) + dict_unref (xattr); dht_rename_cleanup (frame); return 0; @@ -614,9 +668,11 @@ dht_do_rename (call_frame_t *frame) "renaming %s => %s (%s)", local->loc.path, local->loc2.path, rename_subvol->name); + if (local->linked == _gf_true) + FRAME_SU_DO (frame, dht_local_t); STACK_WIND (frame, dht_rename_cbk, rename_subvol, rename_subvol->fops->rename, - &local->loc, &local->loc2); + &local->loc, &local->loc2, NULL); return 0; } @@ -626,7 +682,8 @@ int dht_rename_links_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent) + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { dht_local_t *local = NULL; 
call_frame_t *prev = NULL; @@ -641,7 +698,11 @@ dht_rename_links_cbk (call_frame_t *frame, void *cookie, xlator_t *this, "link/file on %s failed (%s)", prev->this->name, strerror (op_errno)); local->op_ret = -1; - local->op_errno = op_errno; + if (op_errno != ENOENT) + local->op_errno = op_errno; + } else if (local->src_cached == prev->this) { + /* merge of attr returned only from linkfile creation */ + dht_iatt_merge (this, &local->stbuf, stbuf, prev->this); } this_call_cnt = dht_frame_return (frame); @@ -664,7 +725,8 @@ cleanup: int dht_rename_unlink_links_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iatt *preparent, struct iatt *postparent) + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { dht_local_t *local = NULL; call_frame_t *prev = NULL; @@ -706,6 +768,7 @@ dht_rename_create_links (call_frame_t *frame) xlator_t *dst_hashed = NULL; xlator_t *dst_cached = NULL; int call_cnt = 0; + dict_t *xattr = NULL; local = frame->local; @@ -716,6 +779,7 @@ dht_rename_create_links (call_frame_t *frame) dst_hashed = local->dst_hashed; dst_cached = local->dst_cached; + DHT_MARK_FOP_INTERNAL (xattr); if (src_cached == dst_cached) { if (dst_hashed == dst_cached) @@ -727,7 +791,7 @@ dht_rename_create_links (call_frame_t *frame) STACK_WIND (frame, dht_rename_unlink_links_cbk, dst_hashed, dst_hashed->fops->unlink, - &local->loc2); + &local->loc2, 0, xattr); return 0; } @@ -744,7 +808,7 @@ dht_rename_create_links (call_frame_t *frame) "linkfile %s @ %s => %s", local->loc.path, dst_hashed->name, src_cached->name); memcpy (local->gfid, local->loc.inode->gfid, 16); - dht_linkfile_create (frame, dht_rename_links_cbk, + dht_linkfile_create (frame, dht_rename_links_cbk, this, src_cached, dst_hashed, &local->loc); } @@ -754,7 +818,7 @@ dht_rename_create_links (call_frame_t *frame) local->loc2.path, src_cached->name); STACK_WIND (frame, dht_rename_links_cbk, src_cached, src_cached->fops->link, - &local->loc, 
&local->loc2); + &local->loc, &local->loc2, xattr); } nolinks: @@ -762,6 +826,8 @@ nolinks: /* skip to next step */ dht_do_rename (frame); } + if (xattr) + dict_unref (xattr); return 0; } @@ -769,7 +835,7 @@ nolinks: int dht_rename (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc) + loc_t *oldloc, loc_t *newloc, dict_t *xdata) { xlator_t *src_cached = NULL; xlator_t *src_hashed = NULL; @@ -851,7 +917,8 @@ dht_rename (call_frame_t *frame, xlator_t *this, err: op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); + DHT_STACK_UNWIND (rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL, NULL); return 0; } diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c index 1c881be39..3fe96b1c7 100644 --- a/xlators/cluster/dht/src/dht-selfheal.c +++ b/xlators/cluster/dht/src/dht-selfheal.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ #ifndef _CONFIG_H @@ -26,7 +17,7 @@ #include "glusterfs.h" #include "xlator.h" #include "dht-common.h" - +#include "glusterfs-acl.h" #define DHT_SET_LAYOUT_RANGE(layout,i,srt,chunk,cnt,path) do { \ layout->list[i].start = srt; \ @@ -38,43 +29,40 @@ layout->list[i].xlator->name, path); \ } while (0) +#define DHT_RESET_LAYOUT_RANGE(layout) do { \ + int cnt = 0; \ + for (cnt = 0; cnt < layout->cnt; cnt++ ) { \ + layout->list[cnt].start = 0; \ + layout->list[cnt].stop = 0; \ + } \ + } while (0) -static inline uint32_t -dht_find_overlap (int idx, int cnk_idx, uint32_t start, uint32_t stop, - uint32_t chunk_size) +static uint32_t +dht_overlap_calc (dht_layout_t *old, int o, dht_layout_t *new, int n) { - uint32_t overlap = 0; - uint32_t chunk_begin = 0; + if (o >= old->cnt || n >= new->cnt) + return 0; - chunk_begin = cnk_idx * chunk_size; + if (old->list[o].err > 0 || new->list[n].err > 0) + return 0; - /* There is no chance of overlap */ - if ((chunk_begin > stop) || - ((chunk_begin + chunk_size) < start)) - goto out; - - if ((chunk_begin <= start) && - ((chunk_begin + chunk_size) <= stop)) { - overlap = ((chunk_begin + chunk_size) - start); - goto out; + if (old->list[o].start == old->list[o].stop) { + return 0; } - if ((chunk_begin <= start) && - ((chunk_begin + chunk_size) >= stop)) { - overlap = (stop - start); - goto out; + if (new->list[n].start == new->list[n].stop) { + return 0; } - if ((chunk_begin < stop) && - ((chunk_begin + chunk_size) >= stop)) { - overlap = (stop - chunk_begin); - goto out; - } + if ((old->list[o].start > new->list[n].stop) || + (old->list[o].stop < new->list[n].start)) + return 0; -out: - return overlap; + return min (old->list[o].stop, new->list[n].stop) - + max (old->list[o].start, new->list[n].start) + 1; } + int dht_selfheal_dir_finish (call_frame_t *frame, xlator_t *this, int ret) { @@ -82,7 +70,7 @@ dht_selfheal_dir_finish (call_frame_t *frame, xlator_t *this, int ret) local = frame->local; local->selfheal.dir_cbk (frame, NULL, 
frame->this, ret, - local->op_errno); + local->op_errno, NULL); return 0; } @@ -90,7 +78,7 @@ dht_selfheal_dir_finish (call_frame_t *frame, xlator_t *this, int ret) int dht_selfheal_dir_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno) + int op_ret, int op_errno, dict_t *xdata) { dht_local_t *local = NULL; call_frame_t *prev = NULL; @@ -129,18 +117,32 @@ dht_selfheal_dir_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int dht_selfheal_dir_xattr_persubvol (call_frame_t *frame, loc_t *loc, - dht_layout_t *layout, int i) + dht_layout_t *layout, int i, + xlator_t *req_subvol) { xlator_t *subvol = NULL; dict_t *xattr = NULL; int ret = 0; xlator_t *this = NULL; int32_t *disk_layout = NULL; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; - - subvol = layout->list[i].xlator; + local = frame->local; + if (req_subvol) + subvol = req_subvol; + else + subvol = layout->list[i].xlator; this = frame->this; + GF_VALIDATE_OR_GOTO ("", this, err); + GF_VALIDATE_OR_GOTO (this->name, layout, err); + GF_VALIDATE_OR_GOTO (this->name, local, err); + GF_VALIDATE_OR_GOTO (this->name, subvol, err); + VALIDATE_OR_GOTO (this->private, err); + + conf = this->private; + xattr = get_new_dict (); if (!xattr) { goto err; @@ -154,8 +156,7 @@ dht_selfheal_dir_xattr_persubvol (call_frame_t *frame, loc_t *loc, goto err; } - ret = dict_set_bin (xattr, "trusted.glusterfs.dht", - disk_layout, 4 * 4); + ret = dict_set_bin (xattr, conf->xattr_name, disk_layout, 4 * 4); if (ret == -1) { gf_log (this->name, GF_LOG_WARNING, "%s: (subvol %s) failed to set xattr dictionary", @@ -171,9 +172,12 @@ dht_selfheal_dir_xattr_persubvol (call_frame_t *frame, loc_t *loc, dict_ref (xattr); + if (!uuid_is_null (local->gfid)) + uuid_copy (loc->gfid, local->gfid); + STACK_WIND (frame, dht_selfheal_dir_xattr_cbk, subvol, subvol->fops->setxattr, - loc, xattr, 0); + loc, xattr, 0, NULL); dict_unref (xattr); @@ -183,11 +187,10 @@ err: if (xattr) dict_destroy (xattr); 
- if (disk_layout) - GF_FREE (disk_layout); + GF_FREE (disk_layout); dht_selfheal_dir_xattr_cbk (frame, subvol, frame->this, - -1, ENOMEM); + -1, ENOMEM, NULL); return 0; } @@ -198,21 +201,42 @@ dht_fix_dir_xattr (call_frame_t *frame, loc_t *loc, dht_layout_t *layout) int i = 0; int count = 0; xlator_t *this = NULL; + dht_conf_t *conf = NULL; + dht_layout_t *dummy = NULL; local = frame->local; this = frame->this; + conf = this->private; gf_log (this->name, GF_LOG_DEBUG, "writing the new range for all subvolumes"); - local->call_cnt = count = layout->cnt; + local->call_cnt = count = conf->subvolume_cnt; for (i = 0; i < layout->cnt; i++) { - dht_selfheal_dir_xattr_persubvol (frame, loc, layout, i); + dht_selfheal_dir_xattr_persubvol (frame, loc, layout, i, NULL); if (--count == 0) - break; + goto out; } + /* if we are here, subvolcount > layout_count. subvols-per-directory + * option might be set here. We need to clear out layout from the + * non-participating subvolumes, else it will result in overlaps */ + dummy = dht_layout_new (this, 1); + if (!dummy) + goto out; + for (i = 0; i < conf->subvolume_cnt; i++) { + if (_gf_false == + dht_is_subvol_in_layout (layout, conf->subvolumes[i])) { + dht_selfheal_dir_xattr_persubvol (frame, loc, dummy, 0, + conf->subvolumes[i]); + if (--count == 0) + break; + } + } + + dht_layout_unref (this, dummy); +out: return 0; } @@ -223,14 +247,17 @@ dht_selfheal_dir_xattr (call_frame_t *frame, loc_t *loc, dht_layout_t *layout) int missing_xattr = 0; int i = 0; xlator_t *this = NULL; + dht_conf_t *conf = NULL; + dht_layout_t *dummy = NULL; local = frame->local; this = frame->this; + conf = this->private; for (i = 0; i < layout->cnt; i++) { if (layout->list[i].err != -1 || !layout->list[i].stop) { /* err != -1 would mean xattr present on the directory - * or the directory is itself non existant. + * or the directory is non existent. 
* !layout->list[i].stop would mean layout absent */ @@ -254,18 +281,30 @@ dht_selfheal_dir_xattr (call_frame_t *frame, loc_t *loc, dht_layout_t *layout) if (layout->list[i].err != -1 || !layout->list[i].stop) continue; - dht_selfheal_dir_xattr_persubvol (frame, loc, layout, i); + dht_selfheal_dir_xattr_persubvol (frame, loc, layout, i, NULL); if (--missing_xattr == 0) break; } + dummy = dht_layout_new (this, 1); + if (!dummy) + goto out; + for (i = 0; i < conf->subvolume_cnt; i++) { + if (_gf_false == + dht_is_subvol_in_layout (layout, conf->subvolumes[i])) { + dht_selfheal_dir_xattr_persubvol (frame, loc, dummy, 0, + conf->subvolumes[i]); + } + } + dht_layout_unref (this, dummy); +out: return 0; } int dht_selfheal_dir_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, struct iatt *statpre, - struct iatt *statpost) + struct iatt *statpost, dict_t *xdata) { dht_local_t *local = NULL; dht_layout_t *layout = NULL; @@ -306,6 +345,9 @@ dht_selfheal_dir_setattr (call_frame_t *frame, loc_t *loc, struct iatt *stbuf, return 0; } + if (!uuid_is_null (local->gfid)) + uuid_copy (loc->gfid, local->gfid); + local->call_cnt = missing_attr; for (i = 0; i < layout->cnt; i++) { if (layout->list[i].err == -1) { @@ -316,7 +358,7 @@ dht_selfheal_dir_setattr (call_frame_t *frame, loc_t *loc, struct iatt *stbuf, STACK_WIND (frame, dht_selfheal_dir_setattr_cbk, layout->list[i].xlator, layout->list[i].xlator->fops->setattr, - loc, stbuf, valid); + loc, stbuf, valid, NULL); } } @@ -327,7 +369,8 @@ int dht_selfheal_dir_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent) + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { dht_local_t *local = NULL; dht_layout_t *layout = NULL; @@ -373,6 +416,46 @@ out: return 0; } +void +dht_selfheal_dir_mkdir_setacl (dict_t *xattr, dict_t *dict) +{ + data_t *acl_default = NULL; + 
data_t *acl_access = NULL; + xlator_t *this = NULL; + int ret = -1; + + GF_ASSERT (xattr); + GF_ASSERT (dict); + + this = THIS; + GF_ASSERT (this); + + acl_default = dict_get (xattr, POSIX_ACL_DEFAULT_XATTR); + + if (!acl_default) { + gf_log (this->name, GF_LOG_DEBUG, + "ACL_DEFAULT xattr not present"); + goto cont; + } + ret = dict_set (dict, POSIX_ACL_DEFAULT_XATTR, acl_default); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "Could not set ACL_DEFAULT xattr"); +cont: + acl_access = dict_get (xattr, POSIX_ACL_ACCESS_XATTR); + if (!acl_access) { + gf_log (this->name, GF_LOG_DEBUG, + "ACL_ACCESS xattr not present"); + goto out; + } + ret = dict_set (dict, POSIX_ACL_ACCESS_XATTR, acl_access); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "Could not set ACL_ACCESS xattr"); + +out: + return; +} int dht_selfheal_dir_mkdir (call_frame_t *frame, loc_t *loc, @@ -406,16 +489,19 @@ dht_selfheal_dir_mkdir (call_frame_t *frame, loc_t *loc, ret = dict_set_static_bin (dict, "gfid-req", local->gfid, 16); if (ret) - gf_log (this->name, GF_LOG_INFO, + gf_log (this->name, GF_LOG_WARNING, "%s: failed to set gfid in dict", loc->path); } else if (local->params) { /* Send the dictionary from higher layers directly */ dict = dict_ref (local->params); } + /* Set acls */ + if (local->xattr && dict) + dht_selfheal_dir_mkdir_setacl (local->xattr, dict); if (!dict) gf_log (this->name, GF_LOG_WARNING, - "dict is NULL, need to make sure gfid's are same"); + "dict is NULL, need to make sure gfids are same"); for (i = 0; i < layout->cnt; i++) { if (layout->list[i].err == ENOENT || force) { @@ -429,7 +515,7 @@ dht_selfheal_dir_mkdir (call_frame_t *frame, loc_t *loc, loc, st_mode_from_ia (local->stbuf.ia_prot, local->stbuf.ia_type), - dict); + 0, dict); } } @@ -448,7 +534,7 @@ dht_selfheal_layout_alloc_start (xlator_t *this, loc_t *loc, uint32_t hashval = 0; int ret = 0; - ret = dht_hash_compute (layout->type, loc->path, &hashval); + ret = dht_hash_compute (this, layout->type, loc->path, 
&hashval); if (ret == 0) { start = (hashval % layout->cnt); } @@ -471,7 +557,7 @@ dht_get_layout_count (xlator_t *this, dht_layout_t *layout, int new_layout) for (j = 0; j < conf->subvolume_cnt; j++) { if (conf->decommissioned_bricks[j] && conf->decommissioned_bricks[j] == layout->list[i].xlator) { - layout->list[i].err = -EINVAL; + layout->list[i].err = EINVAL; break; } } @@ -479,9 +565,33 @@ dht_get_layout_count (xlator_t *this, dht_layout_t *layout, int new_layout) for (i = 0; i < layout->cnt; i++) { err = layout->list[i].err; - if (err == -1 || err == 0) { - layout->list[i].err = -1; + if (err == -1 || err == 0 || err == ENOENT) { + /* Setting list[i].err = -1 is an indication for + dht_selfheal_layout_new_directory() to assign + a range. We set it to -1 based on any one of + the three criteria: + + - err == -1 already, which means directory + existed but layout was not set on it. + + - err == 0, which means directory exists and + has an old layout piece which will be + overwritten now. + + - err == ENOENT, which means directory does + not exist (possibly racing with mkdir or + finishing half done mkdir). The missing + directory will be attempted to be recreated. + + It is important to note that it is safe + to race with mkdir() as self-heal and + mkdir are idempotent operations. Both will + strive to set the directory and layouts to + the same final state. + */ count++; + if (!err) + layout->list[i].err = -1; } } @@ -496,49 +606,126 @@ dht_get_layout_count (xlator_t *this, dht_layout_t *layout, int new_layout) } } - count = ((layout->spread_cnt) ? layout->spread_cnt : - ((count) ? count : 1)); + /* if layout->spread_cnt is set, check if it is <= available + * subvolumes (down brick and decommissioned bricks are considered + * un-availbale). Else return count (available up bricks) */ + count = ((layout->spread_cnt && + (layout->spread_cnt <= count)) ? + layout->spread_cnt : ((count) ? 
count : 1)); return count; } +void dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc, + dht_layout_t *new_layout); + +void dht_layout_entry_swap (dht_layout_t *layout, int i, int j); +void dht_layout_range_swap (dht_layout_t *layout, int i, int j); + +/* + * It's a bit icky using local variables in a macro, but it makes the rest + * of the code a lot clearer. + */ +#define OV_ENTRY(x,y) table[x*new->cnt+y] + +void +dht_selfheal_layout_maximize_overlap (call_frame_t *frame, loc_t *loc, + dht_layout_t *new, dht_layout_t *old) +{ + int i = 0; + int j = 0; + uint32_t curr_overlap = 0; + uint32_t max_overlap = 0; + int max_overlap_idx = -1; + uint32_t overlap = 0; + uint32_t *table = NULL; + + dht_layout_sort_volname (old); + /* Now both old_layout->list[] and new_layout->list[] + are match the same xlators/subvolumes. i.e, + old_layout->[i] and new_layout->[i] are referring + to the same subvolumes + */ + + /* Build a table of overlaps between new[i] and old[j]. */ + table = alloca(sizeof(overlap)*old->cnt*new->cnt); + if (!table) { + return; + } + memset(table,0,sizeof(overlap)*old->cnt*new->cnt); + for (i = 0; i < new->cnt; ++i) { + for (j = 0; j < old->cnt; ++j) { + OV_ENTRY(i,j) = dht_overlap_calc(old,j,new,i); + } + } + + for (i = 0; i < new->cnt; i++) { + if (new->list[i].err > 0) { + /* Subvol might be marked for decommission + with EINVAL, or some other serious error + marked with positive errno. + */ + continue; + } + + max_overlap = 0; + max_overlap_idx = i; + for (j = (i + 1); j < new->cnt; ++j) { + if (new->list[j].err > 0) { + /* Subvol might be marked for decommission + with EINVAL, or some other serious error + marked with positive errno. + */ + continue; + } + /* Calculate the overlap now. */ + curr_overlap = OV_ENTRY(i,i) + OV_ENTRY(j,j); + /* Calculate the overlap after the proposed swap. */ + overlap = OV_ENTRY(i,j) + OV_ENTRY(j,i); + /* Are we better than status quo? 
*/ + if (overlap > curr_overlap) { + overlap -= curr_overlap; + /* Are we better than the previous choice? */ + if (overlap > max_overlap) { + max_overlap = overlap; + max_overlap_idx = j; + } + } + } + + if (max_overlap_idx != i) { + dht_layout_range_swap (new, i, max_overlap_idx); + /* Need to swap the table values too. */ + for (j = 0; j < old->cnt; ++j) { + overlap = OV_ENTRY(i,j); + OV_ENTRY(i,j) = OV_ENTRY(max_overlap_idx,j); + OV_ENTRY(max_overlap_idx,j) = overlap; + } + } + } +} + + dht_layout_t * dht_fix_layout_of_directory (call_frame_t *frame, loc_t *loc, dht_layout_t *layout) { - uint32_t chunk = 0; - uint32_t start = 0; - uint32_t stop = 0; - uint32_t overlap = 0; - uint32_t max_overlap = 0; - uint32_t chunk_begin = 0; - int count = 0; - int cnt = 0; int i = 0; - int j = 0; - int k = 0; - int loop_cnt = 0; - int start_subvol = 0; - int *fix_array = NULL; xlator_t *this = NULL; dht_layout_t *new_layout = NULL; dht_conf_t *priv = NULL; dht_local_t *local = NULL; + uint32_t subvol_down = 0; + int ret = 0; this = frame->this; priv = this->private; local = frame->local; - count = cnt = dht_get_layout_count (this, layout, 0); - - chunk = ((unsigned long) 0xffffffff) / ((cnt) ? 
cnt : 1); - - start_subvol = dht_selfheal_layout_alloc_start (this, loc, layout); - - fix_array = GF_CALLOC (sizeof (int), layout->cnt, gf_common_mt_char); - if (!fix_array) { - /* No fix, use the existing layout itself */ + if (layout->type == DHT_HASH_TYPE_DM_USER) { + gf_log (THIS->name, GF_LOG_DEBUG, "leaving %s alone", + loc->path); goto done; } @@ -546,98 +733,33 @@ dht_fix_layout_of_directory (call_frame_t *frame, loc_t *loc, if (!new_layout) goto done; - for (i = 0; i < new_layout->cnt; i++) { - /* TODO: fix this in layout_alloc() itself */ - new_layout->list[i].err = -ENOENT; - if (i < layout->cnt) - new_layout->list[i].xlator = layout->list[i].xlator; - } - - /* Check if there are any overlap in layout, and give the proper fix */ - for (i = 0; i < layout->cnt; i++) { - /* No need to fix if 'err' is not '-1' */ - if (layout->list[i].err != -1) - continue; - - /* If already existing layout is having no range, skip it */ - start = layout->list[i].start; - stop = layout->list[i].stop; - if ((stop - start) == 0) - continue; - - max_overlap = 0; + /* If a subvolume is down, do not re-write the layout. 
*/ + ret = dht_layout_anomalies (this, loc, layout, NULL, NULL, NULL, + &subvol_down, NULL, NULL); - /* 'j' is used as starting point of each chunk */ - for (j = 1; j <= count; j++) { - /* if chunk is already used, don't use it again */ - for (k = 0; k < i; k++) - if (j == fix_array[k]) - break; - if (k < i) - continue; - - overlap = dht_find_overlap (i, (j-1), start, stop, chunk); - if (max_overlap < overlap) { - max_overlap = overlap; - fix_array[i] = j; - } - } - - /* If we have any overlap, then use that itself as new - layout for the subvolume */ - if (fix_array[i]) { - chunk_begin = chunk * (fix_array[i] - 1); - new_layout->list[i].err = -1; - DHT_SET_LAYOUT_RANGE (new_layout, i, chunk_begin, - chunk, cnt, loc->path); - /* make sure to give (max - 1) as 'stop' range, - if it is last chunk */ - if (fix_array[i] == count) - new_layout->list[i].stop = 0xffffffff; - if (--cnt == 0) - goto done; - - } + if (subvol_down || (ret == -1)) { + gf_log (this->name, GF_LOG_WARNING, "%u subvolume(s) are down" + ". Skipping fix layout.", subvol_down); + GF_FREE (new_layout); + return NULL; } - /* Now, look for layouts which are not having any overlaps - and give it a fix */ - for (loop_cnt = 0, i = start_subvol; loop_cnt < new_layout->cnt; - i++, loop_cnt++) { - if (i == new_layout->cnt) - i = 0; - - /* If 'fix_array[i]' is set, the layout is already fixed. 
*/ - if (fix_array[i]) - continue; + for (i = 0; i < new_layout->cnt; i++) { + if (layout->list[i].err != ENOSPC) + new_layout->list[i].err = layout->list[i].err; + else + new_layout->list[i].err = -1; - if (layout->list[i].err != -1) { - new_layout->list[i].err = layout->list[i].err; - continue; - } + new_layout->list[i].xlator = layout->list[i].xlator; + } - for (k = 1; k <= count; k++) { - for (j = 0; j < new_layout->cnt; j++) { - if (k == fix_array[j]) - break; - } - /* Didn't find any of the list begining with 'k' */ - if (j == new_layout->cnt) - break; - } + /* First give it a layout as though it is a new directory. This + ensures rotation to kick in */ + dht_layout_sort_volname (new_layout); + dht_selfheal_layout_new_directory (frame, loc, new_layout); - fix_array[i] = k; - chunk_begin = (k - 1) * chunk; - new_layout->list[i].err = -1; - DHT_SET_LAYOUT_RANGE (new_layout, i, chunk_begin, chunk, cnt, - loc->path); - /* make sure to give (max - 1) as 'stop' range, - if it is last chunk */ - if (k == count) - new_layout->list[i].stop = 0xffffffff; - if (--cnt == 0) - goto done; - } + /* Now selectively re-assign ranges only when it helps */ + dht_selfheal_layout_maximize_overlap (frame, loc, new_layout, layout); done: if (new_layout) { @@ -651,7 +773,7 @@ done: local->layout = new_layout; } - return new_layout; + return local->layout; } @@ -675,9 +797,11 @@ dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc, start_subvol = dht_selfheal_layout_alloc_start (this, loc, layout); + /* clear out the range, as we are re-computing here */ + DHT_RESET_LAYOUT_RANGE (layout); for (i = start_subvol; i < layout->cnt; i++) { err = layout->list[i].err; - if (err == -1) { + if (err == -1 || err == ENOENT) { DHT_SET_LAYOUT_RANGE(layout, i, start, chunk, cnt, loc->path); if (--cnt == 0) { @@ -690,7 +814,7 @@ dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc, for (i = 0; i < start_subvol; i++) { err = layout->list[i].err; - if (err == -1) { + if 
(err == -1 || err == ENOENT) { DHT_SET_LAYOUT_RANGE(layout, i, start, chunk, cnt, loc->path); if (--cnt == 0) { @@ -709,35 +833,17 @@ int dht_selfheal_dir_getafix (call_frame_t *frame, loc_t *loc, dht_layout_t *layout) { - dht_conf_t *conf = NULL; - xlator_t *this = NULL; dht_local_t *local = NULL; - int missing = -1; - int down = -1; - int holes = -1; + uint32_t holes = 0; int ret = -1; int i = -1; - int overlaps = -1; + uint32_t overlaps = 0; - this = frame->this; - conf = this->private; local = frame->local; - missing = local->selfheal.missing; - down = local->selfheal.down; holes = local->selfheal.hole_cnt; overlaps = local->selfheal.overlaps_cnt; - if ((missing + down) == conf->subvolume_cnt) { - dht_selfheal_layout_new_directory (frame, loc, layout); - ret = 0; - } - - if (holes <= down) { - /* the down subvol might fill up the holes */ - ret = 0; - } - if (holes || overlaps) { dht_selfheal_layout_new_directory (frame, loc, layout); ret = 0; @@ -789,6 +895,9 @@ dht_fix_directory_layout (call_frame_t *frame, /* No layout sorting required here */ tmp_layout = dht_fix_layout_of_directory (frame, &local->loc, layout); + if (!tmp_layout) { + return -1; + } dht_fix_dir_xattr (frame, &local->loc, tmp_layout); return 0; @@ -811,9 +920,8 @@ dht_selfheal_directory (call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk, dht_layout_anomalies (this, loc, layout, &local->selfheal.hole_cnt, &local->selfheal.overlaps_cnt, - &local->selfheal.missing, - &local->selfheal.down, - &local->selfheal.misc); + NULL, &local->selfheal.down, + &local->selfheal.misc, NULL); down = local->selfheal.down; misc = local->selfheal.misc; @@ -822,14 +930,14 @@ dht_selfheal_directory (call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk, local->selfheal.layout = dht_layout_ref (this, layout); if (down) { - gf_log (this->name, GF_LOG_INFO, + gf_log (this->name, GF_LOG_WARNING, "%d subvolumes down -- not fixing", down); ret = 0; goto sorry_no_fix; } if (misc) { - gf_log (this->name, GF_LOG_INFO, + 
gf_log (this->name, GF_LOG_WARNING, "%d subvolumes have unrecoverable errors", misc); ret = 0; goto sorry_no_fix; @@ -839,7 +947,7 @@ dht_selfheal_directory (call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk, ret = dht_selfheal_dir_getafix (frame, loc, layout); if (ret == -1) { - gf_log (this->name, GF_LOG_INFO, + gf_log (this->name, GF_LOG_WARNING, "not able to form layout for the directory"); goto sorry_no_fix; } @@ -872,3 +980,50 @@ dht_selfheal_restore (call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk, return ret; } + +int +dht_dir_attr_heal (void *data) +{ + call_frame_t *frame = NULL; + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + xlator_t *this = NULL; + dht_conf_t *conf = NULL; + int call_cnt = 0; + int ret = -1; + int i = 0; + + GF_VALIDATE_OR_GOTO ("dht", data, out); + + frame = data; + local = frame->local; + this = frame->this; + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO ("dht", local, out); + conf = this->private; + GF_VALIDATE_OR_GOTO ("dht", conf, out); + + call_cnt = conf->subvolume_cnt; + + for (i = 0; i < call_cnt; i++) { + subvol = conf->subvolumes[i]; + if (!subvol || (subvol == dht_first_up_subvol (this))) + continue; + ret = syncop_setattr (subvol, &local->loc, &local->stbuf, + (GF_SET_ATTR_UID | GF_SET_ATTR_GID), + NULL, NULL); + if (ret) + gf_log ("dht", GF_LOG_ERROR, "Failed to set uid/gid on" + " %s on %s subvol (%s)", local->loc.path, + subvol->name, strerror (errno)); + } +out: + return 0; +} + +int +dht_dir_attr_heal_done (int ret, call_frame_t *sync_frame, void *data) +{ + DHT_STACK_DESTROY (sync_frame); + return 0; +} diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c new file mode 100644 index 000000000..70aac7710 --- /dev/null +++ b/xlators/cluster/dht/src/dht-shared.c @@ -0,0 +1,758 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. 
+ + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +/* TODO: add NS locking */ + +#include "statedump.h" +#include "dht-common.h" + +/* TODO: + - use volumename in xattr instead of "dht" + - use NS locks + - handle all cases in self heal layout reconstruction + - complete linkfile selfheal +*/ +struct volume_options options[]; + +void +dht_layout_dump (dht_layout_t *layout, const char *prefix) +{ + + char key[GF_DUMP_MAX_BUF_LEN]; + int i = 0; + + if (!layout) + goto out; + if (!prefix) + goto out; + + gf_proc_dump_build_key(key, prefix, "cnt"); + gf_proc_dump_write(key, "%d", layout->cnt); + gf_proc_dump_build_key(key, prefix, "preset"); + gf_proc_dump_write(key, "%d", layout->preset); + gf_proc_dump_build_key(key, prefix, "gen"); + gf_proc_dump_write(key, "%d", layout->gen); + if (layout->type != IA_INVAL) { + gf_proc_dump_build_key(key, prefix, "inode type"); + gf_proc_dump_write(key, "%d", layout->type); + } + + if (!IA_ISDIR (layout->type)) + goto out; + + for (i = 0; i < layout->cnt; i++) { + gf_proc_dump_build_key(key, prefix,"list[%d].err", i); + gf_proc_dump_write(key, "%d", layout->list[i].err); + gf_proc_dump_build_key(key, prefix,"list[%d].start", i); + gf_proc_dump_write(key, "%u", layout->list[i].start); + gf_proc_dump_build_key(key, prefix,"list[%d].stop", i); + gf_proc_dump_write(key, "%u", layout->list[i].stop); + if (layout->list[i].xlator) { + gf_proc_dump_build_key(key, prefix, + "list[%d].xlator.type", i); + gf_proc_dump_write(key, "%s", + layout->list[i].xlator->type); + gf_proc_dump_build_key(key, prefix, + "list[%d].xlator.name", i); + gf_proc_dump_write(key, "%s", + layout->list[i].xlator->name); + } + } + +out: + return; +} + + +int32_t +dht_priv_dump 
(xlator_t *this) +{ + char key_prefix[GF_DUMP_MAX_BUF_LEN]; + char key[GF_DUMP_MAX_BUF_LEN]; + int i = 0; + dht_conf_t *conf = NULL; + int ret = -1; + + if (!this) + goto out; + + conf = this->private; + if (!conf) + goto out; + + ret = TRY_LOCK(&conf->subvolume_lock); + if (ret != 0) { + return ret; + } + + gf_proc_dump_add_section("xlator.cluster.dht.%s.priv", this->name); + gf_proc_dump_build_key(key_prefix,"xlator.cluster.dht","%s.priv", + this->name); + gf_proc_dump_write("subvol_cnt","%d", conf->subvolume_cnt); + for (i = 0; i < conf->subvolume_cnt; i++) { + sprintf (key, "subvolumes[%d]", i); + gf_proc_dump_write(key, "%s.%s", conf->subvolumes[i]->type, + conf->subvolumes[i]->name); + if (conf->file_layouts && conf->file_layouts[i]){ + sprintf (key, "file_layouts[%d]", i); + dht_layout_dump(conf->file_layouts[i], key); + } + if (conf->dir_layouts && conf->dir_layouts[i]) { + sprintf (key, "dir_layouts[%d]", i); + dht_layout_dump(conf->dir_layouts[i], key); + } + if (conf->subvolume_status) { + + sprintf (key, "subvolume_status[%d]", i); + gf_proc_dump_write(key, "%d", + (int)conf->subvolume_status[i]); + } + + } + + gf_proc_dump_write("search_unhashed", "%d", conf->search_unhashed); + gf_proc_dump_write("gen", "%d", conf->gen); + gf_proc_dump_write("min_free_disk", "%lf", conf->min_free_disk); + gf_proc_dump_write("min_free_inodes", "%lf", conf->min_free_inodes); + gf_proc_dump_write("disk_unit", "%c", conf->disk_unit); + gf_proc_dump_write("refresh_interval", "%d", conf->refresh_interval); + gf_proc_dump_write("unhashed_sticky_bit", "%d", conf->unhashed_sticky_bit); + if (conf ->du_stats) { + gf_proc_dump_write("du_stats.avail_percent", "%lf", + conf->du_stats->avail_percent); + gf_proc_dump_write("du_stats.avail_space", "%lu", + conf->du_stats->avail_space); + gf_proc_dump_write("du_stats.avail_inodes", "%lf", + conf->du_stats->avail_inodes); + gf_proc_dump_write("du_stats.log", "%lu", conf->du_stats->log); + } + + if (conf->last_stat_fetch.tv_sec) + 
gf_proc_dump_write("last_stat_fetch", "%s", + ctime(&conf->last_stat_fetch.tv_sec)); + + UNLOCK(&conf->subvolume_lock); + +out: + return ret; +} + +int32_t +dht_inodectx_dump (xlator_t *this, inode_t *inode) +{ + int ret = -1; + dht_layout_t *layout = NULL; + + if (!this) + goto out; + if (!inode) + goto out; + + ret = dht_inode_ctx_layout_get (inode, this, &layout); + + if ((ret != 0) || !layout) + return ret; + + gf_proc_dump_add_section("xlator.cluster.dht.%s.inode", this->name); + dht_layout_dump(layout, "layout"); + +out: + return ret; +} + +void +dht_fini (xlator_t *this) +{ + int i = 0; + dht_conf_t *conf = NULL; + + GF_VALIDATE_OR_GOTO ("dht", this, out); + + conf = this->private; + this->private = NULL; + if (conf) { + if (conf->file_layouts) { + for (i = 0; i < conf->subvolume_cnt; i++) { + GF_FREE (conf->file_layouts[i]); + } + GF_FREE (conf->file_layouts); + } + + GF_FREE (conf->subvolumes); + + GF_FREE (conf->subvolume_status); + + GF_FREE (conf); + } +out: + return; +} + +int32_t +mem_acct_init (xlator_t *this) +{ + int ret = -1; + + GF_VALIDATE_OR_GOTO ("dht", this, out); + + ret = xlator_mem_acct_init (this, gf_dht_mt_end + 1); + + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" + "failed"); + return ret; + } +out: + return ret; +} + + +int +dht_parse_decommissioned_bricks (xlator_t *this, dht_conf_t *conf, + const char *bricks) +{ + int i = 0; + int ret = -1; + char *tmpstr = NULL; + char *dup_brick = NULL; + char *node = NULL; + + if (!conf || !bricks) + goto out; + + dup_brick = gf_strdup (bricks); + node = strtok_r (dup_brick, ",", &tmpstr); + while (node) { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (!strcmp (conf->subvolumes[i]->name, node)) { + conf->decommissioned_bricks[i] = + conf->subvolumes[i]; + conf->decommission_subvols_cnt++; + gf_log (this->name, GF_LOG_INFO, + "decommissioning subvolume %s", + conf->subvolumes[i]->name); + break; + } + } + if (i == conf->subvolume_cnt) { + /* Wrong node given. 
*/ + goto out; + } + node = strtok_r (NULL, ",", &tmpstr); + } + + ret = 0; + conf->decommission_in_progress = 1; +out: + GF_FREE (dup_brick); + + return ret; +} + + +int +dht_decommissioned_remove (xlator_t *this, dht_conf_t *conf) +{ + int i = 0; + int ret = -1; + + if (!conf) + goto out; + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->decommissioned_bricks[i]) { + conf->decommissioned_bricks[i] = NULL; + conf->decommission_subvols_cnt--; + } + } + + ret = 0; +out: + + return ret; +} +void +dht_init_regex (xlator_t *this, dict_t *odict, char *name, + regex_t *re, gf_boolean_t *re_valid) +{ + char *temp_str; + + if (dict_get_str (odict, name, &temp_str) != 0) { + if (strcmp(name,"rsync-hash-regex")) { + return; + } + temp_str = "^\\.(.+)\\.[^.]+$"; + } + + if (*re_valid) { + regfree(re); + *re_valid = _gf_false; + } + + if (!strcmp(temp_str,"none")) { + return; + } + + if (regcomp(re,temp_str,REG_EXTENDED) == 0) { + gf_log (this->name, GF_LOG_INFO, + "using regex %s = %s", name, temp_str); + *re_valid = _gf_true; + } + else { + gf_log (this->name, GF_LOG_WARNING, + "compiling regex %s failed", temp_str); + } +} + +int +dht_reconfigure (xlator_t *this, dict_t *options) +{ + dht_conf_t *conf = NULL; + char *temp_str = NULL; + gf_boolean_t search_unhashed; + int ret = -1; + + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO ("dht", options, out); + + conf = this->private; + if (!conf) + return 0; + + if (dict_get_str (options, "lookup-unhashed", &temp_str) == 0) { + /* If option is not "auto", other options _should_ be boolean*/ + if (strcasecmp (temp_str, "auto")) { + if (!gf_string2boolean (temp_str, &search_unhashed)) { + gf_log(this->name, GF_LOG_DEBUG, "Reconfigure:" + " lookup-unhashed reconfigured (%s)", + temp_str); + conf->search_unhashed = search_unhashed; + } else { + gf_log(this->name, GF_LOG_ERROR, "Reconfigure:" + " lookup-unhashed should be boolean," + " not (%s), defaulting to (%d)", + temp_str, conf->search_unhashed); + 
//return -1; + ret = -1; + goto out; + } + } else { + gf_log(this->name, GF_LOG_DEBUG, "Reconfigure:" + " lookup-unhashed reconfigured auto "); + conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO; + } + } + + GF_OPTION_RECONF ("min-free-disk", conf->min_free_disk, options, + percent_or_size, out); + /* option can be any one of percent or bytes */ + conf->disk_unit = 0; + if (conf->min_free_disk < 100.0) + conf->disk_unit = 'p'; + + GF_OPTION_RECONF ("min-free-inodes", conf->min_free_inodes, options, + percent, out); + + GF_OPTION_RECONF ("directory-layout-spread", conf->dir_spread_cnt, + options, uint32, out); + + GF_OPTION_RECONF ("readdir-optimize", conf->readdir_optimize, options, + bool, out); + if (conf->defrag) { + GF_OPTION_RECONF ("rebalance-stats", conf->defrag->stats, + options, bool, out); + } + + if (dict_get_str (options, "decommissioned-bricks", &temp_str) == 0) { + ret = dht_parse_decommissioned_bricks (this, conf, temp_str); + if (ret == -1) + goto out; + } else { + ret = dht_decommissioned_remove (this, conf); + if (ret == -1) + goto out; + } + + dht_init_regex (this, options, "rsync-hash-regex", + &conf->rsync_regex, &conf->rsync_regex_valid); + dht_init_regex (this, options, "extra-hash-regex", + &conf->extra_regex, &conf->extra_regex_valid); + + ret = 0; +out: + return ret; +} + +static int +gf_defrag_pattern_list_fill (xlator_t *this, gf_defrag_info_t *defrag, char *data) +{ + int ret = -1; + char *tmp_str = NULL; + char *tmp_str1 = NULL; + char *dup_str = NULL; + char *num = NULL; + char *pattern_str = NULL; + char *pattern = NULL; + gf_defrag_pattern_list_t *temp_list = NULL; + gf_defrag_pattern_list_t *pattern_list = NULL; + + if (!this || !defrag || !data) + goto out; + + /* Get the pattern for pattern list. 
"pattern:<optional-size>" + * eg: *avi, *pdf:10MB, *:1TB + */ + pattern_str = strtok_r (data, ",", &tmp_str); + while (pattern_str) { + dup_str = gf_strdup (pattern_str); + pattern_list = GF_CALLOC (1, sizeof (gf_defrag_pattern_list_t), + 1); + if (!pattern_list) { + goto out; + } + pattern = strtok_r (dup_str, ":", &tmp_str1); + num = strtok_r (NULL, ":", &tmp_str1); + if (!pattern) + goto out; + if (!num) { + if (gf_string2bytesize(pattern, &pattern_list->size) + == 0) { + pattern = "*"; + } + } else if (gf_string2bytesize (num, &pattern_list->size) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "invalid number format \"%s\"", num); + goto out; + } + memcpy (pattern_list->path_pattern, pattern, strlen (dup_str)); + + if (!defrag->defrag_pattern) + temp_list = NULL; + else + temp_list = defrag->defrag_pattern; + + pattern_list->next = temp_list; + + defrag->defrag_pattern = pattern_list; + pattern_list = NULL; + + GF_FREE (dup_str); + dup_str = NULL; + + pattern_str = strtok_r (NULL, ",", &tmp_str); + } + + ret = 0; +out: + if (ret) + GF_FREE (pattern_list); + GF_FREE (dup_str); + + return ret; +} + +int +dht_init (xlator_t *this) +{ + dht_conf_t *conf = NULL; + char *temp_str = NULL; + int ret = -1; + int i = 0; + gf_defrag_info_t *defrag = NULL; + int cmd = 0; + char *node_uuid = NULL; + + + GF_VALIDATE_OR_GOTO ("dht", this, err); + + if (!this->children) { + gf_log (this->name, GF_LOG_CRITICAL, + "Distribute needs more than one subvolume"); + return -1; + } + + if (!this->parents) { + gf_log (this->name, GF_LOG_WARNING, + "dangling volume. 
check volfile"); + } + + conf = GF_CALLOC (1, sizeof (*conf), gf_dht_mt_dht_conf_t); + if (!conf) { + goto err; + } + + ret = dict_get_int32 (this->options, "rebalance-cmd", &cmd); + + if (cmd) { + defrag = GF_CALLOC (1, sizeof (gf_defrag_info_t), + gf_defrag_info_mt); + + GF_VALIDATE_OR_GOTO (this->name, defrag, err); + + LOCK_INIT (&defrag->lock); + + defrag->is_exiting = 0; + + conf->defrag = defrag; + + ret = dict_get_str (this->options, "node-uuid", &node_uuid); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "node-uuid not " + "specified"); + goto err; + } + + if (uuid_parse (node_uuid, defrag->node_uuid)) { + gf_log (this->name, GF_LOG_ERROR, "Cannot parse " + "glusterd node uuid"); + goto err; + } + + defrag->cmd = cmd; + + defrag->stats = _gf_false; + } + + conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_ON; + if (dict_get_str (this->options, "lookup-unhashed", &temp_str) == 0) { + /* If option is not "auto", other options _should_ be boolean */ + if (strcasecmp (temp_str, "auto")) + gf_string2boolean (temp_str, &conf->search_unhashed); + else + conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO; + } + + GF_OPTION_INIT ("unhashed-sticky-bit", conf->unhashed_sticky_bit, bool, + err); + + GF_OPTION_INIT ("use-readdirp", conf->use_readdirp, bool, err); + + GF_OPTION_INIT ("min-free-disk", conf->min_free_disk, percent_or_size, + err); + + GF_OPTION_INIT ("min-free-inodes", conf->min_free_inodes, percent, + err); + + conf->dir_spread_cnt = conf->subvolume_cnt; + GF_OPTION_INIT ("directory-layout-spread", conf->dir_spread_cnt, + uint32, err); + + GF_OPTION_INIT ("assert-no-child-down", conf->assert_no_child_down, + bool, err); + + GF_OPTION_INIT ("readdir-optimize", conf->readdir_optimize, bool, err); + + if (defrag) { + GF_OPTION_INIT ("rebalance-stats", defrag->stats, bool, err); + if (dict_get_str (this->options, "rebalance-filter", &temp_str) + == 0) { + if (gf_defrag_pattern_list_fill (this, defrag, temp_str) + == -1) { + gf_log (this->name, GF_LOG_ERROR, 
"Cannot parse" + " rebalance-filter (%s)", temp_str); + goto err; + } + } + } + + /* option can be any one of percent or bytes */ + conf->disk_unit = 0; + if (conf->min_free_disk < 100) + conf->disk_unit = 'p'; + + ret = dht_init_subvolumes (this, conf); + if (ret == -1) { + goto err; + } + + if (dict_get_str (this->options, "decommissioned-bricks", &temp_str) == 0) { + ret = dht_parse_decommissioned_bricks (this, conf, temp_str); + if (ret == -1) + goto err; + } + + dht_init_regex (this, this->options, "rsync-hash-regex", + &conf->rsync_regex, &conf->rsync_regex_valid); + dht_init_regex (this, this->options, "extra-hash-regex", + &conf->extra_regex, &conf->extra_regex_valid); + + ret = dht_layouts_init (this, conf); + if (ret == -1) { + goto err; + } + + LOCK_INIT (&conf->subvolume_lock); + LOCK_INIT (&conf->layout_lock); + + conf->gen = 1; + + this->local_pool = mem_pool_new (dht_local_t, 512); + if (!this->local_pool) { + gf_log (this->name, GF_LOG_ERROR, + "failed to create local_t's memory pool"); + goto err; + } + + GF_OPTION_INIT ("xattr-name", conf->xattr_name, str, err); + gf_asprintf (&conf->link_xattr_name, "%s.linkto", conf->xattr_name); + gf_asprintf (&conf->wild_xattr_name, "%s*", conf->xattr_name); + if (!conf->link_xattr_name || !conf->wild_xattr_name) { + goto err; + } + + this->private = conf; + + return 0; + +err: + if (conf) { + if (conf->file_layouts) { + for (i = 0; i < conf->subvolume_cnt; i++) { + GF_FREE (conf->file_layouts[i]); + } + GF_FREE (conf->file_layouts); + } + + GF_FREE (conf->subvolumes); + + GF_FREE (conf->subvolume_status); + + GF_FREE (conf->du_stats); + + GF_FREE (conf->defrag); + + GF_FREE (conf->xattr_name); + GF_FREE (conf->link_xattr_name); + GF_FREE (conf->wild_xattr_name); + + GF_FREE (conf); + } + + return -1; +} + + +struct volume_options options[] = { + { .key = {"lookup-unhashed"}, + .value = {"auto", "yes", "no", "enable", "disable", "1", "0", + "on", "off"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "on", + 
.description = "This option if set to ON, does a lookup through " + "all the sub-volumes, in case a lookup didn't return any result " + "from the hash subvolume. If set to OFF, it does not do a lookup " + "on the remaining subvolumes." + }, + { .key = {"min-free-disk"}, + .type = GF_OPTION_TYPE_PERCENT_OR_SIZET, + .default_value = "10%", + .description = "Percentage/Size of disk space, after which the " + "process starts balancing out the cluster, and logs will appear " + "in log files", + }, + { .key = {"min-free-inodes"}, + .type = GF_OPTION_TYPE_PERCENT, + .default_value = "5%", + .description = "after system has only N% of inodes, warnings " + "starts to appear in log files", + }, + { .key = {"unhashed-sticky-bit"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + }, + { .key = {"use-readdirp"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = "This option if set to ON, forces the use of " + "readdirp, and hence also displays the stats of the files." + }, + { .key = {"assert-no-child-down"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "This option if set to ON, in the event of " + "CHILD_DOWN, will call exit." + }, + { .key = {"directory-layout-spread"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .validate = GF_OPT_VALIDATE_MIN, + .description = "Specifies the directory layout spread." + }, + { .key = {"decommissioned-bricks"}, + .type = GF_OPTION_TYPE_ANY, + .description = "This option if set to ON, decommissions " + "the brick, so that no new data is allowed to be created " + "on that brick." + }, + { .key = {"rebalance-cmd"}, + .type = GF_OPTION_TYPE_INT, + }, + { .key = {"node-uuid"}, + .type = GF_OPTION_TYPE_STR, + }, + { .key = {"rebalance-stats"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "This option if set to ON displays and logs the " + " time taken for migration of each file, during the rebalance " + "process. 
If set to OFF, the rebalance logs will only display the " + "time spent in each directory." + }, + { .key = {"readdir-optimize"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "This option if set to ON enables the optimization " + "that allows DHT to requests non-first subvolumes to filter out " + "directory entries." + }, + { .key = {"rsync-hash-regex"}, + .type = GF_OPTION_TYPE_STR, + /* Setting a default here doesn't work. See dht_init_regex. */ + .description = "Regular expression for stripping temporary-file " + "suffix and prefix used by rsync, to prevent relocation when the " + "file is renamed." + }, + { .key = {"extra-hash-regex"}, + .type = GF_OPTION_TYPE_STR, + /* Setting a default here doesn't work. See dht_init_regex. */ + .description = "Regular expression for stripping temporary-file " + "suffix and prefix used by an application, to prevent relocation when " + "the file is renamed." + }, + { .key = {"rebalance-filter"}, + .type = GF_OPTION_TYPE_STR, + }, + + { .key = {"xattr-name"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "trusted.glusterfs.dht", + .description = "Base for extended attributes used by this " + "translator instance, to avoid conflicts with others above or " + "below it." + }, + + /* NUFA option */ + { .key = {"local-volume-name"}, + .type = GF_OPTION_TYPE_XLATOR + }, + + /* switch option */ + { .key = {"pattern.switch.case"}, + .type = GF_OPTION_TYPE_ANY + }, + + { .key = {NULL} }, +}; diff --git a/xlators/cluster/dht/src/dht.c b/xlators/cluster/dht/src/dht.c index d9499a407..fc0ca2f77 100644 --- a/xlators/cluster/dht/src/dht.c +++ b/xlators/cluster/dht/src/dht.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. 
- GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ @@ -23,443 +14,15 @@ #include "config.h" #endif -/* TODO: add NS locking */ - #include "statedump.h" #include "dht-common.h" -/* TODO: - - use volumename in xattr instead of "dht" - - use NS locks - - handle all cases in self heal layout reconstruction - - complete linkfile selfheal -*/ -struct volume_options options[]; - -void -dht_layout_dump (dht_layout_t *layout, const char *prefix) -{ - - char key[GF_DUMP_MAX_BUF_LEN]; - int i = 0; - - GF_VALIDATE_OR_GOTO ("dht", layout, out); - GF_VALIDATE_OR_GOTO ("dht", prefix, out); - - gf_proc_dump_build_key(key, prefix, "cnt"); - gf_proc_dump_write(key, "%d", layout->cnt); - gf_proc_dump_build_key(key, prefix, "preset"); - gf_proc_dump_write(key, "%d", layout->preset); - gf_proc_dump_build_key(key, prefix, "gen"); - gf_proc_dump_write(key, "%d", layout->gen); - gf_proc_dump_build_key(key, prefix, "type"); - gf_proc_dump_write(key, "%d", layout->type); - - for (i = 0; i < layout->cnt; i++) { - gf_proc_dump_build_key(key, prefix,"list[%d].err", i); - gf_proc_dump_write(key, "%d", layout->list[i].err); - gf_proc_dump_build_key(key, 
prefix,"list[%d].start", i); - gf_proc_dump_write(key, "%u", layout->list[i].start); - gf_proc_dump_build_key(key, prefix,"list[%d].stop", i); - gf_proc_dump_write(key, "%u", layout->list[i].stop); - if (layout->list[i].xlator) { - gf_proc_dump_build_key(key, prefix, - "list[%d].xlator.type", i); - gf_proc_dump_write(key, "%s", - layout->list[i].xlator->type); - gf_proc_dump_build_key(key, prefix, - "list[%d].xlator.name", i); - gf_proc_dump_write(key, "%s", - layout->list[i].xlator->name); - } - } - -out: - return; -} - - -int32_t -dht_priv_dump (xlator_t *this) -{ - char key_prefix[GF_DUMP_MAX_BUF_LEN]; - char key[GF_DUMP_MAX_BUF_LEN]; - int i = 0; - dht_conf_t *conf = NULL; - int ret = -1; - - GF_VALIDATE_OR_GOTO ("dht", this, out); - - conf = this->private; - - if (!conf) - return -1; - - ret = TRY_LOCK(&conf->subvolume_lock); - - if (ret != 0) { - gf_log("", GF_LOG_WARNING, "Unable to lock dht subvolume %s", - this->name); - return ret; - } - - gf_proc_dump_add_section("xlator.cluster.dht.%s.priv", this->name); - gf_proc_dump_build_key(key_prefix,"xlator.cluster.dht","%s.priv", - this->name); - gf_proc_dump_build_key(key, key_prefix, "subvolume_cnt"); - gf_proc_dump_write(key,"%d", conf->subvolume_cnt); - for (i = 0; i < conf->subvolume_cnt; i++) { - gf_proc_dump_build_key(key, key_prefix, "subvolumes[%d]", i); - gf_proc_dump_write(key, "%s.%s", conf->subvolumes[i]->type, - conf->subvolumes[i]->name); - if (conf->file_layouts && conf->file_layouts[i]){ - gf_proc_dump_build_key(key, key_prefix, - "file_layouts[%d]",i); - dht_layout_dump(conf->file_layouts[i], key); - } - if (conf->dir_layouts && conf->dir_layouts[i]) { - gf_proc_dump_build_key(key, key_prefix, - "dir_layouts[%d]",i); - dht_layout_dump(conf->dir_layouts[i], key); - } - if (conf->subvolume_status) { - gf_proc_dump_build_key(key, key_prefix, - "subvolume_status[%d]", i); - gf_proc_dump_write(key, "%d", - (int)conf->subvolume_status[i]); - } - - } - - gf_proc_dump_build_key(key, key_prefix, 
"search_unhashed"); - gf_proc_dump_write(key, "%d", conf->search_unhashed); - gf_proc_dump_build_key(key, key_prefix, "gen"); - gf_proc_dump_write(key, "%d", conf->gen); - gf_proc_dump_build_key(key, key_prefix, "min_free_disk"); - gf_proc_dump_write(key, "%lu", conf->min_free_disk); - gf_proc_dump_build_key(key, key_prefix, "disk_unit"); - gf_proc_dump_write(key, "%c", conf->disk_unit); - gf_proc_dump_build_key(key, key_prefix, "refresh_interval"); - gf_proc_dump_write(key, "%d", conf->refresh_interval); - gf_proc_dump_build_key(key, key_prefix, "unhashed_sticky_bit"); - gf_proc_dump_write(key, "%d", conf->unhashed_sticky_bit); - if (conf ->du_stats) { - gf_proc_dump_build_key(key, key_prefix, - "du_stats.avail_percent"); - gf_proc_dump_write(key, "%lf", conf->du_stats->avail_percent); - gf_proc_dump_build_key(key, key_prefix, - "du_stats.avail_space"); - gf_proc_dump_write(key, "%lu", conf->du_stats->avail_space); - gf_proc_dump_build_key(key, key_prefix, - "du_stats.log"); - gf_proc_dump_write(key, "%lu", conf->du_stats->log); - } - gf_proc_dump_build_key(key, key_prefix, "last_stat_fetch"); - gf_proc_dump_write(key, "%s", ctime(&conf->last_stat_fetch.tv_sec)); - - UNLOCK(&conf->subvolume_lock); - -out: - return ret; -} - -int32_t -dht_inodectx_dump (xlator_t *this, inode_t *inode) -{ - int ret = -1; - char key_prefix[GF_DUMP_MAX_BUF_LEN]; - dht_layout_t *layout = NULL; - uint64_t tmp_layout = 0; - - GF_VALIDATE_OR_GOTO ("dht", this, out); - GF_VALIDATE_OR_GOTO ("dht", inode, out); - - ret = inode_ctx_get (inode, this, &tmp_layout); - - if (ret != 0) - return ret; - - layout = (dht_layout_t *)(long)tmp_layout; - - if (!layout) - return -1; - - gf_proc_dump_build_key(key_prefix, "xlator.cluster.dht", - "%s.inode.%ld", this->name, inode->ino); - dht_layout_dump(layout, key_prefix); - -out: - return ret; -} - -int -notify (xlator_t *this, int event, void *data, ...) 
-{ - int ret = -1; - - GF_VALIDATE_OR_GOTO ("dht", this, out); - - ret = dht_notify (this, event, data); - -out: - return ret; -} - -void -fini (xlator_t *this) -{ - int i = 0; - dht_conf_t *conf = NULL; - - GF_VALIDATE_OR_GOTO ("dht", this, out); - - conf = this->private; - this->private = NULL; - if (conf) { - if (conf->file_layouts) { - for (i = 0; i < conf->subvolume_cnt; i++) { - GF_FREE (conf->file_layouts[i]); - } - GF_FREE (conf->file_layouts); - } - - if (conf->subvolumes) - GF_FREE (conf->subvolumes); - - if (conf->subvolume_status) - GF_FREE (conf->subvolume_status); - - GF_FREE (conf); - } -out: - return; -} - -int32_t -mem_acct_init (xlator_t *this) -{ - int ret = -1; - - GF_VALIDATE_OR_GOTO ("dht", this, out); - - ret = xlator_mem_acct_init (this, gf_dht_mt_end + 1); - - if (ret != 0) { - gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" - "failed"); - return ret; - } -out: - return ret; -} - - -int -dht_parse_decommissioned_bricks (xlator_t *this, dht_conf_t *conf, - const char *bricks) -{ - int i = 0; - int ret = -1; - char *tmpstr = NULL; - char *dup_brick = NULL; - char *node = NULL; - - if (!conf || !bricks) - goto out; - - dup_brick = gf_strdup (bricks); - node = strtok_r (dup_brick, ",", &tmpstr); - while (node) { - for (i = 0; i < conf->subvolume_cnt; i++) { - if (!strcmp (conf->subvolumes[i]->name, node)) { - conf->decommissioned_bricks[i] = - conf->subvolumes[i]; - gf_log (this->name, GF_LOG_INFO, - "decommissioning subvolume %s", - conf->subvolumes[i]->name); - break; - } - } - if (i == conf->subvolume_cnt) { - /* Wrong node given. 
*/ - goto out; - } - node = strtok_r (NULL, ",", &tmpstr); - } - - ret = 0; -out: - if (dup_brick) - GF_FREE (dup_brick); - - return ret; -} - -int -reconfigure (xlator_t *this, dict_t *options) -{ - dht_conf_t *conf = NULL; - char *temp_str = NULL; - gf_boolean_t search_unhashed; - int ret = -1; - - GF_VALIDATE_OR_GOTO ("dht", this, out); - GF_VALIDATE_OR_GOTO ("dht", options, out); - - conf = this->private; - if (!conf) - return 0; - - if (dict_get_str (options, "lookup-unhashed", &temp_str) == 0) { - /* If option is not "auto", other options _should_ be boolean*/ - if (strcasecmp (temp_str, "auto")) { - if (!gf_string2boolean (temp_str, &search_unhashed)) { - gf_log(this->name, GF_LOG_DEBUG, "Reconfigure:" - " lookup-unahashed reconfigured (%s)", - temp_str); - conf->search_unhashed = search_unhashed; - } else { - gf_log(this->name, GF_LOG_ERROR, "Reconfigure:" - " lookup-unahashed should be boolean," - " not (%s), defaulting to (%d)", - temp_str, conf->search_unhashed); - //return -1; - ret = -1; - goto out; - } - } else { - gf_log(this->name, GF_LOG_DEBUG, "Reconfigure:" - " lookup-unahashed reconfigured auto "); - conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO; - } - } - - GF_OPTION_RECONF ("min-free-disk", conf->min_free_disk, options, - percent_or_size, out); - - GF_OPTION_RECONF ("directory-layout-spread", conf->dir_spread_cnt, - options, uint32, out); - - if (dict_get_str (options, "decommissioned-bricks", &temp_str) == 0) { - ret = dht_parse_decommissioned_bricks (this, conf, temp_str); - if (ret == -1) - goto out; - } - - ret = 0; -out: - return ret; -} - - -int -init (xlator_t *this) -{ - dht_conf_t *conf = NULL; - char *temp_str = NULL; - int ret = -1; - int i = 0; - - GF_VALIDATE_OR_GOTO ("dht", this, err); - - if (!this->children) { - gf_log (this->name, GF_LOG_CRITICAL, - "Distribute needs more than one subvolume"); - return -1; - } - - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. 
check volfile"); - } - - conf = GF_CALLOC (1, sizeof (*conf), gf_dht_mt_dht_conf_t); - if (!conf) { - goto err; - } - - conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_ON; - if (dict_get_str (this->options, "lookup-unhashed", &temp_str) == 0) { - /* If option is not "auto", other options _should_ be boolean */ - if (strcasecmp (temp_str, "auto")) - gf_string2boolean (temp_str, &conf->search_unhashed); - else - conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO; - } - - GF_OPTION_INIT ("unhashed-sticky-bit", conf->unhashed_sticky_bit, bool, - err); - - GF_OPTION_INIT ("use-readdirp", conf->use_readdirp, bool, err); - - GF_OPTION_INIT ("min-free-disk", conf->min_free_disk, percent_or_size, - err); - - conf->dir_spread_cnt = conf->subvolume_cnt; - GF_OPTION_INIT ("directory-layout-spread", conf->dir_spread_cnt, - uint32, err); - - GF_OPTION_INIT ("assert-no-child-down", conf->assert_no_child_down, - bool, err); - - ret = dht_init_subvolumes (this, conf); - if (ret == -1) { - goto err; - } - - if (dict_get_str (this->options, "decommissioned-bricks", &temp_str) == 0) { - ret = dht_parse_decommissioned_bricks (this, conf, temp_str); - if (ret == -1) - goto err; - } - - ret = dht_layouts_init (this, conf); - if (ret == -1) { - goto err; - } - - LOCK_INIT (&conf->subvolume_lock); - LOCK_INIT (&conf->layout_lock); - - conf->gen = 1; - - /* Create 'syncop' environment */ - conf->env = syncenv_new (0); - if (!conf->env) { - gf_log (this->name, GF_LOG_ERROR, - "failed to create sync environment %s", - strerror (errno)); - goto err; - } - - this->private = conf; - - return 0; - -err: - if (conf) { - if (conf->file_layouts) { - for (i = 0; i < conf->subvolume_cnt; i++) { - GF_FREE (conf->file_layouts[i]); - } - GF_FREE (conf->file_layouts); - } - - if (conf->subvolumes) - GF_FREE (conf->subvolumes); - - if (conf->subvolume_status) - GF_FREE (conf->subvolume_status); - - if (conf->du_stats) - GF_FREE (conf->du_stats); - - GF_FREE (conf); - } - - return -1; -} - +class_methods_t 
class_methods = { + .init = dht_init, + .fini = dht_fini, + .reconfigure = dht_reconfigure, + .notify = dht_notify +}; struct xlator_fops fops = { .lookup = dht_lookup, @@ -487,6 +50,7 @@ struct xlator_fops fops = { .access = dht_access, .readlink = dht_readlink, .getxattr = dht_getxattr, + .fgetxattr = dht_fgetxattr, .readv = dht_readv, .flush = dht_flush, .fsync = dht_fsync, @@ -495,6 +59,7 @@ struct xlator_fops fops = { .lk = dht_lk, /* Inode write operations */ + .fremovexattr = dht_fremovexattr, .removexattr = dht_removexattr, .setxattr = dht_setxattr, .fsetxattr = dht_fsetxattr, @@ -505,6 +70,9 @@ struct xlator_fops fops = { .fxattrop = dht_fxattrop, .setattr = dht_setattr, .fsetattr = dht_fsetattr, + .fallocate = dht_fallocate, + .discard = dht_discard, + .zerofill = dht_zerofill, }; struct xlator_dumpops dumpops = { @@ -518,38 +86,4 @@ struct xlator_cbks cbks = { // .releasedir = dht_releasedir, .forget = dht_forget }; - - -struct volume_options options[] = { - { .key = {"lookup-unhashed"}, - .value = {"auto", "yes", "no", "enable", "disable", "1", "0", - "on", "off"}, - .type = GF_OPTION_TYPE_STR, - .default_value = "on", - }, - { .key = {"min-free-disk"}, - .type = GF_OPTION_TYPE_PERCENT_OR_SIZET, - .default_value = "10%", - .description = "Percentage/Size of disk space that must be " - "kept free." 
- }, - { .key = {"unhashed-sticky-bit"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "off", - }, - { .key = {"use-readdirp"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "on", - }, - { .key = {"assert-no-child-down"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "off", - }, - { .key = {"directory-layout-spread"}, - .type = GF_OPTION_TYPE_INT, - }, - { .key = {"decommissioned-bricks"}, - .type = GF_OPTION_TYPE_ANY, - }, - { .key = {NULL} }, -}; +; diff --git a/xlators/cluster/dht/src/nufa.c b/xlators/cluster/dht/src/nufa.c index 9dcf224d1..e934acdf0 100644 --- a/xlators/cluster/dht/src/nufa.c +++ b/xlators/cluster/dht/src/nufa.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ @@ -27,6 +18,8 @@ /* TODO: all 'TODO's in dht.c holds good */ +extern struct volume_options options[]; + int nufa_local_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, @@ -44,7 +37,6 @@ nufa_local_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int call_cnt = 0; int ret = 0; - conf = this->private; prev = cookie; @@ -62,7 +54,8 @@ nufa_local_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (op_ret == -1) goto out; - is_linkfile = check_is_linkfile (inode, stbuf, xattr); + is_linkfile = check_is_linkfile (inode, stbuf, xattr, + conf->link_xattr_name); is_dir = check_is_dir (inode, stbuf, xattr); if (!is_dir && !is_linkfile) { @@ -141,7 +134,7 @@ out: err: DHT_STACK_UNWIND (lookup, frame, op_ret, op_errno, - inode, stbuf, xattr, NULL); + inode, stbuf, xattr, postparent); return 0; } @@ -150,7 +143,6 @@ nufa_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) { xlator_t *hashed_subvol = NULL; - xlator_t *cached_subvol = NULL; xlator_t *subvol = NULL; dht_local_t *local = NULL; dht_conf_t *conf = NULL; @@ -182,7 +174,6 @@ nufa_lookup (call_frame_t *frame, xlator_t *this, } hashed_subvol = dht_subvol_get_hashed (this, &local->loc); - cached_subvol = local->cached_subvol; local->hashed_subvol = hashed_subvol; @@ -213,7 +204,7 @@ nufa_lookup (call_frame_t *frame, xlator_t *this, * revalidates directly go to the cached-subvolume. 
*/ ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht", 4 * 4); + conf->xattr_name, 4 * 4); if (ret < 0) { gf_log (this->name, GF_LOG_ERROR, "Failed to set dict value."); @@ -234,7 +225,7 @@ nufa_lookup (call_frame_t *frame, xlator_t *this, } else { do_fresh_lookup: ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht", 4 * 4); + conf->xattr_name, 4 * 4); if (ret < 0) { gf_log (this->name, GF_LOG_ERROR, "Failed to set dict value."); @@ -243,7 +234,7 @@ nufa_lookup (call_frame_t *frame, xlator_t *this, } ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht.linkto", 256); + conf->link_xattr_name, 256); if (ret < 0) { gf_log (this->name, GF_LOG_ERROR, "Failed to set dict value."); @@ -262,7 +253,8 @@ nufa_lookup (call_frame_t *frame, xlator_t *this, err: op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); + DHT_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, + NULL); return 0; } @@ -271,7 +263,7 @@ nufa_create_linkfile_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { dht_local_t *local = NULL; @@ -282,21 +274,21 @@ nufa_create_linkfile_create_cbk (call_frame_t *frame, void *cookie, STACK_WIND (frame, dht_create_cbk, local->cached_subvol, local->cached_subvol->fops->create, - &local->loc, local->flags, local->mode, local->fd, - local->params); + &local->loc, local->flags, local->mode, local->umask, + local->fd, local->params); return 0; err: DHT_STACK_UNWIND (create, frame, -1, op_errno, - NULL, NULL, NULL, NULL, NULL); + NULL, NULL, NULL, NULL, NULL, NULL); return 0; } int nufa_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, mode_t mode, - fd_t *fd, dict_t *params) + mode_t umask, fd_t *fd, dict_t *params) { dht_local_t *local = NULL; dht_conf_t *conf = 
NULL; @@ -331,7 +323,8 @@ nufa_create (call_frame_t *frame, xlator_t *this, if (dht_is_subvol_filled (this, (xlator_t *)conf->private)) { avail_subvol = dht_free_disk_available_subvol (this, - (xlator_t *)conf->private); + (xlator_t *)conf->private, + local); } if (subvol != avail_subvol) { @@ -339,11 +332,10 @@ nufa_create (call_frame_t *frame, xlator_t *this, local->params = dict_ref (params); local->mode = mode; local->flags = flags; - + local->umask = umask; local->cached_subvol = avail_subvol; - dht_linkfile_create (frame, - nufa_create_linkfile_create_cbk, - avail_subvol, subvol, loc); + dht_linkfile_create (frame, nufa_create_linkfile_create_cbk, + this, avail_subvol, subvol, loc); return 0; } @@ -352,14 +344,14 @@ nufa_create (call_frame_t *frame, xlator_t *this, STACK_WIND (frame, dht_create_cbk, subvol, subvol->fops->create, - loc, flags, mode, fd, params); + loc, flags, mode, umask, fd, params); return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; DHT_STACK_UNWIND (create, frame, -1, op_errno, - NULL, NULL, NULL, NULL, NULL); + NULL, NULL, NULL, NULL, NULL, NULL); return 0; } @@ -368,34 +360,39 @@ int nufa_mknod_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { dht_local_t *local = NULL; local = frame->local; + if (!local || !local->cached_subvol) { + op_errno = EINVAL; + op_ret = -1; + goto err; + } if (op_ret >= 0) { - STACK_WIND (frame, dht_newfile_cbk, - local->cached_subvol, + STACK_WIND_COOKIE (frame, dht_newfile_cbk, + (void *)local->cached_subvol, local->cached_subvol, local->cached_subvol->fops->mknod, &local->loc, local->mode, local->rdev, - local->params); + local->umask, local->params); return 0; } - +err: WIPE (postparent); WIPE (preparent); DHT_STACK_UNWIND (link, frame, op_ret, op_errno, - inode, stbuf, preparent, postparent); + inode, stbuf, preparent, 
postparent, xdata); return 0; } int nufa_mknod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dev_t rdev, dict_t *params) + loc_t *loc, mode_t mode, dev_t rdev, mode_t umask, dict_t *params) { dht_local_t *local = NULL; dht_conf_t *conf = NULL; @@ -431,7 +428,8 @@ nufa_mknod (call_frame_t *frame, xlator_t *this, if (dht_is_subvol_filled (this, (xlator_t *)conf->private)) { avail_subvol = dht_free_disk_available_subvol (this, - (xlator_t *)conf->private); + (xlator_t *)conf->private, + local); } if (avail_subvol != subvol) { @@ -439,10 +437,11 @@ nufa_mknod (call_frame_t *frame, xlator_t *this, local->params = dict_ref (params); local->mode = mode; + local->umask = umask; local->rdev = rdev; local->cached_subvol = avail_subvol; - dht_linkfile_create (frame, nufa_mknod_linkfile_cbk, + dht_linkfile_create (frame, nufa_mknod_linkfile_cbk, this, avail_subvol, subvol, loc); return 0; } @@ -450,211 +449,185 @@ nufa_mknod (call_frame_t *frame, xlator_t *this, gf_log (this->name, GF_LOG_TRACE, "creating %s on %s", loc->path, subvol->name); - STACK_WIND (frame, dht_newfile_cbk, - subvol, subvol->fops->mknod, - loc, mode, rdev, params); + STACK_WIND_COOKIE (frame, dht_newfile_cbk, (void *)subvol, subvol, + subvol->fops->mknod, loc, mode, rdev, umask, + params); return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; DHT_STACK_UNWIND (mknod, frame, -1, op_errno, - NULL, NULL, NULL, NULL); + NULL, NULL, NULL, NULL, NULL); return 0; } -int -notify (xlator_t *this, int event, void *data, ...) 
+gf_boolean_t +same_first_part (char *str1, char term1, char *str2, char term2) { - int ret = -1; - - ret = dht_notify (this, event, data); - - return ret; + gf_boolean_t ended1; + gf_boolean_t ended2; + + for (;;) { + ended1 = ((*str1 == '\0') || (*str1 == term1)); + ended2 = ((*str2 == '\0') || (*str2 == term2)); + if (ended1 && ended2) { + return _gf_true; + } + if (ended1 || ended2 || (*str1 != *str2)) { + return _gf_false; + } + ++str1; + ++str2; + } } -void -fini (xlator_t *this) -{ - int i = 0; - dht_conf_t *conf = NULL; +typedef struct nufa_args { + xlator_t *this; + char *volname; + gf_boolean_t addr_match; +} nufa_args_t; - conf = this->private; +static void +nufa_find_local_brick (xlator_t *xl, void *data) +{ + nufa_args_t *args = data; + xlator_t *this = args->this; + char *local_volname = args->volname; + gf_boolean_t addr_match = args->addr_match; + char *brick_host = NULL; + dht_conf_t *conf = this->private; + int ret = -1; + + /*This means a local subvol was already found. 
We pick the first brick + * that is local*/ + if (conf->private) + return; + + if (strcmp (xl->name, local_volname) == 0) { + conf->private = xl; + gf_log (this->name, GF_LOG_INFO, "Using specified subvol %s", + local_volname); + return; + } - if (conf) { - if (conf->file_layouts) { - for (i = 0; i < conf->subvolume_cnt; i++) { - GF_FREE (conf->file_layouts[i]); - } - GF_FREE (conf->file_layouts); - } + if (!addr_match) + return; - if (conf->subvolumes) - GF_FREE (conf->subvolumes); + ret = dict_get_str (xl->options, "remote-host", &brick_host); + if ((ret == 0) && + (gf_is_same_address (local_volname, brick_host) || + gf_is_local_addr (brick_host))) { + conf->private = xl; + gf_log (this->name, GF_LOG_INFO, "Using the first local " + "subvol %s", xl->name); + return; + } - if (conf->subvolume_status) - GF_FREE (conf->subvolume_status); +} - GF_FREE (conf); - } +static void +nufa_to_dht (xlator_t *this) +{ + GF_ASSERT (this); + GF_ASSERT (this->fops); - return; + this->fops->lookup = dht_lookup; + this->fops->create = dht_create; + this->fops->mknod = dht_mknod; } int -init (xlator_t *this) +nufa_find_local_subvol (xlator_t *this, + void (*fn) (xlator_t *each, void* data), void *data) { - dht_conf_t *conf = NULL; - xlator_list_t *trav = NULL; - data_t *data = NULL; - char *local_volname = NULL; - char *temp_str = NULL; - int ret = -1; - int i = 0; - char my_hostname[256]; - uint32_t temp_free_disk = 0; - - if (!this->children) { - gf_log (this->name, GF_LOG_CRITICAL, - "NUFA needs more than one subvolume"); + int ret = -1; + dht_conf_t *conf = this->private; + xlator_list_t *trav = NULL; + xlator_t *parent = NULL; + xlator_t *candidate = NULL; + + xlator_foreach_depth_first (this, fn, data); + if (!conf->private) { + gf_log (this->name, GF_LOG_ERROR, "Couldn't find a local " + "brick"); return -1; } - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. 
check volfile"); - } - - conf = GF_CALLOC (1, sizeof (*conf), - gf_dht_mt_dht_conf_t); - if (!conf) { - goto err; - } - - conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_ON; - if (dict_get_str (this->options, "lookup-unhashed", &temp_str) == 0) { - /* If option is not "auto", other options _should_ be boolean */ - if (strcasecmp (temp_str, "auto")) - gf_string2boolean (temp_str, &conf->search_unhashed); - else - conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO; - } + candidate = conf->private; + trav = candidate->parents; + while (trav) { - ret = dht_init_subvolumes (this, conf); - if (ret == -1) { - goto err; - } + parent = trav->xlator; + if (strcmp (parent->type, "cluster/nufa") == 0) { + gf_log (this->name, GF_LOG_INFO, "Found local subvol, " + "%s", candidate->name); + ret = 0; + conf->private = candidate; + break; + } - ret = dht_layouts_init (this, conf); - if (ret == -1) { - goto err; + candidate = parent; + trav = parent->parents; } - LOCK_INIT (&conf->subvolume_lock); - LOCK_INIT (&conf->layout_lock); + return ret; +} - conf->gen = 1; +int +nufa_init (xlator_t *this) +{ + data_t *data = NULL; + char *local_volname = NULL; + int ret = -1; + char my_hostname[256]; + gf_boolean_t addr_match = _gf_false; + nufa_args_t args = {0, }; - local_volname = "localhost"; - ret = gethostname (my_hostname, 256); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "could not find hostname (%s)", - strerror (errno)); + ret = dht_init(this); + if (ret) { + return ret; } - if (ret == 0) - local_volname = my_hostname; - - data = dict_get (this->options, "local-volume-name"); - if (data) { + if ((data = dict_get (this->options, "local-volume-name"))) { local_volname = data->data; - } - trav = this->children; - while (trav) { - if (strcmp (trav->xlator->name, local_volname) == 0) - break; - trav = trav->next; - } + } else { + addr_match = _gf_true; + local_volname = "localhost"; + ret = gethostname (my_hostname, 256); + if (ret == 0) + local_volname = my_hostname; - if 
(!trav) { - gf_log (this->name, GF_LOG_ERROR, - "Could not find subvolume named '%s'. " - "Please define volume with the name as the hostname " - "or override it with 'option local-volume-name'", - local_volname); - goto err; - } - /* The volume specified exists */ - conf->private = trav->xlator; - - conf->min_free_disk = 10; - conf->disk_unit = 'p'; - - if (dict_get_str (this->options, "min-free-disk", - &temp_str) == 0) { - if (gf_string2percent (temp_str, - &temp_free_disk) == 0) { - if (temp_free_disk > 100) { - gf_string2bytesize (temp_str, - &conf->min_free_disk); - conf->disk_unit = 'b'; - } else { - conf->min_free_disk = (uint64_t)temp_free_disk; - conf->disk_unit = 'p'; - } - } else { - gf_string2bytesize (temp_str, - &conf->min_free_disk); - conf->disk_unit = 'b'; - } - } + else + gf_log (this->name, GF_LOG_WARNING, + "could not find hostname (%s)", + strerror (errno)); - conf->du_stats = GF_CALLOC (conf->subvolume_cnt, sizeof (dht_du_t), - gf_dht_mt_dht_du_t); - if (!conf->du_stats) { - goto err; } - /* Create 'syncop' environment */ - conf->env = syncenv_new (0); - if (!conf->env) { - gf_log (this->name, GF_LOG_ERROR, - "failed to create sync environment %s", - strerror (errno)); - goto err; + args.this = this; + args.volname = local_volname; + args.addr_match = addr_match; + ret = nufa_find_local_subvol (this, nufa_find_local_brick, &args); + if (ret) { + gf_log (this->name, GF_LOG_INFO, + "Unable to find local subvolume, switching " + "to dht mode"); + nufa_to_dht (this); } - - this->private = conf; - return 0; +} -err: - if (conf) { - if (conf->file_layouts) { - for (i = 0; i < conf->subvolume_cnt; i++) { - GF_FREE (conf->file_layouts[i]); - } - GF_FREE (conf->file_layouts); - } - - if (conf->subvolumes) - GF_FREE (conf->subvolumes); - - if (conf->subvolume_status) - GF_FREE (conf->subvolume_status); - - if (conf->du_stats) - GF_FREE (conf->du_stats); - - GF_FREE (conf); - } - return -1; -} +class_methods_t class_methods = { + .init = nufa_init, + 
.fini = dht_fini, + .reconfigure = dht_reconfigure, + .notify = dht_notify +}; struct xlator_fops fops = { @@ -701,19 +674,3 @@ struct xlator_fops fops = { struct xlator_cbks cbks = { .forget = dht_forget }; - - -struct volume_options options[] = { - { .key = {"lookup-unhashed"}, - .value = {"auto", "yes", "no", "enable", "disable", "1", "0", - "on", "off"}, - .type = GF_OPTION_TYPE_STR - }, - { .key = {"local-volume-name"}, - .type = GF_OPTION_TYPE_XLATOR - }, - { .key = {"min-free-disk"}, - .type = GF_OPTION_TYPE_PERCENT_OR_SIZET, - }, - { .key = {NULL} }, -}; diff --git a/xlators/cluster/dht/src/switch.c b/xlators/cluster/dht/src/switch.c index fd3f22ea0..d3ea90ba8 100644 --- a/xlators/cluster/dht/src/switch.c +++ b/xlators/cluster/dht/src/switch.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ @@ -31,6 +22,8 @@ #include <fnmatch.h> #include <string.h> +extern struct volume_options options[]; + struct switch_sched_array { xlator_t *xl; int32_t eligible; @@ -76,29 +69,37 @@ get_switch_matching_subvol (const char *path, dht_conf_t *conf, struct switch_struct *cond = NULL; struct switch_struct *trav = NULL; char *pathname = NULL; - int idx = 0; + int idx = 0; + xlator_t *subvol = NULL; cond = conf->private; + subvol = hashed_subvol; if (!cond) - return hashed_subvol; + goto out; - trav = cond; pathname = gf_strdup (path); + if (!pathname) + goto out; + + trav = cond; while (trav) { if (fnmatch (trav->path_pattern, pathname, FNM_NOESCAPE) == 0) { for (idx = 0; idx < trav->num_child; idx++) { if (trav->array[idx].xl == hashed_subvol) - return hashed_subvol; + goto out; } idx = trav->node_index++; trav->node_index %= trav->num_child; - return trav->array[idx].xl; + subvol = trav->array[idx].xl; + goto out; } trav = trav->next; } +out: GF_FREE (pathname); - return hashed_subvol; + + return subvol; } @@ -136,7 +137,8 @@ switch_local_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (op_ret == -1) goto out; - is_linkfile = check_is_linkfile (inode, stbuf, xattr); + is_linkfile = check_is_linkfile (inode, stbuf, xattr, + conf->link_xattr_name); is_dir = check_is_dir (inode, stbuf, xattr); if (!is_dir && !is_linkfile) { @@ -290,11 +292,11 @@ switch_lookup (call_frame_t *frame, xlator_t *this, * attribute, revalidates directly go to the cached-subvolume. 
*/ ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht", 4 * 4); + conf->xattr_name, 4 * 4); if (ret < 0) gf_log (this->name, GF_LOG_WARNING, - "failed to set dict value for " - "trusted.glusterfs.dht"); + "failed to set dict value for %s", + conf->xattr_name); for (i = 0; i < layout->cnt; i++) { subvol = layout->list[i].xlator; @@ -309,18 +311,18 @@ switch_lookup (call_frame_t *frame, xlator_t *this, } else { do_fresh_lookup: ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht", 4 * 4); + conf->xattr_name, 4 * 4); if (ret < 0) gf_log (this->name, GF_LOG_WARNING, - "failed to set dict value for " - "trusted.glusterfs.dht"); + "failed to set dict value for %s", + conf->xattr_name); ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht.linkto", 256); + conf->link_xattr_name, 256); if (ret < 0) gf_log (this->name, GF_LOG_WARNING, - "failed to set dict value for " - "trusted.glusterfs.dht.linkto"); + "failed to set dict value for %s", + conf->link_xattr_name); if (!hashed_subvol) { gf_log (this->name, GF_LOG_DEBUG, @@ -366,7 +368,8 @@ switch_lookup (call_frame_t *frame, xlator_t *this, err: op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); + DHT_STACK_UNWIND (lookup, frame, -1, op_errno, + NULL, NULL, NULL, NULL); return 0; } @@ -375,7 +378,7 @@ switch_create_linkfile_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { dht_local_t *local = NULL; @@ -386,21 +389,21 @@ switch_create_linkfile_create_cbk (call_frame_t *frame, void *cookie, STACK_WIND (frame, dht_create_cbk, local->cached_subvol, local->cached_subvol->fops->create, - &local->loc, local->flags, local->mode, local->fd, - local->params); + &local->loc, local->flags, local->mode, local->umask, + local->fd, local->params); return 0; err: DHT_STACK_UNWIND (create, frame, -1, op_errno, - NULL, NULL, NULL, NULL, NULL); + NULL, NULL, NULL, NULL, NULL, NULL); return 0; } int switch_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, mode_t mode, - fd_t *fd, dict_t *params) + mode_t umask, fd_t *fd, dict_t *params) { dht_local_t *local = NULL; dht_conf_t *conf = NULL; @@ -434,18 +437,18 @@ switch_create (call_frame_t *frame, xlator_t *this, avail_subvol = get_switch_matching_subvol (loc->path, conf, subvol); if (dht_is_subvol_filled (this, avail_subvol)) { avail_subvol = - dht_free_disk_available_subvol (this, avail_subvol); + dht_free_disk_available_subvol (this, avail_subvol, + local); } if (subvol != avail_subvol) { /* create a link file instead of actual file */ local->mode = mode; local->flags = flags; - + local->umask = umask; local->cached_subvol = avail_subvol; - dht_linkfile_create (frame, - switch_create_linkfile_create_cbk, - avail_subvol, subvol, loc); + dht_linkfile_create (frame, switch_create_linkfile_create_cbk, + this, avail_subvol, subvol, loc); return 0; } @@ -454,14 +457,14 @@ switch_create (call_frame_t *frame, xlator_t *this, STACK_WIND (frame, 
dht_create_cbk, subvol, subvol->fops->create, - loc, flags, mode, fd, params); + loc, flags, mode, umask, fd, params); return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; DHT_STACK_UNWIND (create, frame, -1, op_errno, - NULL, NULL, NULL, NULL, NULL); + NULL, NULL, NULL, NULL, NULL, NULL); return 0; } @@ -470,31 +473,36 @@ int switch_mknod_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { dht_local_t *local = NULL; local = frame->local; + if (!local || !local->cached_subvol) { + op_errno = EINVAL; + op_ret = -1; + goto err; + } if (op_ret >= 0) { - STACK_WIND (frame, dht_newfile_cbk, - local->cached_subvol, + STACK_WIND_COOKIE (frame, dht_newfile_cbk, + (void *)local->cached_subvol, local->cached_subvol, local->cached_subvol->fops->mknod, &local->loc, local->mode, local->rdev, - local->params); + local->umask, local->params); return 0; } - +err: DHT_STACK_UNWIND (link, frame, op_ret, op_errno, - inode, stbuf, preparent, postparent); + inode, stbuf, preparent, postparent, xdata); return 0; } int -switch_mknod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dev_t rdev, dict_t *params) +switch_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t rdev, mode_t umask, dict_t *params) { dht_local_t *local = NULL; dht_conf_t *conf = NULL; @@ -529,7 +537,8 @@ switch_mknod (call_frame_t *frame, xlator_t *this, avail_subvol = get_switch_matching_subvol (loc->path, conf, subvol); if (dht_is_subvol_filled (this, avail_subvol)) { avail_subvol = - dht_free_disk_available_subvol (this, avail_subvol); + dht_free_disk_available_subvol (this, avail_subvol, + local); } if (avail_subvol != subvol) { @@ -537,46 +546,36 @@ switch_mknod (call_frame_t *frame, xlator_t *this, local->params = dict_ref (params); local->mode = mode; + local->umask = umask; local->rdev = 
rdev; local->cached_subvol = avail_subvol; dht_linkfile_create (frame, switch_mknod_linkfile_cbk, - avail_subvol, subvol, loc); + this, avail_subvol, subvol, loc); return 0; } gf_log (this->name, GF_LOG_TRACE, "creating %s on %s", loc->path, subvol->name); - STACK_WIND (frame, dht_newfile_cbk, - subvol, subvol->fops->mknod, - loc, mode, rdev, params); + STACK_WIND_COOKIE (frame, dht_newfile_cbk, (void *)subvol, subvol, + subvol->fops->mknod, loc, mode, rdev, umask, + params); return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; DHT_STACK_UNWIND (mknod, frame, -1, op_errno, - NULL, NULL, NULL, NULL); + NULL, NULL, NULL, NULL, NULL); return 0; } -int -notify (xlator_t *this, int event, void *data, ...) -{ - int ret = -1; - - ret = dht_notify (this, event, data); - - return ret; -} - void -fini (xlator_t *this) +switch_fini (xlator_t *this) { - int i = 0; dht_conf_t *conf = NULL; struct switch_struct *trav = NULL; struct switch_struct *prev = NULL; @@ -587,30 +586,14 @@ fini (xlator_t *this) trav = (struct switch_struct *)conf->private; conf->private = NULL; while (trav) { - if (trav->array) - GF_FREE (trav->array); + GF_FREE (trav->array); prev = trav; trav = trav->next; GF_FREE (prev); } - - if (conf->file_layouts) { - for (i = 0; i < conf->subvolume_cnt; i++) { - GF_FREE (conf->file_layouts[i]); - } - GF_FREE (conf->file_layouts); - } - - if (conf->subvolumes) - GF_FREE (conf->subvolumes); - - if (conf->subvolume_status) - GF_FREE (conf->subvolume_status); - - GF_FREE (conf); } - return; + dht_fini(this); } int @@ -670,8 +653,10 @@ set_switch_pattern (xlator_t *this, dht_conf_t *conf, dup_str = gf_strdup (switch_str); switch_opt = GF_CALLOC (1, sizeof (struct switch_struct), gf_switch_mt_switch_struct); - if (!switch_opt) + if (!switch_opt) { + GF_FREE (dup_str); goto err; + } pattern = strtok_r (dup_str, ":", &tmp_str1); childs = strtok_r (NULL, ":", &tmp_str1); @@ -681,6 +666,7 @@ set_switch_pattern (xlator_t *this, dht_conf_t *conf, "for all the 
unconfigured child nodes," " hence neglecting current option"); switch_str = strtok_r (NULL, ";", &tmp_str); + GF_FREE (switch_opt); GF_FREE (dup_str); continue; } @@ -753,6 +739,7 @@ set_switch_pattern (xlator_t *this, dht_conf_t *conf, /* First entry */ switch_buf = switch_opt; } + switch_opt = NULL; switch_str = strtok_r (NULL, ";", &tmp_str); } @@ -809,19 +796,20 @@ set_switch_pattern (xlator_t *this, dht_conf_t *conf, /* First entry */ switch_buf = switch_opt; } + switch_opt = NULL; } /* */ conf->private = switch_buf; return 0; err: + GF_FREE (switch_buf_array); + GF_FREE (switch_opt); + if (switch_buf) { - if (switch_buf_array) - GF_FREE (switch_buf_array); trav = switch_buf; while (trav) { - if (trav->array) - GF_FREE (trav->array); + GF_FREE (trav->array); switch_opt = trav; trav = trav->next; GF_FREE (switch_opt); @@ -831,68 +819,18 @@ err: } -int -init (xlator_t *this) +int32_t +switch_init (xlator_t *this) { dht_conf_t *conf = NULL; data_t *data = NULL; - char *temp_str = NULL; int ret = -1; - int i = 0; - uint32_t temp_free_disk = 0; - - if (!this->children) { - gf_log (this->name, GF_LOG_CRITICAL, - "SWITCH needs more than one subvolume"); - return -1; - } - - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. 
check volfile"); - } - - conf = GF_CALLOC (1, sizeof (*conf), gf_switch_mt_dht_conf_t); - if (!conf) { - goto err; - } - - conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_ON; - if (dict_get_str (this->options, "lookup-unhashed", &temp_str) == 0) { - /* If option is not "auto", other options _should_ be boolean */ - if (strcasecmp (temp_str, "auto")) - gf_string2boolean (temp_str, &conf->search_unhashed); - else - conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO; - } - conf->unhashed_sticky_bit = 0; - if (dict_get_str (this->options, "unhashed-sticky-bit", - &temp_str) == 0) { - gf_string2boolean (temp_str, &conf->unhashed_sticky_bit); - } - - conf->min_free_disk = 10; - conf->disk_unit = 'p'; - - if (dict_get_str (this->options, "min-free-disk", - &temp_str) == 0) { - if (gf_string2percent (temp_str, - &temp_free_disk) == 0) { - if (temp_free_disk > 100) { - gf_string2bytesize (temp_str, - &conf->min_free_disk); - conf->disk_unit = 'b'; - } else { - conf->min_free_disk = (uint64_t)temp_free_disk; - conf->disk_unit = 'p'; - } - } else { - gf_string2bytesize (temp_str, - &conf->min_free_disk); - conf->disk_unit = 'b'; - } + ret = dht_init(this); + if (ret) { + return ret; } + conf = this->private; data = dict_get (this->options, "pattern.switch.case"); if (data) { @@ -903,65 +841,23 @@ init (xlator_t *this) } } - ret = dht_init_subvolumes (this, conf); - if (ret == -1) { - goto err; - } - - ret = dht_layouts_init (this, conf); - if (ret == -1) { - goto err; - } - - LOCK_INIT (&conf->subvolume_lock); - LOCK_INIT (&conf->layout_lock); - - conf->gen = 1; - - conf->du_stats = GF_CALLOC (conf->subvolume_cnt, sizeof (dht_du_t), - gf_switch_mt_dht_du_t); - if (!conf->du_stats) { - goto err; - } - - /* Create 'syncop' environment */ - conf->env = syncenv_new (0); - if (!conf->env) { - gf_log (this->name, GF_LOG_ERROR, - "failed to create sync environment %s", - strerror (errno)); - goto err; - } - this->private = conf; - return 0; err: - if (conf) { - if 
(conf->file_layouts) { - for (i = 0; i < conf->subvolume_cnt; i++) { - GF_FREE (conf->file_layouts[i]); - } - GF_FREE (conf->file_layouts); - } - - if (conf->subvolumes) - GF_FREE (conf->subvolumes); - - if (conf->subvolume_status) - GF_FREE (conf->subvolume_status); - - if (conf->du_stats) - GF_FREE (conf->du_stats); - - GF_FREE (conf); - } - + dht_fini(this); return -1; } +class_methods_t class_methods = { + .init = switch_init, + .fini = switch_fini, + .reconfigure = dht_reconfigure, + .notify = dht_notify +}; + + struct xlator_fops fops = { .lookup = switch_lookup, .create = switch_create, @@ -1006,19 +902,3 @@ struct xlator_fops fops = { struct xlator_cbks cbks = { .forget = dht_forget }; - - -struct volume_options options[] = { - { .key = {"lookup-unhashed"}, - .value = {"auto", "yes", "no", "enable", "disable", "1", "0", - "on", "off"}, - .type = GF_OPTION_TYPE_STR - }, - { .key = {"pattern.switch.case"}, - .type = GF_OPTION_TYPE_ANY - }, - { .key = {"min-free-disk"}, - .type = GF_OPTION_TYPE_PERCENT_OR_SIZET, - }, - { .key = {NULL} }, -}; diff --git a/xlators/cluster/ha/src/Makefile.am b/xlators/cluster/ha/src/Makefile.am index 5f78a2965..5c1364b7f 100644 --- a/xlators/cluster/ha/src/Makefile.am +++ b/xlators/cluster/ha/src/Makefile.am @@ -1,15 +1,16 @@ xlator_LTLIBRARIES = ha.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/testing/cluster -ha_la_LDFLAGS = -module -avoidversion +ha_la_LDFLAGS = -module -avoid-version ha_la_SOURCES = ha-helpers.c ha.c ha_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la noinst_HEADERS = ha.h -AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ - -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) CLEANFILES = diff --git a/xlators/cluster/ha/src/ha-helpers.c b/xlators/cluster/ha/src/ha-helpers.c index 1e4af1b62..19be1ed27 100644 --- 
a/xlators/cluster/ha/src/ha-helpers.c +++ b/xlators/cluster/ha/src/ha-helpers.c @@ -1,22 +1,12 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ #include "xlator.h" #include "call-stub.h" #include "defaults.h" diff --git a/xlators/cluster/ha/src/ha-mem-types.h b/xlators/cluster/ha/src/ha-mem-types.h index 9bfb3972b..e5e97d237 100644 --- a/xlators/cluster/ha/src/ha-mem-types.h +++ b/xlators/cluster/ha/src/ha-mem-types.h @@ -1,24 +1,13 @@ - /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. 
- - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ - #ifndef __HA_MEM_TYPES_H__ #define __HA_MEM_TYPES_H__ diff --git a/xlators/cluster/ha/src/ha.c b/xlators/cluster/ha/src/ha.c index 38d4229d3..3eccb516b 100644 --- a/xlators/cluster/ha/src/ha.c +++ b/xlators/cluster/ha/src/ha.c @@ -1,22 +1,12 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ /* generate errors randomly, code is simple now, better alogorithm * can be written to decide what error to be returned and when */ @@ -1876,13 +1866,9 @@ err: } if (hafdp) { - if (hafdp->fdstate) { - GF_FREE (hafdp->fdstate); - } + GF_FREE (hafdp->fdstate); - if (hafdp->path) { - GF_FREE (hafdp->path); - } + GF_FREE (hafdp->path); GF_FREE (hafdp); } diff --git a/xlators/cluster/ha/src/ha.h b/xlators/cluster/ha/src/ha.h index 39b6851e7..e2ed7eaa6 100644 --- a/xlators/cluster/ha/src/ha.h +++ b/xlators/cluster/ha/src/ha.h @@ -1,22 +1,12 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ #ifndef __HA_H_ #define __HA_H_ diff --git a/xlators/cluster/map/src/Makefile.am b/xlators/cluster/map/src/Makefile.am index 26e19137a..a278b05e2 100644 --- a/xlators/cluster/map/src/Makefile.am +++ b/xlators/cluster/map/src/Makefile.am @@ -1,15 +1,16 @@ xlator_LTLIBRARIES = map.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/testing/cluster -map_la_LDFLAGS = -module -avoidversion +map_la_LDFLAGS = -module -avoid-version map_la_SOURCES = map.c map-helper.c map_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la noinst_HEADERS = map.h -AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ - -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) CLEANFILES = diff --git a/xlators/cluster/map/src/map-helper.c b/xlators/cluster/map/src/map-helper.c index 81212fcfd..851397b68 100644 --- a/xlators/cluster/map/src/map-helper.c +++ b/xlators/cluster/map/src/map-helper.c @@ -1,22 +1,12 @@ /* - Copyright (c) 2009-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ + Copyright (c) 2009-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. 
+ This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ #ifndef _CONFIG_H #define _CONFIG_H #include "config.h" diff --git a/xlators/cluster/map/src/map-mem-types.h b/xlators/cluster/map/src/map-mem-types.h index 669b93dc2..3e89f4736 100644 --- a/xlators/cluster/map/src/map-mem-types.h +++ b/xlators/cluster/map/src/map-mem-types.h @@ -1,24 +1,13 @@ - /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ - #ifndef __MAP_MEM_TYPES_H__ #define __MAP_MEM_TYPES_H__ diff --git a/xlators/cluster/map/src/map.c b/xlators/cluster/map/src/map.c index ead9da0b9..6150a33ce 100644 --- a/xlators/cluster/map/src/map.c +++ b/xlators/cluster/map/src/map.c @@ -1,22 +1,12 @@ /* - Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. 
- - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ + Copyright (c) 2010-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ #ifndef _CONFIG_H #define _CONFIG_H #include "config.h" @@ -2375,8 +2365,7 @@ fini (xlator_t *this) priv = this->private; if (priv) { - if (priv->xlarray) - GF_FREE (priv->xlarray); + GF_FREE (priv->xlarray); trav_map = priv->map; while (trav_map) { diff --git a/xlators/cluster/map/src/map.h b/xlators/cluster/map/src/map.h index bccac437c..7703a543e 100644 --- a/xlators/cluster/map/src/map.h +++ b/xlators/cluster/map/src/map.h @@ -1,22 +1,12 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. 
- - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ - #ifndef __MAP_H__ #define __MAP_H__ diff --git a/xlators/cluster/stripe/src/Makefile.am b/xlators/cluster/stripe/src/Makefile.am index 0db3c9eeb..2d151422a 100644 --- a/xlators/cluster/stripe/src/Makefile.am +++ b/xlators/cluster/stripe/src/Makefile.am @@ -2,16 +2,19 @@ xlator_LTLIBRARIES = stripe.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster -stripe_la_LDFLAGS = -module -avoidversion +stripe_la_LDFLAGS = -module -avoid-version + +stripe_la_SOURCES = stripe.c stripe-helpers.c \ + $(top_builddir)/xlators/lib/src/libxlator.c -stripe_la_SOURCES = stripe.c $(top_builddir)/xlators/lib/src/libxlator.c stripe_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la noinst_HEADERS = stripe.h stripe-mem-types.h $(top_builddir)/xlators/lib/src/libxlator.h -AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\ - -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) \ +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ -I$(top_srcdir)/xlators/lib/src +AM_CFLAGS = -Wall $(GF_CFLAGS) + CLEANFILES = diff --git a/xlators/cluster/stripe/src/stripe-helpers.c b/xlators/cluster/stripe/src/stripe-helpers.c new file mode 100644 index 000000000..a83abdc72 --- /dev/null +++ b/xlators/cluster/stripe/src/stripe-helpers.c @@ -0,0 +1,675 @@ +/* + Copyright (c) 2008-2012 Red Hat, 
Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include <fnmatch.h> + +#include "stripe.h" +#include "byte-order.h" +#include "mem-types.h" + +void +stripe_local_wipe (stripe_local_t *local) +{ + if (!local) + goto out; + + loc_wipe (&local->loc); + loc_wipe (&local->loc2); + + if (local->fd) + fd_unref (local->fd); + + if (local->inode) + inode_unref (local->inode); + + if (local->xattr) + dict_unref (local->xattr); + + if (local->xdata) + dict_unref (local->xdata); + +out: + return; +} + + + +int +stripe_aggregate (dict_t *this, char *key, data_t *value, void *data) +{ + dict_t *dst = NULL; + int64_t *ptr = 0, *size = NULL; + int32_t ret = -1; + + dst = data; + + if (strcmp (key, GF_XATTR_QUOTA_SIZE_KEY) == 0) { + ret = dict_get_bin (dst, key, (void **)&size); + if (ret < 0) { + size = GF_CALLOC (1, sizeof (int64_t), + gf_common_mt_char); + if (size == NULL) { + gf_log ("stripe", GF_LOG_WARNING, + "memory allocation failed"); + goto out; + } + ret = dict_set_bin (dst, key, size, sizeof (int64_t)); + if (ret < 0) { + gf_log ("stripe", GF_LOG_WARNING, + "stripe aggregate dict set failed"); + GF_FREE (size); + goto out; + } + } + + ptr = data_to_bin (value); + if (ptr == NULL) { + gf_log ("stripe", GF_LOG_WARNING, "data to bin failed"); + goto out; + } + + *size = hton64 (ntoh64 (*size) + ntoh64 (*ptr)); + } else if (strcmp (key, GF_CONTENT_KEY)) { + /* No need to aggregate 'CONTENT' data */ + ret = dict_set (dst, key, value); + if (ret) + gf_log ("stripe", GF_LOG_WARNING, "xattr dict set failed"); + } + +out: + return 0; +} + + +void +stripe_aggregate_xattr (dict_t *dst, dict_t *src) +{ + if ((dst == NULL) || (src == NULL)) { + goto out; + } + + dict_foreach (src, stripe_aggregate, 
dst); +out: + return; +} + + +int32_t +stripe_xattr_aggregate (char *buffer, stripe_local_t *local, int32_t *total) +{ + int32_t i = 0; + int32_t ret = -1; + int32_t len = 0; + char *sbuf = NULL; + stripe_xattr_sort_t *xattr = NULL; + + if (!buffer || !local || !local->xattr_list) + goto out; + + sbuf = buffer; + + for (i = 0; i < local->nallocs; i++) { + xattr = local->xattr_list + i; + len = xattr->xattr_len; + + if (len && xattr && xattr->xattr_value) { + memcpy (buffer, xattr->xattr_value, len); + buffer += len; + *buffer++ = ' '; + } + } + + *--buffer = '\0'; + if (total) + *total = buffer - sbuf; + ret = 0; + + out: + return ret; +} + +int32_t +stripe_free_xattr_str (stripe_local_t *local) +{ + int32_t i = 0; + int32_t ret = -1; + stripe_xattr_sort_t *xattr = NULL; + + if (!local || !local->xattr_list) + goto out; + + for (i = 0; i < local->nallocs; i++) { + xattr = local->xattr_list + i; + + if (xattr && xattr->xattr_value) + GF_FREE (xattr->xattr_value); + } + + ret = 0; + out: + return ret; +} + + +int32_t +stripe_fill_lockinfo_xattr (xlator_t *this, stripe_local_t *local, + void **xattr_serz) +{ + int32_t ret = -1, i = 0, len = 0; + dict_t *tmp1 = NULL, *tmp2 = NULL; + char *buf = NULL; + stripe_xattr_sort_t *xattr = NULL; + + if (xattr_serz == NULL) { + goto out; + } + + tmp2 = dict_new (); + + if (tmp2 == NULL) { + goto out; + } + + for (i = 0; i < local->nallocs; i++) { + xattr = local->xattr_list + i; + len = xattr->xattr_len; + + if (len && xattr && xattr->xattr_value) { + ret = dict_reset (tmp2); + if (ret < 0) { + gf_log (this->name, GF_LOG_DEBUG, + "dict_reset failed (%s)", + strerror (-ret)); + } + + ret = dict_unserialize (xattr->xattr_value, + xattr->xattr_len, + &tmp2); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "dict_unserialize failed (%s)", + strerror (-ret)); + ret = -1; + goto out; + } + + tmp1 = dict_copy (tmp2, tmp1); + if (tmp1 == NULL) { + gf_log (this->name, GF_LOG_WARNING, + "dict_copy failed (%s)", + strerror (-ret)); 
+ ret = -1; + goto out; + } + } + } + + len = dict_serialized_length (tmp1); + if (len > 0) { + buf = GF_CALLOC (1, len, gf_common_mt_dict_t); + if (buf == NULL) { + ret = -1; + goto out; + } + + ret = dict_serialize (tmp1, buf); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "dict_serialize failed (%s)", strerror (-ret)); + ret = -1; + goto out; + } + + *xattr_serz = buf; + } + + ret = 0; +out: + if (tmp1 != NULL) { + dict_unref (tmp1); + } + + if (tmp2 != NULL) { + dict_unref (tmp2); + } + + return ret; +} + + +int32_t +stripe_fill_pathinfo_xattr (xlator_t *this, stripe_local_t *local, + char **xattr_serz) +{ + int ret = -1; + int32_t padding = 0; + int32_t tlen = 0; + char stripe_size_str[20] = {0,}; + char *pathinfo_serz = NULL; + + if (!local) { + gf_log (this->name, GF_LOG_ERROR, "Possible NULL deref"); + goto out; + } + + (void) snprintf (stripe_size_str, 20, "%ld", + (local->fctx) ? local->fctx->stripe_size : 0); + + /* extra bytes for decorations (brackets and <>'s) */ + padding = strlen (this->name) + strlen (STRIPE_PATHINFO_HEADER) + + strlen (stripe_size_str) + 7; + local->xattr_total_len += (padding + 2); + + pathinfo_serz = GF_CALLOC (local->xattr_total_len, sizeof (char), + gf_common_mt_char); + if (!pathinfo_serz) + goto out; + + /* xlator info */ + (void) sprintf (pathinfo_serz, "(<"STRIPE_PATHINFO_HEADER"%s:[%s]> ", + this->name, stripe_size_str); + + ret = stripe_xattr_aggregate (pathinfo_serz + padding, local, &tlen); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Cannot aggregate pathinfo list"); + goto out; + } + + *(pathinfo_serz + padding + tlen) = ')'; + *(pathinfo_serz + padding + tlen + 1) = '\0'; + + *xattr_serz = pathinfo_serz; + + ret = 0; + out: + return ret; +} + +/** + * stripe_get_matching_bs - Get the matching block size for the given path. 
+ */ +int32_t +stripe_get_matching_bs (const char *path, stripe_private_t *priv) +{ + struct stripe_options *trav = NULL; + uint64_t block_size = 0; + + GF_VALIDATE_OR_GOTO ("stripe", priv, out); + GF_VALIDATE_OR_GOTO ("stripe", path, out); + + LOCK (&priv->lock); + { + block_size = priv->block_size; + trav = priv->pattern; + while (trav) { + if (!fnmatch (trav->path_pattern, path, FNM_NOESCAPE)) { + block_size = trav->block_size; + break; + } + trav = trav->next; + } + } + UNLOCK (&priv->lock); + +out: + return block_size; +} + +int32_t +stripe_ctx_handle (xlator_t *this, call_frame_t *prev, stripe_local_t *local, + dict_t *dict) +{ + char key[256] = {0,}; + data_t *data = NULL; + int32_t index = 0; + stripe_private_t *priv = NULL; + + priv = this->private; + + + if (!local->fctx) { + local->fctx = GF_CALLOC (1, sizeof (stripe_fd_ctx_t), + gf_stripe_mt_stripe_fd_ctx_t); + if (!local->fctx) { + local->op_errno = ENOMEM; + local->op_ret = -1; + goto out; + } + + local->fctx->static_array = 0; + } + /* Stripe block size */ + sprintf (key, "trusted.%s.stripe-size", this->name); + data = dict_get (dict, key); + if (!data) { + local->xattr_self_heal_needed = 1; + gf_log (this->name, GF_LOG_ERROR, + "Failed to get stripe-size"); + goto out; + } else { + if (!local->fctx->stripe_size) { + local->fctx->stripe_size = + data_to_int64 (data); + } + + if (local->fctx->stripe_size != data_to_int64 (data)) { + gf_log (this->name, GF_LOG_WARNING, + "stripe-size mismatch in blocks"); + local->xattr_self_heal_needed = 1; + } + } + + /* Stripe count */ + sprintf (key, "trusted.%s.stripe-count", this->name); + data = dict_get (dict, key); + + if (!data) { + local->xattr_self_heal_needed = 1; + gf_log (this->name, GF_LOG_ERROR, + "Failed to get stripe-count"); + goto out; + } + if (!local->fctx->xl_array) { + local->fctx->stripe_count = data_to_int32 (data); + if (!local->fctx->stripe_count) { + gf_log (this->name, GF_LOG_ERROR, + "error with stripe-count xattr"); + local->op_ret = 
-1; + local->op_errno = EIO; + goto out; + } + + local->fctx->xl_array = GF_CALLOC (local->fctx->stripe_count, + sizeof (xlator_t *), + gf_stripe_mt_xlator_t); + + if (!local->fctx->xl_array) { + local->op_errno = ENOMEM; + local->op_ret = -1; + goto out; + } + } + if (local->fctx->stripe_count != data_to_int32 (data)) { + gf_log (this->name, GF_LOG_ERROR, + "error with stripe-count xattr (%d != %d)", + local->fctx->stripe_count, data_to_int32 (data)); + local->op_ret = -1; + local->op_errno = EIO; + goto out; + } + + /* index */ + sprintf (key, "trusted.%s.stripe-index", this->name); + data = dict_get (dict, key); + if (!data) { + local->xattr_self_heal_needed = 1; + gf_log (this->name, GF_LOG_ERROR, + "Failed to get stripe-index"); + goto out; + } + index = data_to_int32 (data); + if (index > priv->child_count) { + gf_log (this->name, GF_LOG_ERROR, + "error with stripe-index xattr (%d)", index); + local->op_ret = -1; + local->op_errno = EIO; + goto out; + } + if (local->fctx->xl_array) { + if (!local->fctx->xl_array[index]) + local->fctx->xl_array[index] = prev->this; + } + + sprintf(key, "trusted.%s.stripe-coalesce", this->name); + data = dict_get(dict, key); + if (!data) { + /* + * The file was probably created prior to coalesce support. + * Assume non-coalesce mode for this file to maintain backwards + * compatibility. 
+ */ + gf_log(this->name, GF_LOG_DEBUG, "missing stripe-coalesce " + "attr, assume non-coalesce mode"); + local->fctx->stripe_coalesce = 0; + } else { + local->fctx->stripe_coalesce = data_to_int32(data); + } + + +out: + return 0; +} + +int32_t +stripe_xattr_request_build (xlator_t *this, dict_t *dict, uint64_t stripe_size, + uint32_t stripe_count, uint32_t stripe_index, + uint32_t stripe_coalesce) +{ + char key[256] = {0,}; + int32_t ret = -1; + + sprintf (key, "trusted.%s.stripe-size", this->name); + ret = dict_set_int64 (dict, key, stripe_size); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "failed to set %s in xattr_req dict", key); + goto out; + } + + sprintf (key, "trusted.%s.stripe-count", this->name); + ret = dict_set_int32 (dict, key, stripe_count); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "failed to set %s in xattr_req dict", key); + goto out; + } + + sprintf (key, "trusted.%s.stripe-index", this->name); + ret = dict_set_int32 (dict, key, stripe_index); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "failed to set %s in xattr_req dict", key); + goto out; + } + + sprintf(key, "trusted.%s.stripe-coalesce", this->name); + ret = dict_set_int32(dict, key, stripe_coalesce); + if (ret) { + gf_log(this->name, GF_LOG_WARNING, + "failed to set %s in xattr_req_dict", key); + goto out; + } +out: + return ret; +} + + +static int +set_default_block_size (stripe_private_t *priv, char *num) +{ + + int ret = -1; + GF_VALIDATE_OR_GOTO ("stripe", THIS, out); + GF_VALIDATE_OR_GOTO (THIS->name, priv, out); + GF_VALIDATE_OR_GOTO (THIS->name, num, out); + + + if (gf_string2bytesize (num, &priv->block_size) != 0) { + gf_log (THIS->name, GF_LOG_ERROR, + "invalid number format \"%s\"", num); + goto out; + } + + ret = 0; + + out: + return ret; + +} + + +int +set_stripe_block_size (xlator_t *this, stripe_private_t *priv, char *data) +{ + int ret = -1; + char *tmp_str = NULL; + char *tmp_str1 = NULL; + char *dup_str = NULL; + char *stripe_str = NULL; + char 
*pattern = NULL; + char *num = NULL; + struct stripe_options *temp_stripeopt = NULL; + struct stripe_options *stripe_opt = NULL; + + if (!this || !priv || !data) + goto out; + + /* Get the pattern for striping. + "option block-size *avi:10MB" etc */ + stripe_str = strtok_r (data, ",", &tmp_str); + while (stripe_str) { + dup_str = gf_strdup (stripe_str); + stripe_opt = GF_CALLOC (1, sizeof (struct stripe_options), + gf_stripe_mt_stripe_options); + if (!stripe_opt) { + goto out; + } + + pattern = strtok_r (dup_str, ":", &tmp_str1); + num = strtok_r (NULL, ":", &tmp_str1); + if (!num) { + num = pattern; + pattern = "*"; + ret = set_default_block_size (priv, num); + if (ret) + goto out; + } + if (gf_string2bytesize (num, &stripe_opt->block_size) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "invalid number format \"%s\"", num); + goto out; + } + + if (stripe_opt->block_size < STRIPE_MIN_BLOCK_SIZE) { + gf_log (this->name, GF_LOG_ERROR, "Invalid Block-size: " + "%s. Should be atleast %llu bytes", num, + STRIPE_MIN_BLOCK_SIZE); + goto out; + } + if (stripe_opt->block_size % 512) { + gf_log (this->name, GF_LOG_ERROR, "Block-size: %s should" + " be a multiple of 512 bytes", num); + goto out; + } + + memcpy (stripe_opt->path_pattern, pattern, strlen (pattern)); + + gf_log (this->name, GF_LOG_DEBUG, + "block-size : pattern %s : size %"PRId64, + stripe_opt->path_pattern, stripe_opt->block_size); + + if (priv->pattern) + temp_stripeopt = NULL; + else + temp_stripeopt = priv->pattern; + + stripe_opt->next = temp_stripeopt; + + priv->pattern = stripe_opt; + stripe_opt = NULL; + + GF_FREE (dup_str); + dup_str = NULL; + + stripe_str = strtok_r (NULL, ",", &tmp_str); + } + + ret = 0; +out: + + GF_FREE (dup_str); + + GF_FREE (stripe_opt); + + return ret; +} + +int32_t +stripe_iatt_merge (struct iatt *from, struct iatt *to) +{ + if (to->ia_size < from->ia_size) + to->ia_size = from->ia_size; + if (to->ia_mtime < from->ia_mtime) + to->ia_mtime = from->ia_mtime; + if (to->ia_ctime < 
from->ia_ctime) + to->ia_ctime = from->ia_ctime; + if (to->ia_atime < from->ia_atime) + to->ia_atime = from->ia_atime; + return 0; +} + +off_t +coalesced_offset(off_t offset, uint64_t stripe_size, int stripe_count) +{ + size_t line_size = 0; + uint64_t stripe_num = 0; + off_t coalesced_offset = 0; + + line_size = stripe_size * stripe_count; + stripe_num = offset / line_size; + + coalesced_offset = (stripe_num * stripe_size) + + (offset % stripe_size); + + return coalesced_offset; +} + +off_t +uncoalesced_size(off_t size, uint64_t stripe_size, int stripe_count, + int stripe_index) +{ + uint64_t nr_full_stripe_chunks = 0, mod = 0; + + if (!size) + return size; + + /* + * Estimate the number of fully written stripes from the + * local file size. Each stripe_size chunk corresponds to + * a stripe. + */ + nr_full_stripe_chunks = (size / stripe_size) * stripe_count; + mod = size % stripe_size; + + if (!mod) { + /* + * There is no remainder, thus we could have overestimated + * the size of the file in terms of chunks. Trim the number + * of chunks by the following stripe members and leave it + * up to those nodes to respond with a larger size (if + * necessary). + */ + nr_full_stripe_chunks -= stripe_count - + (stripe_index + 1); + size = nr_full_stripe_chunks * stripe_size; + } else { + /* + * There is a remainder and thus we own the last chunk of the + * file. Add the preceding stripe members of the final stripe + * along with the remainder to calculate the exact size. + */ + nr_full_stripe_chunks += stripe_index; + size = nr_full_stripe_chunks * stripe_size + mod; + } + + return size; +} + diff --git a/xlators/cluster/stripe/src/stripe-mem-types.h b/xlators/cluster/stripe/src/stripe-mem-types.h index 29c95c257..e9ac9cf46 100644 --- a/xlators/cluster/stripe/src/stripe-mem-types.h +++ b/xlators/cluster/stripe/src/stripe-mem-types.h @@ -1,21 +1,11 @@ - /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. 
- - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ @@ -25,12 +15,12 @@ #include "mem-types.h" enum gf_stripe_mem_types_ { - gf_stripe_mt_stripe_local_t = gf_common_mt_end + 1, - gf_stripe_mt_iovec, - gf_stripe_mt_readv_replies, + gf_stripe_mt_iovec = gf_common_mt_end + 1, + gf_stripe_mt_stripe_replies, gf_stripe_mt_stripe_fd_ctx_t, gf_stripe_mt_char, gf_stripe_mt_int8_t, + gf_stripe_mt_int32_t, gf_stripe_mt_xlator_t, gf_stripe_mt_stripe_private_t, gf_stripe_mt_stripe_options, diff --git a/xlators/cluster/stripe/src/stripe.c b/xlators/cluster/stripe/src/stripe.c index 9c41c405c..69b510e23 100644 --- a/xlators/cluster/stripe/src/stripe.c +++ b/xlators/cluster/stripe/src/stripe.c @@ -1,25 +1,16 @@ /* - Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. 
- GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ /** * xlators/cluster/stripe: - * Stripe translator, stripes the data accross its child nodes, + * Stripe translator, stripes the data across its child nodes, * as per the options given in the volfile. The striping works * fairly simple. It writes files at different offset as per * calculation. So, 'ls -l' output at the real posix level will @@ -32,6 +23,7 @@ * very much necessary, or else, use it in combination with AFR, to have a * backup copy. */ +#include <fnmatch.h> #include "stripe.h" #include "libxlator.h" @@ -40,73 +32,10 @@ struct volume_options options[]; -void -stripe_local_wipe (stripe_local_t *local) -{ - if (!local) - goto out; - - loc_wipe (&local->loc); - loc_wipe (&local->loc2); - - if (local->fd) - fd_unref (local->fd); - - if (local->inode) - inode_unref (local->inode); - - if (local->xattr) - dict_unref (local->xattr); - - if (local->dict) - dict_unref (local->dict); - -out: - return; -} - -/** - * stripe_get_matching_bs - Get the matching block size for the given path. 
- */ -int32_t -stripe_get_matching_bs (const char *path, struct stripe_options *opts, - uint64_t default_bs) -{ - struct stripe_options *trav = NULL; - char *pathname = NULL; - uint64_t block_size = 0; - - block_size = default_bs; - - if (!path || !opts) - goto out; - - /* FIXME: is a strdup really necessary? */ - pathname = gf_strdup (path); - if (!pathname) - goto out; - - trav = opts; - while (trav) { - if (!fnmatch (trav->path_pattern, pathname, FNM_NOESCAPE)) { - block_size = trav->block_size; - break; - } - trav = trav->next; - } - - GF_FREE (pathname); - -out: - return block_size; -} - - - int32_t stripe_sh_chown_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop) + struct iatt *preop, struct iatt *postop, dict_t *xdata) { int callcnt = -1; stripe_local_t *local = NULL; @@ -135,7 +64,7 @@ int32_t stripe_sh_make_entry_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { stripe_local_t *local = NULL; call_frame_t *prev = NULL; @@ -150,7 +79,7 @@ stripe_sh_make_entry_cbk (call_frame_t *frame, void *cookie, xlator_t *this, STACK_WIND (frame, stripe_sh_chown_cbk, prev->this, prev->this->fops->setattr, &local->loc, - &local->stbuf, (GF_SET_ATTR_UID | GF_SET_ATTR_GID)); + &local->stbuf, (GF_SET_ATTR_UID | GF_SET_ATTR_GID), NULL); out: return 0; @@ -164,7 +93,7 @@ stripe_entry_self_heal (call_frame_t *frame, xlator_t *this, call_frame_t *rframe = NULL; stripe_local_t *rlocal = NULL; stripe_private_t *priv = NULL; - dict_t *dict = NULL; + dict_t *xdata = NULL; int ret = 0; if (!local || !this || !frame) { @@ -182,8 +111,7 @@ stripe_entry_self_heal (call_frame_t *frame, xlator_t *this, if (!rframe) { goto out; } - rlocal = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + rlocal = mem_get0 
(this->local_pool); if (!rlocal) { goto out; } @@ -192,14 +120,14 @@ stripe_entry_self_heal (call_frame_t *frame, xlator_t *this, loc_copy (&rlocal->loc, &local->loc); memcpy (&rlocal->stbuf, &local->stbuf, sizeof (struct iatt)); - dict = dict_new (); - if (!dict) + xdata = dict_new (); + if (!xdata) goto out; - ret = dict_set_static_bin (dict, "gfid-req", local->stbuf.ia_gfid, 16); + ret = dict_set_static_bin (xdata, "gfid-req", local->stbuf.ia_gfid, 16); if (ret) gf_log (this->name, GF_LOG_WARNING, - "failed to set gfid-req"); + "%s: failed to set gfid-req", local->loc.path); while (trav) { if (IA_ISREG (local->stbuf.ia_type)) { @@ -207,101 +135,43 @@ stripe_entry_self_heal (call_frame_t *frame, xlator_t *this, trav->xlator, trav->xlator->fops->mknod, &local->loc, st_mode_from_ia (local->stbuf.ia_prot, - local->stbuf.ia_type), 0, - dict); + local->stbuf.ia_type), + 0, 0, xdata); } if (IA_ISDIR (local->stbuf.ia_type)) { STACK_WIND (rframe, stripe_sh_make_entry_cbk, trav->xlator, trav->xlator->fops->mkdir, - &local->loc, st_mode_from_ia (local->stbuf.ia_prot, - local->stbuf.ia_type), - dict); + &local->loc, + st_mode_from_ia (local->stbuf.ia_prot, + local->stbuf.ia_type), + 0, xdata); } trav = trav->next; } - if (dict) - dict_unref (dict); + if (xdata) + dict_unref (xdata); return 0; out: if (rframe) STRIPE_STACK_DESTROY (rframe); - if (dict) - dict_unref (dict); + if (xdata) + dict_unref (xdata); return 0; } -void -stripe_aggregate (dict_t *this, char *key, data_t *value, void *data) -{ - dict_t *dst = NULL; - int64_t *ptr = 0, *size = NULL; - int32_t ret = -1; - - dst = data; - - if (strcmp (key, GF_XATTR_QUOTA_SIZE_KEY) == 0) { - ret = dict_get_bin (dst, key, (void **)&size); - if (ret < 0) { - size = GF_CALLOC (1, sizeof (int64_t), - gf_common_mt_char); - if (size == NULL) { - gf_log ("stripe", GF_LOG_WARNING, - "memory allocation failed"); - goto out; - } - ret = dict_set_bin (dst, key, size, sizeof (int64_t)); - if (ret < 0) { - gf_log ("stripe", 
GF_LOG_WARNING, - "stripe aggregate dict set failed"); - GF_FREE (size); - goto out; - } - } - - ptr = data_to_bin (value); - if (ptr == NULL) { - gf_log ("stripe", GF_LOG_WARNING, "data to bin failed"); - goto out; - } - - *size = hton64 (ntoh64 (*size) + ntoh64 (*ptr)); - } else if (strcmp (key, GF_CONTENT_KEY)) { - /* No need to aggregate 'CONTENT' data */ - ret = dict_set (dst, key, value); - if (ret) - gf_log ("stripe", GF_LOG_WARNING, "xattr dict set failed"); - } - -out: - return; -} - - -void -stripe_aggregate_xattr (dict_t *dst, dict_t *src) -{ - if ((dst == NULL) || (src == NULL)) { - goto out; - } - - dict_foreach (src, stripe_aggregate, dst); -out: - return; -} - - int32_t stripe_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, dict_t *dict, struct iatt *postparent) + struct iatt *buf, dict_t *xdata, struct iatt *postparent) { - int32_t callcnt = 0; - stripe_local_t *local = NULL; - call_frame_t *prev = NULL; + int32_t callcnt = 0; + stripe_local_t *local = NULL; + call_frame_t *prev = NULL; + int ret = 0; if (!this || !frame || !frame->local || !cookie) { gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); @@ -332,30 +202,42 @@ stripe_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (op_ret >= 0) { local->op_ret = 0; + if (IA_ISREG (buf->ia_type)) { + ret = stripe_ctx_handle (this, prev, local, + xdata); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "Error getting fctx info from" + " dict"); + } if (FIRST_CHILD(this) == prev->this) { local->stbuf = *buf; local->postparent = *postparent; local->inode = inode_ref (inode); - local->dict = dict_ref (dict); + if (xdata) + local->xdata = dict_ref (xdata); if (local->xattr) { - stripe_aggregate_xattr (local->dict, + stripe_aggregate_xattr (local->xdata, local->xattr); dict_unref (local->xattr); local->xattr = NULL; } } - if (!local->dict && !local->xattr) { - local->xattr = dict_ref (dict); - } else if 
(local->dict) { - stripe_aggregate_xattr (local->dict, dict); + + if (!local->xdata && !local->xattr) { + local->xattr = dict_ref (xdata); + } else if (local->xdata) { + stripe_aggregate_xattr (local->xdata, xdata); } else if (local->xattr) { - stripe_aggregate_xattr (local->xattr, dict); + stripe_aggregate_xattr (local->xattr, xdata); } local->stbuf_blocks += buf->ia_blocks; local->postparent_blocks += postparent->ia_blocks; + correct_file_size(buf, local->fctx, prev); + if (local->stbuf_size < buf->ia_size) local->stbuf_size = buf->ia_size; if (local->postparent_size < postparent->ia_size) @@ -375,7 +257,8 @@ stripe_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, UNLOCK (&frame->lock); if (!callcnt) { - if (local->op_ret == 0 && local->entry_self_heal_needed) + if (local->op_ret == 0 && local->entry_self_heal_needed && + !uuid_is_null (local->loc.inode->gfid)) stripe_entry_self_heal (frame, this, local); if (local->failed) @@ -386,11 +269,13 @@ stripe_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->stbuf.ia_size = local->stbuf_size; local->postparent.ia_blocks = local->postparent_blocks; local->postparent.ia_size = local->postparent_size; + inode_ctx_put (local->inode, this, + (uint64_t) (long)local->fctx); } STRIPE_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, local->inode, - &local->stbuf, local->dict, + &local->stbuf, local->xdata, &local->postparent); } out: @@ -399,14 +284,15 @@ out: int32_t stripe_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, - dict_t *xattr_req) + dict_t *xdata) { - stripe_local_t *local = NULL; - xlator_list_t *trav = NULL; - stripe_private_t *priv = NULL; + stripe_local_t *local = NULL; + xlator_list_t *trav = NULL; + stripe_private_t *priv = NULL; int32_t op_errno = EINVAL; int64_t filesize = 0; - int ret = 0; + int ret = 0; + uint64_t tmpctx = 0; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); @@ -418,8 +304,7 @@ stripe_lookup (call_frame_t *frame, xlator_t 
*this, loc_t *loc, trav = this->children; /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; @@ -428,10 +313,37 @@ stripe_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, frame->local = local; loc_copy (&local->loc, loc); - if (xattr_req && dict_get (xattr_req, GF_CONTENT_KEY)) { - ret = dict_get_int64 (xattr_req, GF_CONTENT_KEY, &filesize); + inode_ctx_get (local->inode, this, &tmpctx); + if (tmpctx) + local->fctx = (stripe_fd_ctx_t*) (long)tmpctx; + + /* quick-read friendly changes */ + if (xdata && dict_get (xdata, GF_CONTENT_KEY)) { + ret = dict_get_int64 (xdata, GF_CONTENT_KEY, &filesize); if (!ret && (filesize > priv->block_size)) - dict_del (xattr_req, GF_CONTENT_KEY); + dict_del (xdata, GF_CONTENT_KEY); + } + + /* get stripe-size xattr on lookup. This would be required for + * open/read/write/pathinfo calls. Hence we send down the request + * even when type == IA_INVAL */ + + /* + * We aren't guaranteed to have xdata here. We need the format info for + * the file, so allocate xdata if necessary. 
+ */ + if (!xdata) + xdata = dict_new(); + else + xdata = dict_ref(xdata); + + if (xdata && (IA_ISREG (loc->inode->ia_type) || + (loc->inode->ia_type == IA_INVAL))) { + ret = stripe_xattr_request_build (this, xdata, 8, 4, 4, 0); + if (ret) + gf_log (this->name , GF_LOG_ERROR, "Failed to build" + " xattr request for %s", loc->path); + } /* Everytime in stripe lookup, all child nodes @@ -439,11 +351,12 @@ stripe_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, local->call_count = priv->child_count; while (trav) { STACK_WIND (frame, stripe_lookup_cbk, trav->xlator, - trav->xlator->fops->lookup, - loc, xattr_req); + trav->xlator->fops->lookup, loc, xdata); trav = trav->next; } + dict_unref(xdata); + return 0; err: STRIPE_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); @@ -453,7 +366,7 @@ err: int32_t stripe_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf) + int32_t op_ret, int32_t op_errno, struct iatt *buf, dict_t *xdata) { int32_t callcnt = 0; stripe_local_t *local = NULL; @@ -488,6 +401,9 @@ stripe_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } local->stbuf_blocks += buf->ia_blocks; + + correct_file_size(buf, local->fctx, prev); + if (local->stbuf_size < buf->ia_size) local->stbuf_size = buf->ia_size; } @@ -504,18 +420,19 @@ stripe_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } STRIPE_STACK_UNWIND (stat, frame, local->op_ret, - local->op_errno, &local->stbuf); + local->op_errno, &local->stbuf, NULL); } out: return 0; } int32_t -stripe_stat (call_frame_t *frame, xlator_t *this, loc_t *loc) +stripe_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { xlator_list_t *trav = NULL; stripe_local_t *local = NULL; stripe_private_t *priv = NULL; + stripe_fd_ctx_t *fctx = NULL; int32_t op_errno = EINVAL; VALIDATE_OR_GOTO (frame, err); @@ -533,8 +450,7 @@ stripe_stat (call_frame_t *frame, xlator_t *this, loc_t *loc) } /* Initialization 
*/ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; @@ -543,23 +459,30 @@ stripe_stat (call_frame_t *frame, xlator_t *this, loc_t *loc) frame->local = local; local->call_count = priv->child_count; + if (IA_ISREG(loc->inode->ia_type)) { + inode_ctx_get(loc->inode, this, (uint64_t *) &fctx); + if (!fctx) + goto err; + local->fctx = fctx; + } + while (trav) { STACK_WIND (frame, stripe_stat_cbk, trav->xlator, - trav->xlator->fops->stat, loc); + trav->xlator->fops->stat, loc, NULL); trav = trav->next; } return 0; err: - STRIPE_STACK_UNWIND (stat, frame, -1, op_errno, NULL); + STRIPE_STACK_UNWIND (stat, frame, -1, op_errno, NULL, NULL); return 0; } int32_t stripe_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct statvfs *stbuf) + int32_t op_ret, int32_t op_errno, struct statvfs *stbuf, dict_t *xdata) { stripe_local_t *local = NULL; int32_t callcnt = 0; @@ -597,14 +520,14 @@ stripe_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (!callcnt) { STRIPE_STACK_UNWIND (statfs, frame, local->op_ret, - local->op_errno, &local->statvfs_buf); + local->op_errno, &local->statvfs_buf, NULL); } out: return 0; } int32_t -stripe_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc) +stripe_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { stripe_local_t *local = NULL; xlator_list_t *trav = NULL; @@ -619,8 +542,7 @@ stripe_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc) priv = this->private; /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; @@ -632,13 +554,13 @@ stripe_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc) local->call_count = priv->child_count; while (trav) { STACK_WIND (frame, stripe_statfs_cbk, trav->xlator, - 
trav->xlator->fops->statfs, loc); + trav->xlator->fops->statfs, loc, NULL); trav = trav->next; } return 0; err: - STRIPE_STACK_UNWIND (statfs, frame, -1, op_errno, NULL); + STRIPE_STACK_UNWIND (statfs, frame, -1, op_errno, NULL, NULL); return 0; } @@ -647,7 +569,7 @@ err: int32_t stripe_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) + struct iatt *postbuf, dict_t *xdata) { int32_t callcnt = 0; stripe_local_t *local = NULL; @@ -685,6 +607,9 @@ stripe_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->prebuf_blocks += prebuf->ia_blocks; local->postbuf_blocks += postbuf->ia_blocks; + correct_file_size(prebuf, local->fctx, prev); + correct_file_size(postbuf, local->fctx, prev); + if (local->prebuf_size < prebuf->ia_size) local->prebuf_size = prebuf->ia_size; @@ -707,19 +632,21 @@ stripe_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, STRIPE_STACK_UNWIND (truncate, frame, local->op_ret, local->op_errno, &local->pre_buf, - &local->post_buf); + &local->post_buf, NULL); } out: return 0; } int32_t -stripe_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset) +stripe_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, dict_t *xdata) { - xlator_list_t *trav = NULL; stripe_local_t *local = NULL; stripe_private_t *priv = NULL; + stripe_fd_ctx_t *fctx = NULL; int32_t op_errno = EINVAL; + int i, eof_idx; + off_t dest_offset, tmp_offset; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); @@ -728,7 +655,6 @@ stripe_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset) VALIDATE_OR_GOTO (loc->inode, err); priv = this->private; - trav = this->children; if (priv->first_child_down) { op_errno = ENOTCONN; @@ -736,8 +662,7 @@ stripe_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset) } /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - 
gf_stripe_mt_stripe_local_t); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; @@ -746,15 +671,55 @@ stripe_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset) frame->local = local; local->call_count = priv->child_count; - while (trav) { - STACK_WIND (frame, stripe_truncate_cbk, trav->xlator, - trav->xlator->fops->truncate, loc, offset); - trav = trav->next; - } + inode_ctx_get(loc->inode, this, (uint64_t *) &fctx); + if (!fctx) { + gf_log(this->name, GF_LOG_ERROR, "no stripe context"); + op_errno = EINVAL; + goto err; + } + + local->fctx = fctx; + eof_idx = (offset / fctx->stripe_size) % fctx->stripe_count; + + for (i = 0; i < fctx->stripe_count; i++) { + if (!fctx->xl_array[i]) { + gf_log(this->name, GF_LOG_ERROR, + "no xlator at index %d", i); + op_errno = EINVAL; + goto err; + } + + if (fctx->stripe_coalesce) { + /* + * The node that owns EOF is truncated to the exact + * coalesced offset. Nodes prior to this index should + * be rounded up to the size of the complete stripe, + * while nodes after this index should be rounded down + * to the size of the previous stripe. 
+ */ + if (i < eof_idx) + tmp_offset = roof(offset, fctx->stripe_size * + fctx->stripe_count); + else if (i > eof_idx) + tmp_offset = floor(offset, fctx->stripe_size * + fctx->stripe_count); + else + tmp_offset = offset; + + dest_offset = coalesced_offset(tmp_offset, + fctx->stripe_size, fctx->stripe_count); + } else { + dest_offset = offset; + } + + STACK_WIND(frame, stripe_truncate_cbk, fctx->xl_array[i], + fctx->xl_array[i]->fops->truncate, loc, dest_offset, + NULL); + } return 0; err: - STRIPE_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL); + STRIPE_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL, NULL); return 0; } @@ -762,7 +727,7 @@ err: int32_t stripe_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop) + struct iatt *preop, struct iatt *postop, dict_t *xdata) { int32_t callcnt = 0; stripe_local_t *local = NULL; @@ -801,6 +766,9 @@ stripe_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->prebuf_blocks += preop->ia_blocks; local->postbuf_blocks += postop->ia_blocks; + correct_file_size(preop, local->fctx, prev); + correct_file_size(postop, local->fctx, prev); + if (local->prebuf_size < preop->ia_size) local->prebuf_size = preop->ia_size; if (local->postbuf_size < postop->ia_size) @@ -822,7 +790,7 @@ stripe_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, STRIPE_STACK_UNWIND (setattr, frame, local->op_ret, local->op_errno, &local->pre_buf, - &local->post_buf); + &local->post_buf, NULL); } out: return 0; @@ -831,11 +799,12 @@ out: int32_t stripe_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - struct iatt *stbuf, int32_t valid) + struct iatt *stbuf, int32_t valid, dict_t *xdata) { xlator_list_t *trav = NULL; stripe_local_t *local = NULL; stripe_private_t *priv = NULL; + stripe_fd_ctx_t *fctx = NULL; int32_t op_errno = EINVAL; VALIDATE_OR_GOTO (frame, err); @@ -853,33 +822,47 @@ stripe_setattr (call_frame_t 
*frame, xlator_t *this, loc_t *loc, } /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; } local->op_ret = -1; frame->local = local; - local->call_count = priv->child_count; + if (!IA_ISDIR (loc->inode->ia_type) && + !IA_ISREG (loc->inode->ia_type)) { + local->call_count = 1; + STACK_WIND (frame, stripe_setattr_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->setattr, + loc, stbuf, valid, NULL); + return 0; + } + + if (IA_ISREG(loc->inode->ia_type)) { + inode_ctx_get(loc->inode, this, (uint64_t *) &fctx); + if (!fctx) + goto err; + local->fctx = fctx; + } + local->call_count = priv->child_count; while (trav) { STACK_WIND (frame, stripe_setattr_cbk, trav->xlator, trav->xlator->fops->setattr, - loc, stbuf, valid); + loc, stbuf, valid, NULL); trav = trav->next; } return 0; err: - STRIPE_STACK_UNWIND (setattr, frame, -1, op_errno, NULL, NULL); + STRIPE_STACK_UNWIND (setattr, frame, -1, op_errno, NULL, NULL, NULL); return 0; } int32_t stripe_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, - struct iatt *stbuf, int32_t valid) + struct iatt *stbuf, int32_t valid, dict_t *xdata) { stripe_local_t *local = NULL; stripe_private_t *priv = NULL; @@ -895,8 +878,7 @@ stripe_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, trav = this->children; /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; @@ -907,13 +889,13 @@ stripe_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, while (trav) { STACK_WIND (frame, stripe_setattr_cbk, trav->xlator, - trav->xlator->fops->fsetattr, fd, stbuf, valid); + trav->xlator->fops->fsetattr, fd, stbuf, valid, NULL); trav = trav->next; } return 0; err: - STRIPE_STACK_UNWIND (fsetattr, frame, -1, op_errno, NULL, NULL); + STRIPE_STACK_UNWIND (fsetattr, frame, -1, 
op_errno, NULL, NULL, NULL); return 0; } @@ -921,7 +903,8 @@ int32_t stripe_stack_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *buf, struct iatt *preoldparent, struct iatt *postoldparent, - struct iatt *prenewparent, struct iatt *postnewparent) + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) { int32_t callcnt = 0; stripe_local_t *local = NULL; @@ -958,6 +941,8 @@ stripe_stack_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->pre_buf.ia_blocks += prenewparent->ia_blocks; local->post_buf.ia_blocks += postnewparent->ia_blocks; + correct_file_size(buf, local->fctx, prev); + if (local->stbuf.ia_size < buf->ia_size) local->stbuf.ia_size = buf->ia_size; @@ -983,7 +968,7 @@ stripe_stack_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, STRIPE_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno, &local->stbuf, &local->preparent, &local->postparent, &local->pre_buf, - &local->post_buf); + &local->post_buf, NULL); } out: return 0; @@ -993,7 +978,8 @@ int32_t stripe_first_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *buf, struct iatt *preoldparent, struct iatt *postoldparent, - struct iatt *prenewparent, struct iatt *postnewparent) + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) { stripe_local_t *local = NULL; xlator_list_t *trav = NULL; @@ -1024,24 +1010,25 @@ stripe_first_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, while (trav) { STACK_WIND (frame, stripe_stack_rename_cbk, trav->xlator, trav->xlator->fops->rename, - &local->loc, &local->loc2); + &local->loc, &local->loc2, NULL); trav = trav->next; } return 0; unwind: STRIPE_STACK_UNWIND (rename, frame, -1, op_errno, buf, preoldparent, - postoldparent, prenewparent, postnewparent); + postoldparent, prenewparent, postnewparent, NULL); return 0; } int32_t stripe_rename (call_frame_t *frame, 
xlator_t *this, loc_t *oldloc, - loc_t *newloc) + loc_t *newloc, dict_t *xdata) { stripe_private_t *priv = NULL; stripe_local_t *local = NULL; xlator_list_t *trav = NULL; + stripe_fd_ctx_t *fctx = NULL; int32_t op_errno = EINVAL; VALIDATE_OR_GOTO (frame, err); @@ -1061,8 +1048,7 @@ stripe_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, } /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; @@ -1073,24 +1059,67 @@ stripe_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, local->call_count = priv->child_count; + if (IA_ISREG(oldloc->inode->ia_type)) { + inode_ctx_get(oldloc->inode, this, (uint64_t *) &fctx); + if (!fctx) + goto err; + local->fctx = fctx; + } + frame->local = local; STACK_WIND (frame, stripe_first_rename_cbk, trav->xlator, - trav->xlator->fops->rename, oldloc, newloc); + trav->xlator->fops->rename, oldloc, newloc, NULL); return 0; err: STRIPE_STACK_UNWIND (rename, frame, -1, op_errno, NULL, NULL, NULL, - NULL, NULL); + NULL, NULL, NULL); + return 0; +} +int32_t +stripe_first_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + stripe_local_t *local = NULL; + call_frame_t *prev = NULL; + + if (!this || !frame || !frame->local || !cookie) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; + } + + prev = cookie; + local = frame->local; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_DEBUG, "%s returned %s", + prev->this->name, strerror (op_errno)); + goto out; + } + local->op_ret = 0; + local->preparent = *preparent; + local->postparent = *postparent; + local->preparent_blocks += preparent->ia_blocks; + local->postparent_blocks += postparent->ia_blocks; + + STRIPE_STACK_UNWIND(unlink, frame, local->op_ret, local->op_errno, + &local->preparent, &local->postparent, 
xdata); + return 0; +out: + STRIPE_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } + int32_t stripe_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { int32_t callcnt = 0; stripe_local_t *local = NULL; @@ -1112,49 +1141,33 @@ stripe_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, gf_log (this->name, GF_LOG_DEBUG, "%s returned %s", prev->this->name, strerror (op_errno)); local->op_errno = op_errno; - if ((op_errno != ENOENT) || - (prev->this == FIRST_CHILD (this))) + if (op_errno != ENOENT) { local->failed = 1; - } - if (op_ret >= 0) { - local->op_ret = op_ret; - if (FIRST_CHILD(this) == prev->this) { - local->preparent = *preparent; - local->postparent = *postparent; + local->op_ret = op_ret; } - local->preparent_blocks += preparent->ia_blocks; - local->postparent_blocks += postparent->ia_blocks; - - if (local->preparent_size < preparent->ia_size) - local->preparent_size = preparent->ia_size; - - if (local->postparent_size < postparent->ia_size) - local->postparent_size = postparent->ia_size; } } UNLOCK (&frame->lock); - if (!callcnt) { - if (local->failed) - local->op_ret = -1; - - if (local->op_ret != -1) { - local->preparent.ia_blocks = local->preparent_blocks; - local->preparent.ia_size = local->preparent_size; - local->postparent.ia_blocks = local->postparent_blocks; - local->postparent.ia_size = local->postparent_size; + if (callcnt == 1) { + if (local->failed) { + op_errno = local->op_errno; + goto out; } - - STRIPE_STACK_UNWIND (unlink, frame, local->op_ret, - local->op_errno, &local->preparent, - &local->postparent); + STACK_WIND(frame, stripe_first_unlink_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->unlink, &local->loc, + local->xflag, local->xdata); } + return 0; out: + STRIPE_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } int32_t 
-stripe_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc) +stripe_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, + int xflag, dict_t *xdata) { xlator_list_t *trav = NULL; stripe_local_t *local = NULL; @@ -1182,26 +1195,32 @@ stripe_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc) } /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; } local->op_ret = -1; + loc_copy (&local->loc, loc); + local->xflag = xflag; + + if (xdata) + local->xdata = dict_ref (xdata); + frame->local = local; local->call_count = priv->child_count; + trav = trav->next; /* Skip the first child */ while (trav) { STACK_WIND (frame, stripe_unlink_cbk, trav->xlator, trav->xlator->fops->unlink, - loc); + loc, xflag, xdata); trav = trav->next; } return 0; err: - STRIPE_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL); + STRIPE_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL, NULL); return 0; } @@ -1209,10 +1228,8 @@ err: int32_t stripe_first_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno,struct iatt *preparent, - struct iatt *postparent) - + struct iatt *postparent, dict_t *xdata) { - xlator_list_t *trav = NULL; stripe_local_t *local = NULL; if (!this || !frame || !frame->local) { @@ -1225,11 +1242,10 @@ stripe_first_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto err; } - trav = this->children; local = frame->local; + local->op_ret = 0; local->call_count--; /* First child successful */ - trav = trav->next; /* Skip first child */ local->preparent = *preparent; local->postparent = *postparent; @@ -1238,22 +1254,60 @@ stripe_first_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->preparent_blocks += preparent->ia_blocks; local->postparent_blocks += postparent->ia_blocks; - while (trav) { - STACK_WIND (frame, stripe_unlink_cbk, trav->xlator, - 
trav->xlator->fops->rmdir, &local->loc, - local->flags); - trav = trav->next; - } - + STRIPE_STACK_UNWIND (rmdir, frame, local->op_ret, local->op_errno, + &local->preparent, &local->postparent, xdata); return 0; err: - STRIPE_STACK_UNWIND (rmdir, frame, op_ret, op_errno, NULL, NULL); + STRIPE_STACK_UNWIND (rmdir, frame, op_ret, op_errno, NULL, NULL, NULL); return 0; } int32_t -stripe_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags) +stripe_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + int32_t callcnt = 0; + stripe_local_t *local = NULL; + call_frame_t *prev = NULL; + + if (!this || !frame || !frame->local || !cookie) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; + } + + prev = cookie; + local = frame->local; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_DEBUG, "%s returned %s", + prev->this->name, strerror (op_errno)); + if (op_errno != ENOENT) + local->failed = 1; + } + } + UNLOCK (&frame->lock); + + if (callcnt == 1) { + if (local->failed) + goto out; + STACK_WIND (frame, stripe_first_rmdir_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->rmdir, &local->loc, + local->flags, NULL); + } + return 0; +out: + STRIPE_STACK_UNWIND (rmdir, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + +int32_t +stripe_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, dict_t *xdata) { xlator_list_t *trav = NULL; stripe_local_t *local = NULL; @@ -1276,8 +1330,7 @@ stripe_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags) } /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; @@ -1287,13 +1340,17 @@ stripe_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags) loc_copy 
(&local->loc, loc); local->flags = flags; local->call_count = priv->child_count; + trav = trav->next; /* skip the first child */ - STACK_WIND (frame, stripe_first_rmdir_cbk, trav->xlator, - trav->xlator->fops->rmdir, loc, flags); + while (trav) { + STACK_WIND (frame, stripe_rmdir_cbk, trav->xlator, + trav->xlator->fops->rmdir, loc, flags, NULL); + trav = trav->next; + } return 0; err: - STRIPE_STACK_UNWIND (rmdir, frame, -1, op_errno, NULL, NULL); + STRIPE_STACK_UNWIND (rmdir, frame, -1, op_errno, NULL, NULL, NULL); return 0; } @@ -1302,7 +1359,7 @@ int32_t stripe_mknod_ifreg_fail_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { int32_t callcnt = 0; stripe_local_t *local = NULL; @@ -1323,7 +1380,7 @@ stripe_mknod_ifreg_fail_unlink_cbk (call_frame_t *frame, void *cookie, if (!callcnt) { STRIPE_STACK_UNWIND (mknod, frame, local->op_ret, local->op_errno, local->inode, &local->stbuf, - &local->preparent, &local->postparent); + &local->preparent, &local->postparent, NULL); } out: return 0; @@ -1335,7 +1392,7 @@ out: int32_t stripe_mknod_ifreg_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno) + int32_t op_errno, dict_t *xdata) { int32_t callcnt = 0; stripe_local_t *local = NULL; @@ -1374,7 +1431,7 @@ stripe_mknod_ifreg_setxattr_cbk (call_frame_t *frame, void *cookie, stripe_mknod_ifreg_fail_unlink_cbk, trav->xlator, trav->xlator->fops->unlink, - &local->loc); + &local->loc, 0, NULL); trav = trav->next; } return 0; @@ -1382,7 +1439,7 @@ stripe_mknod_ifreg_setxattr_cbk (call_frame_t *frame, void *cookie, STRIPE_STACK_UNWIND (mknod, frame, local->op_ret, local->op_errno, local->inode, &local->stbuf, - &local->preparent, &local->postparent); + &local->preparent, &local->postparent, NULL); } out: return 0; @@ -1392,7 +1449,7 @@ int32_t stripe_mknod_ifreg_cbk (call_frame_t *frame, void 
*cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { int32_t callcnt = 0; stripe_local_t *local = NULL; @@ -1406,7 +1463,7 @@ stripe_mknod_ifreg_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } prev = cookie; - priv = this->private; + priv = this->private; local = frame->local; LOCK (&frame->lock); @@ -1422,25 +1479,24 @@ stripe_mknod_ifreg_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->failed = 1; local->op_errno = op_errno; } - if (op_ret >= 0) { local->op_ret = op_ret; - if (FIRST_CHILD(this) == prev->this) { - local->stbuf = *buf; - local->preparent = *preparent; - local->postparent = *postparent; - } - /* Can be used as a mechanism to understand if mknod was successful in at least one place */ if (uuid_is_null (local->ia_gfid)) uuid_copy (local->ia_gfid, buf->ia_gfid); + if (stripe_ctx_handle(this, prev, local, xdata)) + gf_log(this->name, GF_LOG_ERROR, + "Error getting fctx info from dict"); + local->stbuf_blocks += buf->ia_blocks; local->preparent_blocks += preparent->ia_blocks; local->postparent_blocks += postparent->ia_blocks; + correct_file_size(buf, local->fctx, prev); + if (local->stbuf_size < buf->ia_size) local->stbuf_size = buf->ia_size; if (local->preparent_size < preparent->ia_size) @@ -1465,7 +1521,7 @@ stripe_mknod_ifreg_cbk (call_frame_t *frame, void *cookie, xlator_t *this, stripe_mknod_ifreg_fail_unlink_cbk, trav->xlator, trav->xlator->fops->unlink, - &local->loc); + &local->loc, 0, NULL); trav = trav->next; } return 0; @@ -1479,13 +1535,13 @@ stripe_mknod_ifreg_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->postparent.ia_size = local->postparent_size; local->stbuf.ia_size = local->stbuf_size; local->stbuf.ia_blocks = local->stbuf_blocks; - } + inode_ctx_put (local->inode, this, + (uint64_t)(long) local->fctx); - /* Create itself has failed.. 
so return - without setxattring */ + } STRIPE_STACK_UNWIND (mknod, frame, local->op_ret, local->op_errno, local->inode, &local->stbuf, - &local->preparent, &local->postparent); + &local->preparent, &local->postparent, NULL); } out: return 0; @@ -1493,29 +1549,118 @@ out: int32_t +stripe_mknod_first_ifreg_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + stripe_local_t *local = NULL; + stripe_private_t *priv = NULL; + call_frame_t *prev = NULL; + xlator_list_t *trav = NULL; + int i = 1; + dict_t *dict = NULL; + int ret = 0; + int need_unref = 0; + + if (!this || !frame || !frame->local || !cookie) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; + } + + prev = cookie; + priv = this->private; + local = frame->local; + trav = this->children; + + local->call_count--; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_DEBUG, "%s returned error %s", + prev->this->name, strerror (op_errno)); + local->failed = 1; + local->op_errno = op_errno; + goto out; + } + + local->op_ret = op_ret; + + local->stbuf = *buf; + local->preparent = *preparent; + local->postparent = *postparent; + + if (uuid_is_null (local->ia_gfid)) + uuid_copy (local->ia_gfid, buf->ia_gfid); + local->preparent.ia_blocks = local->preparent_blocks; + local->preparent.ia_size = local->preparent_size; + local->postparent.ia_blocks = local->postparent_blocks; + local->postparent.ia_size = local->postparent_size; + local->stbuf.ia_size = local->stbuf_size; + local->stbuf.ia_blocks = local->stbuf_blocks; + + trav = trav->next; + while (trav) { + if (priv->xattr_supported) { + dict = dict_new (); + if (!dict) { + gf_log (this->name, GF_LOG_ERROR, + "failed to allocate dict %s", local->loc.path); + } + need_unref = 1; + + dict_copy (local->xattr, dict); + + ret = stripe_xattr_request_build (this, dict, + local->stripe_size, + priv->child_count, 
i, + priv->coalesce); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "Failed to build xattr request"); + + } else { + dict = local->xattr; + } + + STACK_WIND (frame, stripe_mknod_ifreg_cbk, + trav->xlator, trav->xlator->fops->mknod, + &local->loc, local->mode, local->rdev, 0, dict); + trav = trav->next; + i++; + + if (dict && need_unref) + dict_unref (dict); + } + + return 0; + +out: + + STRIPE_STACK_UNWIND (mknod, frame, op_ret, op_errno, NULL, NULL, NULL, NULL, NULL); + return 0; +} + + +int32_t stripe_single_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { STRIPE_STACK_UNWIND (mknod, frame, op_ret, op_errno, inode, buf, - preparent, postparent); + preparent, postparent, xdata); return 0; } int stripe_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, - dev_t rdev, dict_t *params) + dev_t rdev, mode_t umask, dict_t *xdata) { stripe_private_t *priv = NULL; stripe_local_t *local = NULL; - xlator_list_t *trav = NULL; int32_t op_errno = EINVAL; int32_t i = 0; - char size_key[256] = {0,}; - char index_key[256] = {0,}; - char count_key[256] = {0,}; dict_t *dict = NULL; int ret = 0; int need_unref = 0; @@ -1527,7 +1672,6 @@ stripe_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, VALIDATE_OR_GOTO (loc->inode, err); priv = this->private; - trav = this->children; if (priv->first_child_down) { op_errno = ENOTCONN; @@ -1546,86 +1690,63 @@ stripe_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, } /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; } local->op_ret = -1; local->op_errno = ENOTCONN; - local->stripe_size = stripe_get_matching_bs (loc->path, - priv->pattern, - priv->block_size); + local->stripe_size = 
stripe_get_matching_bs (loc->path, priv); frame->local = local; local->inode = inode_ref (loc->inode); loc_copy (&local->loc, loc); + local->xattr = dict_copy_with_ref (xdata, NULL); + local->mode = mode; + local->umask = umask; + local->rdev = rdev; /* Everytime in stripe lookup, all child nodes should be looked up */ local->call_count = priv->child_count; - /* Send a setxattr request to nodes where the - files are created */ - sprintf (size_key, - "trusted.%s.stripe-size", this->name); - sprintf (count_key, - "trusted.%s.stripe-count", this->name); - sprintf (index_key, - "trusted.%s.stripe-index", this->name); - - while (trav) { - if (priv->xattr_supported) { - dict = dict_new (); - if (!dict) { - gf_log (this->name, GF_LOG_ERROR, - "failed to allocate dict %s", loc->path); - } - need_unref = 1; + if (priv->xattr_supported) { + dict = dict_new (); + if (!dict) { + gf_log (this->name, GF_LOG_ERROR, + "failed to allocate dict %s", loc->path); + } + need_unref = 1; - dict_copy (params, dict); + dict_copy (xdata, dict); - ret = dict_set_int64 (dict, size_key, - local->stripe_size); - if (ret) - gf_log (this->name, GF_LOG_ERROR, - "%s: set stripe-size failed", loc->path); - ret = dict_set_int32 (dict, count_key, - priv->child_count); - if (ret) - gf_log (this->name, GF_LOG_ERROR, - "%s: set child_count failed", - loc->path); - ret = dict_set_int32 (dict, index_key, i); - if (ret) - gf_log (this->name, GF_LOG_ERROR, - "%s: set stripe-index failed", - loc->path); - } else { - dict = params; - } + ret = stripe_xattr_request_build (this, dict, + local->stripe_size, + priv->child_count, + i, priv->coalesce); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "failed to build xattr request"); + } else { + dict = xdata; + } - STACK_WIND (frame, stripe_mknod_ifreg_cbk, - trav->xlator, trav->xlator->fops->mknod, - loc, mode, rdev, dict); - trav = trav->next; - i++; + STACK_WIND (frame, stripe_mknod_first_ifreg_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->mknod, + loc, 
mode, rdev, umask, dict); if (dict && need_unref) dict_unref (dict); - } - - /* This case is handled, no need to continue further. */ return 0; } STACK_WIND (frame, stripe_single_mknod_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->mknod, - loc, mode, rdev, params); + loc, mode, rdev, umask, xdata); return 0; err: - STRIPE_STACK_UNWIND (mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL); + STRIPE_STACK_UNWIND (mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); return 0; } @@ -1634,7 +1755,7 @@ int32_t stripe_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { int32_t callcnt = 0; stripe_local_t *local = NULL; @@ -1665,12 +1786,6 @@ stripe_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (op_ret >= 0) { local->op_ret = 0; - if (FIRST_CHILD(this) == prev->this) { - local->inode = inode_ref (inode); - local->stbuf = *buf; - local->postparent = *postparent; - local->preparent = *preparent; - } local->stbuf_blocks += buf->ia_blocks; local->preparent_blocks += preparent->ia_blocks; local->postparent_blocks += postparent->ia_blocks; @@ -1686,10 +1801,7 @@ stripe_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, UNLOCK (&frame->lock); if (!callcnt) { - if (local->failed) - local->op_ret = -1; - - if (local->op_ret != -1) { + if (local->failed != -1) { local->preparent.ia_blocks = local->preparent_blocks; local->preparent.ia_size = local->preparent_size; local->postparent.ia_blocks = local->postparent_blocks; @@ -1700,16 +1812,76 @@ stripe_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, STRIPE_STACK_UNWIND (mkdir, frame, local->op_ret, local->op_errno, local->inode, &local->stbuf, &local->preparent, - &local->postparent); + &local->postparent, NULL); + } +out: + return 0; +} + + +int32_t +stripe_first_mkdir_cbk (call_frame_t *frame, void *cookie, 
xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + stripe_local_t *local = NULL; + call_frame_t *prev = NULL; + xlator_list_t *trav = NULL; + + if (!this || !frame || !frame->local || !cookie) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; + } + + prev = cookie; + local = frame->local; + trav = this->children; + + local->call_count--; /* first child is successful */ + trav = trav->next; /* skip first child */ + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_DEBUG, "%s returned error %s", + prev->this->name, strerror (op_errno)); + local->op_errno = op_errno; + goto out; + } + + local->op_ret = 0; + + local->inode = inode_ref (inode); + local->stbuf = *buf; + local->postparent = *postparent; + local->preparent = *preparent; + + local->stbuf_blocks += buf->ia_blocks; + local->preparent_blocks += preparent->ia_blocks; + local->postparent_blocks += postparent->ia_blocks; + + local->stbuf_size = buf->ia_size; + local->preparent_size = preparent->ia_size; + local->postparent_size = postparent->ia_size; + + while (trav) { + STACK_WIND (frame, stripe_mkdir_cbk, trav->xlator, + trav->xlator->fops->mkdir, &local->loc, local->mode, + local->umask, local->xdata); + trav = trav->next; } + return 0; out: + STRIPE_STACK_UNWIND (mkdir, frame, -1, op_errno, NULL, NULL, NULL, + NULL, NULL); + return 0; + } int stripe_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, - dict_t *params) + mode_t umask, dict_t *xdata) { stripe_private_t *priv = NULL; stripe_local_t *local = NULL; @@ -1731,27 +1903,27 @@ stripe_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, } /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; } local->op_ret = -1; local->call_count = priv->child_count; + if (xdata) 
+ local->xdata = dict_ref (xdata); + local->mode = mode; + local->umask = umask; + loc_copy (&local->loc, loc); frame->local = local; /* Everytime in stripe lookup, all child nodes should be looked up */ - while (trav) { - STACK_WIND (frame, stripe_mkdir_cbk, - trav->xlator, trav->xlator->fops->mkdir, - loc, mode, params); - trav = trav->next; - } + STACK_WIND (frame, stripe_first_mkdir_cbk, trav->xlator, + trav->xlator->fops->mkdir, loc, mode, umask, xdata); return 0; err: - STRIPE_STACK_UNWIND (mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL); + STRIPE_STACK_UNWIND (mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); return 0; } @@ -1760,11 +1932,12 @@ int32_t stripe_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { int32_t callcnt = 0; stripe_local_t *local = NULL; call_frame_t *prev = NULL; + stripe_fd_ctx_t *fctx = NULL; if (!this || !frame || !frame->local || !cookie) { gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); @@ -1791,6 +1964,16 @@ stripe_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (op_ret >= 0) { local->op_ret = 0; + if (IA_ISREG(inode->ia_type)) { + inode_ctx_get(inode, this, (uint64_t *) &fctx); + if (!fctx) { + gf_log(this->name, GF_LOG_ERROR, + "failed to get stripe context"); + op_ret = -1; + op_errno = EINVAL; + } + } + if (FIRST_CHILD(this) == prev->this) { local->inode = inode_ref (inode); local->stbuf = *buf; @@ -1801,6 +1984,8 @@ stripe_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->preparent_blocks += preparent->ia_blocks; local->postparent_blocks += postparent->ia_blocks; + correct_file_size(buf, fctx, prev); + if (local->stbuf_size < buf->ia_size) local->stbuf_size = buf->ia_size; if (local->preparent_size < preparent->ia_size) @@ -1826,14 +2011,14 @@ stripe_link_cbk (call_frame_t *frame, void *cookie, 
xlator_t *this, STRIPE_STACK_UNWIND (link, frame, local->op_ret, local->op_errno, local->inode, &local->stbuf, &local->preparent, - &local->postparent); + &local->postparent, NULL); } out: return 0; } int32_t -stripe_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc) +stripe_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, dict_t *xdata) { xlator_list_t *trav = NULL; stripe_local_t *local = NULL; @@ -1856,8 +2041,7 @@ stripe_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc) } /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; @@ -1871,13 +2055,13 @@ stripe_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc) while (trav) { STACK_WIND (frame, stripe_link_cbk, trav->xlator, trav->xlator->fops->link, - oldloc, newloc); + oldloc, newloc, NULL); trav = trav->next; } return 0; err: - STRIPE_STACK_UNWIND (link, frame, -1, op_errno, NULL, NULL, NULL, NULL); + STRIPE_STACK_UNWIND (link, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); return 0; } @@ -1885,7 +2069,7 @@ int32_t stripe_create_fail_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { int32_t callcnt = 0; stripe_local_t *local = NULL; @@ -1906,7 +2090,7 @@ stripe_create_fail_unlink_cbk (call_frame_t *frame, void *cookie, if (!callcnt) { STRIPE_STACK_UNWIND (create, frame, local->op_ret, local->op_errno, local->fd, local->inode, &local->stbuf, - &local->preparent, &local->postparent); + &local->preparent, &local->postparent, NULL); } out: return 0; @@ -1917,12 +2101,11 @@ int32_t stripe_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode, struct iatt *buf, struct iatt *preparent, - 
struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { int32_t callcnt = 0; stripe_local_t *local = NULL; stripe_private_t *priv = NULL; - stripe_fd_ctx_t *fctx = NULL; call_frame_t *prev = NULL; xlator_list_t *trav = NULL; @@ -1943,26 +2126,26 @@ stripe_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, gf_log (this->name, GF_LOG_DEBUG, "%s returned error %s", prev->this->name, strerror (op_errno)); - if ((op_errno != ENOENT) || - (prev->this == FIRST_CHILD (this))) - local->failed = 1; + local->failed = 1; local->op_errno = op_errno; } if (op_ret >= 0) { + if (IA_ISREG(buf->ia_type)) { + if (stripe_ctx_handle(this, prev, local, xdata)) + gf_log(this->name, GF_LOG_ERROR, + "Error getting fctx info from " + "dict"); + } + local->op_ret = op_ret; - /* Get the mapping in inode private */ - /* Get the stat buf right */ - if (FIRST_CHILD(this) == prev->this) { - local->stbuf = *buf; - local->preparent = *preparent; - local->postparent = *postparent; - } local->stbuf_blocks += buf->ia_blocks; local->preparent_blocks += preparent->ia_blocks; local->postparent_blocks += postparent->ia_blocks; + correct_file_size(buf, local->fctx, prev); + if (local->stbuf_size < buf->ia_size) local->stbuf_size = buf->ia_size; if (local->preparent_size < preparent->ia_size) @@ -1985,7 +2168,7 @@ stripe_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, stripe_create_fail_unlink_cbk, trav->xlator, trav->xlator->fops->unlink, - &local->loc); + &local->loc, 0, NULL); trav = trav->next; } @@ -2000,29 +2183,134 @@ stripe_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->stbuf.ia_size = local->stbuf_size; local->stbuf.ia_blocks = local->stbuf_blocks; - fctx = GF_CALLOC (1, sizeof (stripe_fd_ctx_t), - gf_stripe_mt_stripe_fd_ctx_t); - if (!fctx) { - local->op_ret = -1; - local->op_errno = ENOMEM; - goto unwind; - } - - fctx->stripe_size = local->stripe_size; - fctx->stripe_count = priv->child_count; - fctx->static_array = 1; - 
fctx->xl_array = priv->xl_array; - fd_ctx_set (local->fd, this, - (uint64_t)(long)fctx); + stripe_copy_xl_array(local->fctx->xl_array, + priv->xl_array, + local->fctx->stripe_count); + inode_ctx_put(local->inode, this, + (uint64_t) local->fctx); } - unwind: /* Create itself has failed.. so return without setxattring */ STRIPE_STACK_UNWIND (create, frame, local->op_ret, local->op_errno, local->fd, local->inode, &local->stbuf, - &local->preparent, &local->postparent); + &local->preparent, &local->postparent, NULL); + } + +out: + return 0; +} + + + +int32_t +stripe_first_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, + inode_t *inode, struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + stripe_local_t *local = NULL; + stripe_private_t *priv = NULL; + call_frame_t *prev = NULL; + xlator_list_t *trav = NULL; + int i = 1; + dict_t *dict = NULL; + loc_t *loc = NULL; + int32_t need_unref = 0; + int32_t ret = -1; + + if (!this || !frame || !frame->local || !cookie) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; + } + + prev = cookie; + priv = this->private; + local = frame->local; + trav = this->children; + loc = &local->loc; + + --local->call_count; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_DEBUG, "%s returned error %s", + prev->this->name, strerror (op_errno)); + local->failed = 1; + local->op_errno = op_errno; + } + + local->op_ret = 0; + /* Get the mapping in inode private */ + /* Get the stat buf right */ + local->stbuf = *buf; + local->preparent = *preparent; + local->postparent = *postparent; + + local->stbuf_blocks += buf->ia_blocks; + local->preparent_blocks += preparent->ia_blocks; + local->postparent_blocks += postparent->ia_blocks; + + if (local->stbuf_size < buf->ia_size) + local->stbuf_size = buf->ia_size; + if (local->preparent_size < preparent->ia_size) + local->preparent_size = preparent->ia_size; + if (local->postparent_size 
< postparent->ia_size) + local->postparent_size = postparent->ia_size; + + if (local->failed) + local->op_ret = -1; + + if (local->op_ret == -1) { + local->call_count = 1; + STACK_WIND (frame, stripe_create_fail_unlink_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->unlink, + &local->loc, 0, NULL); + return 0; + } + + if (local->op_ret >= 0) { + local->preparent.ia_blocks = local->preparent_blocks; + local->preparent.ia_size = local->preparent_size; + local->postparent.ia_blocks = local->postparent_blocks; + local->postparent.ia_size = local->postparent_size; + local->stbuf.ia_size = local->stbuf_size; + local->stbuf.ia_blocks = local->stbuf_blocks; + } + + /* Send a setxattr request to nodes where the + files are created */ + trav = trav->next; + while (trav) { + if (priv->xattr_supported) { + dict = dict_new (); + if (!dict) { + gf_log (this->name, GF_LOG_ERROR, + "failed to allocate dict %s", loc->path); + } + need_unref = 1; + + dict_copy (local->xattr, dict); + + ret = stripe_xattr_request_build (this, dict, + local->stripe_size, + priv->child_count, + i, priv->coalesce); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "failed to build xattr request"); + } else { + dict = local->xattr; + } + + STACK_WIND (frame, stripe_create_cbk, trav->xlator, + trav->xlator->fops->create, &local->loc, + local->flags, local->mode, local->umask, local->fd, + dict); + trav = trav->next; + if (need_unref && dict) + dict_unref (dict); + i++; } out: @@ -2030,6 +2318,7 @@ out: } + /** * stripe_create - If a block-size is specified for the 'name', create the * file in all the child nodes. If not, create it in only first child. 
@@ -2038,18 +2327,14 @@ out: */ int32_t stripe_create (call_frame_t *frame, xlator_t *this, loc_t *loc, - int32_t flags, mode_t mode, fd_t *fd, dict_t *params) + int32_t flags, mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) { stripe_private_t *priv = NULL; stripe_local_t *local = NULL; - xlator_list_t *trav = NULL; int32_t op_errno = EINVAL; int ret = 0; int need_unref = 0; int i = 0; - char size_key[256] = {0,}; - char index_key[256] = {0,}; - char count_key[256] = {0,}; dict_t *dict = NULL; VALIDATE_OR_GOTO (frame, err); @@ -2071,82 +2356,68 @@ stripe_create (call_frame_t *frame, xlator_t *this, loc_t *loc, } /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; } local->op_ret = -1; local->op_errno = ENOTCONN; - local->stripe_size = stripe_get_matching_bs (loc->path, - priv->pattern, - priv->block_size); + local->stripe_size = stripe_get_matching_bs (loc->path, priv); frame->local = local; local->inode = inode_ref (loc->inode); loc_copy (&local->loc, loc); local->fd = fd_ref (fd); + local->flags = flags; + local->mode = mode; + local->umask = umask; + if (xdata) + local->xattr = dict_ref (xdata); local->call_count = priv->child_count; /* Send a setxattr request to nodes where the files are created */ - sprintf (size_key, "trusted.%s.stripe-size", this->name); - sprintf (count_key, "trusted.%s.stripe-count", this->name); - sprintf (index_key, "trusted.%s.stripe-index", this->name); - trav = this->children; - while (trav) { - if (priv->xattr_supported) { - dict = dict_new (); - if (!dict) { - gf_log (this->name, GF_LOG_ERROR, - "failed to allocate dict %s", loc->path); - } - need_unref = 1; + if (priv->xattr_supported) { + dict = dict_new (); + if (!dict) { + gf_log (this->name, GF_LOG_ERROR, + "failed to allocate dict %s", loc->path); + } + need_unref = 1; - dict_copy (params, dict); + dict_copy (xdata, dict); - ret = 
dict_set_int64 (dict, size_key, - local->stripe_size); - if (ret) - gf_log (this->name, GF_LOG_ERROR, - "%s: set stripe-size failed", loc->path); - ret = dict_set_int32 (dict, count_key, - priv->child_count); - if (ret) - gf_log (this->name, GF_LOG_ERROR, - "%s: set child_count failed", - loc->path); - ret = dict_set_int32 (dict, index_key, i); - if (ret) - gf_log (this->name, GF_LOG_ERROR, - "%s: set stripe-index failed", - loc->path); - } else { - dict = params; - } + ret = stripe_xattr_request_build (this, dict, + local->stripe_size, + priv->child_count, + i, priv->coalesce); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "failed to build xattr request"); + } else { + dict = xdata; + } - STACK_WIND (frame, stripe_create_cbk, trav->xlator, - trav->xlator->fops->create, loc, flags, - mode, fd, dict); - trav = trav->next; - i++; - if (need_unref && dict) - dict_unref (dict); + STACK_WIND (frame, stripe_first_create_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->create, loc, flags, mode, + umask, fd, dict); + + if (need_unref && dict) + dict_unref (dict); - } return 0; err: STRIPE_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL, - NULL, NULL); + NULL, NULL, xdata); return 0; } int32_t stripe_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd) + int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) { int32_t callcnt = 0; stripe_local_t *local = NULL; @@ -2184,224 +2455,25 @@ stripe_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (local->failed) local->op_ret = -1; - if (local->op_ret == -1) { - if (local->fctx) { - if (!local->fctx->static_array) - GF_FREE (local->fctx->xl_array); - GF_FREE (local->fctx); - } - } else { - fd_ctx_set (local->fd, this, - (uint64_t)(long)local->fctx); - } - STRIPE_STACK_UNWIND (open, frame, local->op_ret, - local->op_errno, local->fd); + local->op_errno, local->fd, xdata); } out: return 0; } -int32_t -stripe_open_lookup_cbk (call_frame_t 
*frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, dict_t *dict, struct iatt *postparent) -{ - int32_t index = 0; - int32_t callcnt = 0; - char key[256] = {0,}; - stripe_local_t *local = NULL; - xlator_list_t *trav = NULL; - stripe_private_t *priv = NULL; - data_t *data = NULL; - call_frame_t *prev = NULL; - - if (!this || !frame || !frame->local || !cookie) { - gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); - goto out; - } - - prev = (call_frame_t *)cookie; - priv = this->private; - local = frame->local; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "%s returned error %s", - prev->this->name, strerror (op_errno)); - local->op_ret = -1; - if (local->op_errno != EIO) - local->op_errno = op_errno; - if ((op_errno != ENOENT) || - (prev->this == FIRST_CHILD (this))) - local->failed = 1; - goto unlock; - } - - if (!dict) - goto unlock; - - if (!local->fctx) { - local->fctx = GF_CALLOC (1, sizeof (stripe_fd_ctx_t), - gf_stripe_mt_stripe_fd_ctx_t); - if (!local->fctx) { - local->op_errno = ENOMEM; - local->op_ret = -1; - goto unlock; - } - - local->fctx->static_array = 0; - } - /* Stripe block size */ - sprintf (key, "trusted.%s.stripe-size", this->name); - data = dict_get (dict, key); - if (!data) { - local->xattr_self_heal_needed = 1; - } else { - if (!local->fctx->stripe_size) { - local->fctx->stripe_size = - data_to_int64 (data); - } - - if (local->fctx->stripe_size != data_to_int64 (data)) { - gf_log (this->name, GF_LOG_WARNING, - "stripe-size mismatch in blocks"); - local->xattr_self_heal_needed = 1; - } - } - /* Stripe count */ - sprintf (key, "trusted.%s.stripe-count", this->name); - data = dict_get (dict, key); - if (!data) { - local->xattr_self_heal_needed = 1; - goto unlock; - } - if (!local->fctx->xl_array) { - local->fctx->stripe_count = data_to_int32 (data); - if (!local->fctx->stripe_count) { - gf_log (this->name, 
GF_LOG_ERROR, - "error with stripe-count xattr"); - local->op_ret = -1; - local->op_errno = EIO; - goto unlock; - } - - local->fctx->xl_array = - GF_CALLOC (local->fctx->stripe_count, - sizeof (xlator_t *), - gf_stripe_mt_xlator_t); - if (!local->fctx->xl_array) { - local->op_errno = ENOMEM; - local->op_ret = -1; - goto unlock; - } - } - if (local->fctx->stripe_count != data_to_int32 (data)) { - gf_log (this->name, GF_LOG_ERROR, - "error with stripe-count xattr (%d != %d)", - local->fctx->stripe_count, data_to_int32 (data)); - local->op_ret = -1; - local->op_errno = EIO; - goto unlock; - } - - /* index */ - sprintf (key, "trusted.%s.stripe-index", this->name); - data = dict_get (dict, key); - if (!data) { - local->xattr_self_heal_needed = 1; - goto unlock; - } - index = data_to_int32 (data); - if (index > priv->child_count) { - gf_log (this->name, GF_LOG_ERROR, - "error with stripe-index xattr (%d)", index); - local->op_ret = -1; - local->op_errno = EIO; - goto unlock; - } - if (local->fctx->xl_array) { - if (local->fctx->xl_array[index]) { - gf_log (this->name, GF_LOG_ERROR, - "duplicate entry @ index (%d)", index); - local->op_ret = -1; - local->op_errno = EIO; - goto unlock; - } - local->fctx->xl_array[index] = prev->this; - } - local->entry_count++; - local->op_ret = 0; - } -unlock: - UNLOCK (&frame->lock); - - if (!callcnt) { - /* TODO: if self-heal flag is set, do it */ - if (local->xattr_self_heal_needed) { - gf_log (this->name, GF_LOG_DEBUG, - "%s: stripe info need to be healed", - local->loc.path); - } - - if (local->failed) - local->op_ret = -1; - - if (local->op_ret) - goto err; - - if (local->entry_count != local->fctx->stripe_count) { - gf_log (this->name, GF_LOG_ERROR, - "entry-count (%d) != stripe-count (%d)", - local->entry_count, local->fctx->stripe_count); - local->op_ret = -1; - local->op_errno = EIO; - goto err; - } - if (!local->fctx->stripe_size) { - gf_log (this->name, GF_LOG_ERROR, "stripe size not set"); - local->op_ret = -1; - 
local->op_errno = EIO; - goto err; - } - - local->call_count = local->fctx->stripe_count; - - trav = this->children; - while (trav) { - STACK_WIND (frame, stripe_open_cbk, trav->xlator, - trav->xlator->fops->open, &local->loc, - local->flags, local->fd, 0); - trav = trav->next; - } - } - - return 0; -err: - STRIPE_STACK_UNWIND (open, frame, local->op_ret, local->op_errno, - local->fd); -out: - return 0; -} /** * stripe_open - */ int32_t stripe_open (call_frame_t *frame, xlator_t *this, loc_t *loc, - int32_t flags, fd_t *fd, int32_t wbflags) + int32_t flags, fd_t *fd, dict_t *xdata) { stripe_local_t *local = NULL; stripe_private_t *priv = NULL; xlator_list_t *trav = NULL; int32_t op_errno = 1; - dict_t *dict = NULL; - int ret = 0; - char key[256] = {0,}; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); @@ -2418,8 +2490,7 @@ stripe_open (call_frame_t *frame, xlator_t *this, loc_t *loc, } /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; @@ -2435,73 +2506,25 @@ stripe_open (call_frame_t *frame, xlator_t *this, loc_t *loc, /* Striped files */ local->flags = flags; local->call_count = priv->child_count; - local->stripe_size = stripe_get_matching_bs (loc->path, - priv->pattern, - priv->block_size); - - if (priv->xattr_supported) { - dict = dict_new (); - if (!dict) - goto err; - - sprintf (key, "trusted.%s.stripe-size", this->name); - ret = dict_set_int64 (dict, key, 8); - if (ret) - gf_log (this->name, GF_LOG_WARNING, - "failed to set %s in xattr_req dict", key); - - sprintf (key, "trusted.%s.stripe-count", this->name); - ret = dict_set_int32 (dict, key, 4); - if (ret) - gf_log (this->name, GF_LOG_WARNING, - "failed to set %s in xattr_req dict", key); - - sprintf (key, "trusted.%s.stripe-index", this->name); - ret = dict_set_int32 (dict, key, 4); - if (ret) - gf_log (this->name, GF_LOG_WARNING, - "failed to set %s in xattr_req 
dict", key); - - while (trav) { - STACK_WIND (frame, stripe_open_lookup_cbk, - trav->xlator, trav->xlator->fops->lookup, - loc, dict); - trav = trav->next; - } - if (dict) - dict_unref (dict); - - return 0; - } - local->fctx = GF_CALLOC (1, sizeof (stripe_fd_ctx_t), - gf_stripe_mt_stripe_fd_ctx_t); - if (!local->fctx) { - op_errno = ENOMEM; - goto err; - } - - local->fctx->static_array = 1; - local->fctx->stripe_size = local->stripe_size; - local->fctx->stripe_count = priv->child_count; - local->fctx->xl_array = priv->xl_array; + local->stripe_size = stripe_get_matching_bs (loc->path, priv); while (trav) { STACK_WIND (frame, stripe_open_cbk, trav->xlator, trav->xlator->fops->open, &local->loc, local->flags, local->fd, - wbflags); + xdata); trav = trav->next; } return 0; err: - STRIPE_STACK_UNWIND (open, frame, -1, op_errno, NULL); + STRIPE_STACK_UNWIND (open, frame, -1, op_errno, NULL, NULL); return 0; } int32_t stripe_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd) + int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) { int32_t callcnt = 0; stripe_local_t *local = NULL; @@ -2534,7 +2557,7 @@ stripe_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (!callcnt) { STRIPE_STACK_UNWIND (opendir, frame, local->op_ret, - local->op_errno, local->fd); + local->op_errno, local->fd, NULL); } out: return 0; @@ -2542,7 +2565,7 @@ out: int32_t -stripe_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd) +stripe_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, dict_t *xdata) { xlator_list_t *trav = NULL; stripe_local_t *local = NULL; @@ -2564,8 +2587,7 @@ stripe_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd) } /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; @@ -2576,19 +2598,19 @@ stripe_opendir 
(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd) while (trav) { STACK_WIND (frame, stripe_opendir_cbk, trav->xlator, - trav->xlator->fops->opendir, loc, fd); + trav->xlator->fops->opendir, loc, fd, NULL); trav = trav->next; } return 0; err: - STRIPE_STACK_UNWIND (opendir, frame, -1, op_errno, NULL); + STRIPE_STACK_UNWIND (opendir, frame, -1, op_errno, NULL, NULL); return 0; } int32_t stripe_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct gf_flock *lock) + int32_t op_ret, int32_t op_errno, struct gf_flock *lock, dict_t *xdata) { int32_t callcnt = 0; stripe_local_t *local = NULL; @@ -2628,7 +2650,7 @@ stripe_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (local->failed) local->op_ret = -1; STRIPE_STACK_UNWIND (lk, frame, local->op_ret, - local->op_errno, &local->lock); + local->op_errno, &local->lock, NULL); } out: return 0; @@ -2636,7 +2658,7 @@ out: int32_t stripe_lk (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, - struct gf_flock *lock) + struct gf_flock *lock, dict_t *xdata) { stripe_local_t *local = NULL; xlator_list_t *trav = NULL; @@ -2652,8 +2674,7 @@ stripe_lk (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, priv = this->private; /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; @@ -2664,20 +2685,20 @@ stripe_lk (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, while (trav) { STACK_WIND (frame, stripe_lk_cbk, trav->xlator, - trav->xlator->fops->lk, fd, cmd, lock); + trav->xlator->fops->lk, fd, cmd, lock, NULL); trav = trav->next; } return 0; err: - STRIPE_STACK_UNWIND (lk, frame, -1, op_errno, NULL); + STRIPE_STACK_UNWIND (lk, frame, -1, op_errno, NULL, NULL); return 0; } int32_t stripe_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t 
op_errno, dict_t *xdata) { int32_t callcnt = 0; stripe_local_t *local = NULL; @@ -2714,14 +2735,14 @@ stripe_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->op_ret = -1; STRIPE_STACK_UNWIND (flush, frame, local->op_ret, - local->op_errno); + local->op_errno, NULL); } out: return 0; } int32_t -stripe_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) +stripe_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { stripe_local_t *local = NULL; stripe_private_t *priv = NULL; @@ -2741,8 +2762,7 @@ stripe_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) goto err; } /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; @@ -2753,13 +2773,13 @@ stripe_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) while (trav) { STACK_WIND (frame, stripe_flush_cbk, trav->xlator, - trav->xlator->fops->flush, fd); + trav->xlator->fops->flush, fd, NULL); trav = trav->next; } return 0; err: - STRIPE_STACK_UNWIND (flush, frame, -1, op_errno); + STRIPE_STACK_UNWIND (flush, frame, -1, op_errno, NULL); return 0; } @@ -2768,7 +2788,7 @@ err: int32_t stripe_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) + struct iatt *postbuf, dict_t *xdata) { int32_t callcnt = 0; stripe_local_t *local = NULL; @@ -2804,6 +2824,9 @@ stripe_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->prebuf_blocks += prebuf->ia_blocks; local->postbuf_blocks += postbuf->ia_blocks; + correct_file_size(prebuf, local->fctx, prev); + correct_file_size(postbuf, local->fctx, prev); + if (local->prebuf_size < prebuf->ia_size) local->prebuf_size = prebuf->ia_size; @@ -2826,18 +2849,19 @@ stripe_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, STRIPE_STACK_UNWIND (fsync, frame, local->op_ret, local->op_errno, &local->pre_buf, - 
&local->post_buf); + &local->post_buf, NULL); } out: return 0; } int32_t -stripe_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags) +stripe_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, dict_t *xdata) { stripe_local_t *local = NULL; stripe_private_t *priv = NULL; xlator_list_t *trav = NULL; + stripe_fd_ctx_t *fctx = NULL; int32_t op_errno = 1; VALIDATE_OR_GOTO (frame, err); @@ -2849,31 +2873,38 @@ stripe_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags) trav = this->children; /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; } + + inode_ctx_get(fd->inode, this, (uint64_t *) &fctx); + if (!fctx) { + op_errno = EINVAL; + goto err; + } + local->fctx = fctx; + local->op_ret = -1; frame->local = local; local->call_count = priv->child_count; while (trav) { STACK_WIND (frame, stripe_fsync_cbk, trav->xlator, - trav->xlator->fops->fsync, fd, flags); + trav->xlator->fops->fsync, fd, flags, NULL); trav = trav->next; } return 0; err: - STRIPE_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL); + STRIPE_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL, NULL); return 0; } int32_t stripe_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf) + int32_t op_ret, int32_t op_errno, struct iatt *buf, dict_t *xdata) { int32_t callcnt = 0; stripe_local_t *local = NULL; @@ -2908,6 +2939,9 @@ stripe_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->stbuf = *buf; local->stbuf_blocks += buf->ia_blocks; + + correct_file_size(buf, local->fctx, prev); + if (local->stbuf_size < buf->ia_size) local->stbuf_size = buf->ia_size; } @@ -2924,7 +2958,7 @@ stripe_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } STRIPE_STACK_UNWIND (fstat, frame, local->op_ret, - local->op_errno, &local->stbuf); + local->op_errno, 
&local->stbuf, NULL); } out: @@ -2934,11 +2968,12 @@ out: int32_t stripe_fstat (call_frame_t *frame, xlator_t *this, - fd_t *fd) + fd_t *fd, dict_t *xdata) { stripe_local_t *local = NULL; stripe_private_t *priv = NULL; xlator_list_t *trav = NULL; + stripe_fd_ctx_t *fctx = NULL; int32_t op_errno = 1; VALIDATE_OR_GOTO (frame, err); @@ -2950,8 +2985,7 @@ stripe_fstat (call_frame_t *frame, trav = this->children; /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; @@ -2960,26 +2994,35 @@ stripe_fstat (call_frame_t *frame, frame->local = local; local->call_count = priv->child_count; + if (IA_ISREG(fd->inode->ia_type)) { + inode_ctx_get(fd->inode, this, (uint64_t *) &fctx); + if (!fctx) + goto err; + local->fctx = fctx; + } + while (trav) { STACK_WIND (frame, stripe_fstat_cbk, trav->xlator, - trav->xlator->fops->fstat, fd); + trav->xlator->fops->fstat, fd, NULL); trav = trav->next; } return 0; err: - STRIPE_STACK_UNWIND (fstat, frame, -1, op_errno, NULL); + STRIPE_STACK_UNWIND (fstat, frame, -1, op_errno, NULL, NULL); return 0; } int32_t -stripe_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset) +stripe_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, dict_t *xdata) { stripe_local_t *local = NULL; stripe_private_t *priv = NULL; - xlator_list_t *trav = NULL; - int32_t op_errno = 1; + stripe_fd_ctx_t *fctx = NULL; + int i, eof_idx; + off_t dest_offset, tmp_offset; + int32_t op_errno = 1; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); @@ -2987,11 +3030,9 @@ stripe_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset) VALIDATE_OR_GOTO (fd->inode, err); priv = this->private; - trav = this->children; /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; 
goto err; @@ -3000,22 +3041,60 @@ stripe_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset) frame->local = local; local->call_count = priv->child_count; - while (trav) { - STACK_WIND (frame, stripe_truncate_cbk, trav->xlator, - trav->xlator->fops->ftruncate, fd, offset); - trav = trav->next; - } + inode_ctx_get(fd->inode, this, (uint64_t *) &fctx); + if (!fctx) { + gf_log(this->name, GF_LOG_ERROR, "no stripe context"); + op_errno = EINVAL; + goto err; + } + if (!fctx->stripe_count) { + gf_log(this->name, GF_LOG_ERROR, "no stripe count"); + op_errno = EINVAL; + goto err; + } + + local->fctx = fctx; + eof_idx = (offset / fctx->stripe_size) % fctx->stripe_count; + + for (i = 0; i < fctx->stripe_count; i++) { + if (!fctx->xl_array[i]) { + gf_log(this->name, GF_LOG_ERROR, "no xlator at index " + "%d", i); + op_errno = EINVAL; + goto err; + } + + if (fctx->stripe_coalesce) { + if (i < eof_idx) + tmp_offset = roof(offset, fctx->stripe_size * + fctx->stripe_count); + else if (i > eof_idx) + tmp_offset = floor(offset, fctx->stripe_size * + fctx->stripe_count); + else + tmp_offset = offset; + + dest_offset = coalesced_offset(tmp_offset, + fctx->stripe_size, fctx->stripe_count); + } else { + dest_offset = offset; + } + + STACK_WIND(frame, stripe_truncate_cbk, fctx->xl_array[i], + fctx->xl_array[i]->fops->ftruncate, fd, dest_offset, + NULL); + } return 0; err: - STRIPE_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL); + STRIPE_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL, NULL); return 0; } int32_t stripe_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { int32_t callcnt = 0; stripe_local_t *local = NULL; @@ -3052,14 +3131,14 @@ stripe_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->op_ret = -1; STRIPE_STACK_UNWIND (fsyncdir, frame, local->op_ret, - local->op_errno); + local->op_errno, NULL); } out: return 0; } 
int32_t -stripe_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags) +stripe_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, dict_t *xdata) { stripe_local_t *local = NULL; stripe_private_t *priv = NULL; @@ -3075,8 +3154,7 @@ stripe_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags) trav = this->children; /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; @@ -3087,20 +3165,20 @@ stripe_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags) while (trav) { STACK_WIND (frame, stripe_fsyncdir_cbk, trav->xlator, - trav->xlator->fops->fsyncdir, fd, flags); + trav->xlator->fops->fsyncdir, fd, flags, NULL); trav = trav->next; } return 0; err: - STRIPE_STACK_UNWIND (fsyncdir, frame, -1, op_errno); + STRIPE_STACK_UNWIND (fsyncdir, frame, -1, op_errno, NULL); return 0; } int32_t stripe_readv_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf) + int32_t op_ret, int32_t op_errno, struct iatt *buf, dict_t *xdata) { int32_t i = 0; int32_t callcnt = 0; @@ -3110,6 +3188,7 @@ stripe_readv_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt tmp_stbuf = {0,}; struct iobref *tmp_iobref = NULL; struct iobuf *iobuf = NULL; + call_frame_t *prev = NULL; if (!this || !frame || !frame->local) { gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); @@ -3117,13 +3196,16 @@ stripe_readv_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } local = frame->local; + prev = cookie; LOCK (&frame->lock); { callcnt = --local->call_count; - if (op_ret != -1) + if (op_ret != -1) { + correct_file_size(buf, local->fctx, prev); if (local->stbuf_size < buf->ia_size) local->stbuf_size = buf->ia_size; + } } UNLOCK (&frame->lock); @@ -3152,7 +3234,8 @@ stripe_readv_fstat_cbk (call_frame_t *frame, void 
*cookie, xlator_t *this, vec[count].iov_len = (local->replies[i].requested_size - local->replies[i].op_ret); - iobuf = iobuf_get (this->ctx->iobuf_pool); + iobuf = iobuf_get2 (this->ctx->iobuf_pool, + vec[count].iov_len); if (!iobuf) { gf_log (this->name, GF_LOG_ERROR, "Out of memory."); @@ -3161,9 +3244,11 @@ stripe_readv_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto done; } memset (iobuf->ptr, 0, vec[count].iov_len); - iobref_add (local->iobref, iobuf); vec[count].iov_base = iobuf->ptr; + iobref_add (local->iobref, iobuf); + iobuf_unref(iobuf); + op_ret += vec[count].iov_len; count++; } @@ -3181,11 +3266,10 @@ stripe_readv_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, GF_FREE (local->replies); tmp_iobref = local->iobref; STRIPE_STACK_UNWIND (readv, frame, op_ret, op_errno, vec, - count, &tmp_stbuf, tmp_iobref); + count, &tmp_stbuf, tmp_iobref, NULL); iobref_unref (tmp_iobref); - if (vec) - GF_FREE (vec); + GF_FREE (vec); } out: return 0; @@ -3198,7 +3282,7 @@ out: int32_t stripe_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iovec *vector, - int32_t count, struct iatt *stbuf, struct iobref *iobref) + int32_t count, struct iatt *stbuf, struct iobref *iobref, dict_t *xdata) { int32_t index = 0; int32_t callcnt = 0; @@ -3209,8 +3293,10 @@ stripe_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, stripe_local_t *local = NULL; struct iovec *final_vec = NULL; struct iatt tmp_stbuf = {0,}; + struct iatt *tmp_stbuf_p = NULL; //need it for a warning struct iobref *tmp_iobref = NULL; stripe_fd_ctx_t *fctx = NULL; + call_frame_t *prev = NULL; if (!this || !frame || !frame->local || !cookie) { gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); @@ -3219,6 +3305,7 @@ stripe_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local = frame->local; index = local->node_index; + prev = cookie; mframe = local->orig_frame; if (!mframe) goto out; @@ -3239,6 +3326,12 
@@ stripe_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, mlocal->replies[index].count = count; mlocal->replies[index].vector = iov_dup (vector, count); + correct_file_size(stbuf, fctx, prev); + + if (local->stbuf_size < stbuf->ia_size) + local->stbuf_size = stbuf->ia_size; + local->stbuf_blocks += stbuf->ia_blocks; + if (!mlocal->iobref) mlocal->iobref = iobref_new (); iobref_merge (mlocal->iobref, iobref); @@ -3295,17 +3388,21 @@ stripe_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, * cause any bugs at higher levels */ memcpy (&tmp_stbuf, &mlocal->replies[0].stbuf, sizeof (struct iatt)); + tmp_stbuf.ia_size = local->stbuf_size; + tmp_stbuf.ia_blocks = local->stbuf_blocks; done: /* */ GF_FREE (mlocal->replies); tmp_iobref = mlocal->iobref; + /* work around for nfs truncated read. Bug 3774 */ + tmp_stbuf_p = &tmp_stbuf; + WIPE (tmp_stbuf_p); STRIPE_STACK_UNWIND (readv, mframe, op_ret, op_errno, final_vec, - final_count, &tmp_stbuf, tmp_iobref); + final_count, &tmp_stbuf, tmp_iobref, NULL); iobref_unref (tmp_iobref); - if (final_vec) - GF_FREE (final_vec); + GF_FREE (final_vec); } goto out; @@ -3317,7 +3414,7 @@ check_size: STACK_WIND (mframe, stripe_readv_fstat_cbk, (fctx->xl_array[index]), (fctx->xl_array[index])->fops->fstat, - mlocal->fd); + mlocal->fd, NULL); } out: @@ -3329,7 +3426,7 @@ end: int32_t stripe_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, - size_t size, off_t offset) + size_t size, off_t offset, uint32_t flags, dict_t *xdata) { int32_t op_errno = EINVAL; int32_t idx = 0; @@ -3342,6 +3439,7 @@ stripe_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, uint64_t stripe_size = 0; off_t rounded_start = 0; off_t frame_offset = offset; + off_t dest_offset = 0; stripe_local_t *local = NULL; call_frame_t *rframe = NULL; stripe_local_t *rlocal = NULL; @@ -3352,7 +3450,7 @@ stripe_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, VALIDATE_OR_GOTO (fd, err); VALIDATE_OR_GOTO (fd->inode, err); - fd_ctx_get (fd, this, 
&tmp_fctx); + inode_ctx_get (fd->inode, this, &tmp_fctx); if (!tmp_fctx) { op_errno = EBADFD; goto err; @@ -3360,6 +3458,8 @@ stripe_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, fctx = (stripe_fd_ctx_t *)(long)tmp_fctx; stripe_size = fctx->stripe_size; + STRIPE_VALIDATE_FCTX (fctx, err); + if (!stripe_size) { gf_log (this->name, GF_LOG_DEBUG, "Wrong stripe size for the file"); @@ -3374,8 +3474,7 @@ stripe_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, rounded_end = roof (offset+size, stripe_size); num_stripe = (rounded_end- rounded_start)/stripe_size; - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; @@ -3383,8 +3482,8 @@ stripe_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, frame->local = local; /* This is where all the vectors should be copied. */ - local->replies = GF_CALLOC (num_stripe, sizeof (struct readv_replies), - gf_stripe_mt_readv_replies); + local->replies = GF_CALLOC (num_stripe, sizeof (struct stripe_replies), + gf_stripe_mt_stripe_replies); if (!local->replies) { op_errno = ENOMEM; goto err; @@ -3399,8 +3498,7 @@ stripe_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, for (index = off_index; index < (num_stripe + off_index); index++) { rframe = copy_frame (frame); - rlocal = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + rlocal = mem_get0 (this->local_pool); if (!rlocal) { op_errno = ENOMEM; goto err; @@ -3414,9 +3512,16 @@ stripe_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, rlocal->readv_size = frame_size; rframe->local = rlocal; idx = (index % fctx->stripe_count); + + if (fctx->stripe_coalesce) + dest_offset = coalesced_offset(frame_offset, + stripe_size, fctx->stripe_count); + else + dest_offset = frame_offset; + STACK_WIND (rframe, stripe_readv_cbk, fctx->xl_array[idx], fctx->xl_array[idx]->fops->readv, - fd, frame_size, frame_offset); + fd, frame_size, dest_offset, flags, 
xdata); frame_offset += frame_size; } @@ -3426,7 +3531,7 @@ err: if (rframe) STRIPE_STACK_DESTROY (rframe); - STRIPE_STACK_UNWIND (readv, frame, -1, op_errno, NULL, 0, NULL, NULL); + STRIPE_STACK_UNWIND (readv, frame, -1, op_errno, NULL, 0, NULL, NULL, NULL); return 0; } @@ -3434,11 +3539,15 @@ err: int32_t stripe_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) + struct iatt *postbuf, dict_t *xdata) { int32_t callcnt = 0; stripe_local_t *local = NULL; + stripe_local_t *mlocal = NULL; call_frame_t *prev = NULL; + call_frame_t *mframe = NULL; + struct stripe_replies *reply = NULL; + int32_t i = 0; if (!this || !frame || !frame->local || !cookie) { gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); @@ -3447,39 +3556,82 @@ stripe_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, prev = cookie; local = frame->local; + mframe = local->orig_frame; + mlocal = mframe->local; LOCK(&frame->lock); { - callcnt = ++local->call_count; + callcnt = ++mlocal->call_count; + + mlocal->replies[local->node_index].op_ret = op_ret; + mlocal->replies[local->node_index].op_errno = op_errno; - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "%s returned error %s", - prev->this->name, strerror (op_errno)); - local->op_errno = op_errno; - local->op_ret = -1; - } if (op_ret >= 0) { - local->op_ret += op_ret; - local->post_buf = *postbuf; - local->pre_buf = *prebuf; + mlocal->post_buf = *postbuf; + mlocal->pre_buf = *prebuf; + + mlocal->prebuf_blocks += prebuf->ia_blocks; + mlocal->postbuf_blocks += postbuf->ia_blocks; + + correct_file_size(prebuf, mlocal->fctx, prev); + correct_file_size(postbuf, mlocal->fctx, prev); + + if (mlocal->prebuf_size < prebuf->ia_size) + mlocal->prebuf_size = prebuf->ia_size; + if (mlocal->postbuf_size < postbuf->ia_size) + mlocal->postbuf_size = postbuf->ia_size; } } UNLOCK (&frame->lock); - if ((callcnt == local->wind_count) && local->unwind) { 
- STRIPE_STACK_UNWIND (writev, frame, local->op_ret, - local->op_errno, &local->pre_buf, - &local->post_buf); + if ((callcnt == mlocal->wind_count) && mlocal->unwind) { + mlocal->pre_buf.ia_size = mlocal->prebuf_size; + mlocal->pre_buf.ia_blocks = mlocal->prebuf_blocks; + mlocal->post_buf.ia_size = mlocal->postbuf_size; + mlocal->post_buf.ia_blocks = mlocal->postbuf_blocks; + + /* + * Only return the number of consecutively written bytes up until + * the first error. Only return an error if it occurs first. + * + * When a short write occurs, the application should retry at the + * appropriate offset, at which point we'll potentially pass back + * the error. + */ + for (i = 0, reply = mlocal->replies; i < mlocal->wind_count; + i++, reply++) { + if (reply->op_ret == -1) { + gf_log(this->name, GF_LOG_DEBUG, "reply %d " + "returned error %s", i, + strerror(reply->op_errno)); + if (!mlocal->op_ret) { + mlocal->op_ret = -1; + mlocal->op_errno = reply->op_errno; + } + break; + } + + mlocal->op_ret += reply->op_ret; + + if (reply->op_ret < reply->requested_size) + break; + } + + GF_FREE(mlocal->replies); + + STRIPE_STACK_UNWIND (writev, mframe, mlocal->op_ret, + mlocal->op_errno, &mlocal->pre_buf, + &mlocal->post_buf, NULL); } out: + STRIPE_STACK_DESTROY(frame); return 0; } int32_t stripe_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, int32_t count, off_t offset, - struct iobref *iobref) + uint32_t flags, struct iobref *iobref, dict_t *xdata) { struct iovec *tmp_vec = NULL; stripe_local_t *local = NULL; @@ -3493,13 +3645,19 @@ stripe_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t fill_size = 0; uint64_t stripe_size = 0; uint64_t tmp_fctx = 0; + off_t dest_offset = 0; + off_t rounded_start = 0; + off_t rounded_end = 0; + int32_t total_chunks = 0; + call_frame_t *wframe = NULL; + stripe_local_t *wlocal = NULL; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (fd, err); VALIDATE_OR_GOTO (fd->inode, err); 
- fd_ctx_get (fd, this, &tmp_fctx); + inode_ctx_get (fd->inode, this, &tmp_fctx); if (!tmp_fctx) { op_errno = EINVAL; goto err; @@ -3507,22 +3665,51 @@ stripe_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, fctx = (stripe_fd_ctx_t *)(long)tmp_fctx; stripe_size = fctx->stripe_size; + STRIPE_VALIDATE_FCTX (fctx, err); + /* File has to be stripped across the child nodes */ for (idx = 0; idx< count; idx ++) { total_size += vector[idx].iov_len; } remaining_size = total_size; - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; } frame->local = local; local->stripe_size = stripe_size; + local->fctx = fctx; + if (!stripe_size) { + gf_log (this->name, GF_LOG_DEBUG, + "Wrong stripe size for the file"); + op_errno = EINVAL; + goto err; + } + + rounded_start = floor(offset, stripe_size); + rounded_end = roof(offset + total_size, stripe_size); + total_chunks = (rounded_end - rounded_start) / stripe_size; + local->replies = GF_CALLOC(total_chunks, sizeof(struct stripe_replies), + gf_stripe_mt_stripe_replies); + if (!local->replies) { + op_errno = ENOMEM; + goto err; + } + + total_chunks = 0; while (1) { + wframe = copy_frame(frame); + wlocal = mem_get0(this->local_pool); + if (!wlocal) { + op_errno = ENOMEM; + goto err; + } + wlocal->orig_frame = frame; + wframe->local = wlocal; + /* Send striped chunk of the vector to child nodes appropriately. */ idx = (((offset + offset_offset) / @@ -3550,47 +3737,589 @@ stripe_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, if (remaining_size == 0) local->unwind = 1; - STACK_WIND (frame, stripe_writev_cbk, fctx->xl_array[idx], + /* + * Store off the request index (with respect to the chunk of the + * initial offset) and the size of the request. This is required + * in the callback to calculate an appropriate return value in + * the event of a write failure in one or more requests. 
+ */ + wlocal->node_index = total_chunks; + local->replies[total_chunks].requested_size = fill_size; + + dest_offset = offset + offset_offset; + if (fctx->stripe_coalesce) + dest_offset = coalesced_offset(dest_offset, + local->stripe_size, fctx->stripe_count); + + STACK_WIND (wframe, stripe_writev_cbk, fctx->xl_array[idx], fctx->xl_array[idx]->fops->writev, fd, tmp_vec, - tmp_count, offset + offset_offset, iobref); + tmp_count, dest_offset, flags, iobref, + xdata); + GF_FREE (tmp_vec); offset_offset += fill_size; + total_chunks++; if (remaining_size == 0) break; } return 0; err: - STRIPE_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL); + if (wframe) + STRIPE_STACK_DESTROY(wframe); + + STRIPE_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL); return 0; } int32_t -stripe_release (xlator_t *this, fd_t *fd) +stripe_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { + int32_t callcnt = 0; + stripe_local_t *local = NULL; + stripe_local_t *mlocal = NULL; + call_frame_t *prev = NULL; + call_frame_t *mframe = NULL; + + if (!this || !frame || !frame->local || !cookie) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; + } + + prev = cookie; + local = frame->local; + mframe = local->orig_frame; + mlocal = mframe->local; + + LOCK(&frame->lock); + { + callcnt = ++mlocal->call_count; + + if (op_ret == 0) { + mlocal->post_buf = *postbuf; + mlocal->pre_buf = *prebuf; + + mlocal->prebuf_blocks += prebuf->ia_blocks; + mlocal->postbuf_blocks += postbuf->ia_blocks; + + correct_file_size(prebuf, mlocal->fctx, prev); + correct_file_size(postbuf, mlocal->fctx, prev); + + if (mlocal->prebuf_size < prebuf->ia_size) + mlocal->prebuf_size = prebuf->ia_size; + if (mlocal->postbuf_size < postbuf->ia_size) + mlocal->postbuf_size = postbuf->ia_size; + } + + /* return the first failure */ + if (mlocal->op_ret == 0) { + mlocal->op_ret = op_ret; + 
mlocal->op_errno = op_errno; + } + } + UNLOCK (&frame->lock); + + if ((callcnt == mlocal->wind_count) && mlocal->unwind) { + mlocal->pre_buf.ia_size = mlocal->prebuf_size; + mlocal->pre_buf.ia_blocks = mlocal->prebuf_blocks; + mlocal->post_buf.ia_size = mlocal->postbuf_size; + mlocal->post_buf.ia_blocks = mlocal->postbuf_blocks; + + STRIPE_STACK_UNWIND (fallocate, mframe, mlocal->op_ret, + mlocal->op_errno, &mlocal->pre_buf, + &mlocal->post_buf, NULL); + } +out: + STRIPE_STACK_DESTROY(frame); + return 0; +} + +int32_t +stripe_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata) +{ + stripe_local_t *local = NULL; + stripe_fd_ctx_t *fctx = NULL; + int32_t op_errno = 1; + int32_t idx = 0; + int32_t offset_offset = 0; + int32_t remaining_size = 0; + off_t fill_size = 0; + uint64_t stripe_size = 0; uint64_t tmp_fctx = 0; + off_t dest_offset = 0; + call_frame_t *fframe = NULL; + stripe_local_t *flocal = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + + inode_ctx_get (fd->inode, this, &tmp_fctx); + if (!tmp_fctx) { + op_errno = EINVAL; + goto err; + } + fctx = (stripe_fd_ctx_t *)(long)tmp_fctx; + stripe_size = fctx->stripe_size; + + STRIPE_VALIDATE_FCTX (fctx, err); + + remaining_size = len; + + local = mem_get0 (this->local_pool); + if (!local) { + op_errno = ENOMEM; + goto err; + } + frame->local = local; + local->stripe_size = stripe_size; + local->fctx = fctx; + + if (!stripe_size) { + gf_log (this->name, GF_LOG_DEBUG, + "Wrong stripe size for the file"); + op_errno = EINVAL; + goto err; + } + + while (1) { + fframe = copy_frame(frame); + flocal = mem_get0(this->local_pool); + if (!flocal) { + op_errno = ENOMEM; + goto err; + } + flocal->orig_frame = frame; + fframe->local = flocal; + + /* send fallocate request to the associated child node */ + idx = (((offset + offset_offset) / + local->stripe_size) % 
fctx->stripe_count); + + fill_size = (local->stripe_size - + ((offset + offset_offset) % local->stripe_size)); + if (fill_size > remaining_size) + fill_size = remaining_size; + + remaining_size -= fill_size; + + local->wind_count++; + if (remaining_size == 0) + local->unwind = 1; + + dest_offset = offset + offset_offset; + if (fctx->stripe_coalesce) + dest_offset = coalesced_offset(dest_offset, + local->stripe_size, fctx->stripe_count); + + /* + * TODO: Create a separate handler for coalesce mode that sends a + * single fallocate per-child (since the ranges are linear). + */ + STACK_WIND(fframe, stripe_fallocate_cbk, fctx->xl_array[idx], + fctx->xl_array[idx]->fops->fallocate, fd, mode, + dest_offset, fill_size, xdata); + + offset_offset += fill_size; + if (remaining_size == 0) + break; + } + + return 0; +err: + if (fframe) + STRIPE_STACK_DESTROY(fframe); + + STRIPE_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + + +int32_t +stripe_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + int32_t callcnt = 0; + stripe_local_t *local = NULL; + stripe_local_t *mlocal = NULL; + call_frame_t *prev = NULL; + call_frame_t *mframe = NULL; + + if (!this || !frame || !frame->local || !cookie) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; + } + + prev = cookie; + local = frame->local; + mframe = local->orig_frame; + mlocal = mframe->local; + + LOCK(&frame->lock); + { + callcnt = ++mlocal->call_count; + + if (op_ret == 0) { + mlocal->post_buf = *postbuf; + mlocal->pre_buf = *prebuf; + + mlocal->prebuf_blocks += prebuf->ia_blocks; + mlocal->postbuf_blocks += postbuf->ia_blocks; + + correct_file_size(prebuf, mlocal->fctx, prev); + correct_file_size(postbuf, mlocal->fctx, prev); + + if (mlocal->prebuf_size < prebuf->ia_size) + mlocal->prebuf_size = prebuf->ia_size; + if (mlocal->postbuf_size < postbuf->ia_size) + 
mlocal->postbuf_size = postbuf->ia_size; + } + + /* return the first failure */ + if (mlocal->op_ret == 0) { + mlocal->op_ret = op_ret; + mlocal->op_errno = op_errno; + } + } + UNLOCK (&frame->lock); + + if ((callcnt == mlocal->wind_count) && mlocal->unwind) { + mlocal->pre_buf.ia_size = mlocal->prebuf_size; + mlocal->pre_buf.ia_blocks = mlocal->prebuf_blocks; + mlocal->post_buf.ia_size = mlocal->postbuf_size; + mlocal->post_buf.ia_blocks = mlocal->postbuf_blocks; + + STRIPE_STACK_UNWIND (discard, mframe, mlocal->op_ret, + mlocal->op_errno, &mlocal->pre_buf, + &mlocal->post_buf, NULL); + } +out: + STRIPE_STACK_DESTROY(frame); + return 0; +} + +int32_t +stripe_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + stripe_local_t *local = NULL; stripe_fd_ctx_t *fctx = NULL; + int32_t op_errno = 1; + int32_t idx = 0; + int32_t offset_offset = 0; + int32_t remaining_size = 0; + off_t fill_size = 0; + uint64_t stripe_size = 0; + uint64_t tmp_fctx = 0; + off_t dest_offset = 0; + call_frame_t *fframe = NULL; + stripe_local_t *flocal = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + + inode_ctx_get (fd->inode, this, &tmp_fctx); + if (!tmp_fctx) { + op_errno = EINVAL; + goto err; + } + fctx = (stripe_fd_ctx_t *)(long)tmp_fctx; + stripe_size = fctx->stripe_size; + + STRIPE_VALIDATE_FCTX (fctx, err); + + remaining_size = len; + + local = mem_get0 (this->local_pool); + if (!local) { + op_errno = ENOMEM; + goto err; + } + frame->local = local; + local->stripe_size = stripe_size; + local->fctx = fctx; + + if (!stripe_size) { + gf_log (this->name, GF_LOG_DEBUG, + "Wrong stripe size for the file"); + op_errno = EINVAL; + goto err; + } + + while (1) { + fframe = copy_frame(frame); + flocal = mem_get0(this->local_pool); + if (!flocal) { + op_errno = ENOMEM; + goto err; + } + flocal->orig_frame = frame; + fframe->local = flocal; + + /* send 
discard request to the associated child node */ + idx = (((offset + offset_offset) / + local->stripe_size) % fctx->stripe_count); + + fill_size = (local->stripe_size - + ((offset + offset_offset) % local->stripe_size)); + if (fill_size > remaining_size) + fill_size = remaining_size; + + remaining_size -= fill_size; + + local->wind_count++; + if (remaining_size == 0) + local->unwind = 1; + + dest_offset = offset + offset_offset; + if (fctx->stripe_coalesce) + dest_offset = coalesced_offset(dest_offset, + local->stripe_size, fctx->stripe_count); + + /* + * TODO: Create a separate handler for coalesce mode that sends a + * single discard per-child (since the ranges are linear). + */ + STACK_WIND(fframe, stripe_discard_cbk, fctx->xl_array[idx], + fctx->xl_array[idx]->fops->discard, fd, dest_offset, + fill_size, xdata); + + offset_offset += fill_size; + if (remaining_size == 0) + break; + } + + return 0; +err: + if (fframe) + STRIPE_STACK_DESTROY(fframe); + + STRIPE_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + +int32_t +stripe_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + int32_t callcnt = 0; + stripe_local_t *local = NULL; + stripe_local_t *mlocal = NULL; + call_frame_t *prev = NULL; + call_frame_t *mframe = NULL; + + if (!this || !frame || !frame->local || !cookie) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; + } + + prev = cookie; + local = frame->local; + mframe = local->orig_frame; + mlocal = mframe->local; + + LOCK(&frame->lock); + { + callcnt = ++mlocal->call_count; + + if (op_ret == 0) { + mlocal->post_buf = *postbuf; + mlocal->pre_buf = *prebuf; + + mlocal->prebuf_blocks += prebuf->ia_blocks; + mlocal->postbuf_blocks += postbuf->ia_blocks; + + correct_file_size(prebuf, mlocal->fctx, prev); + correct_file_size(postbuf, mlocal->fctx, prev); + + if (mlocal->prebuf_size < 
prebuf->ia_size) + mlocal->prebuf_size = prebuf->ia_size; + if (mlocal->postbuf_size < postbuf->ia_size) + mlocal->postbuf_size = postbuf->ia_size; + } + + /* return the first failure */ + if (mlocal->op_ret == 0) { + mlocal->op_ret = op_ret; + mlocal->op_errno = op_errno; + } + } + UNLOCK (&frame->lock); + + if ((callcnt == mlocal->wind_count) && mlocal->unwind) { + mlocal->pre_buf.ia_size = mlocal->prebuf_size; + mlocal->pre_buf.ia_blocks = mlocal->prebuf_blocks; + mlocal->post_buf.ia_size = mlocal->postbuf_size; + mlocal->post_buf.ia_blocks = mlocal->postbuf_blocks; + + STRIPE_STACK_UNWIND (zerofill, mframe, mlocal->op_ret, + mlocal->op_errno, &mlocal->pre_buf, + &mlocal->post_buf, NULL); + } +out: + STRIPE_STACK_DESTROY(frame); + return 0; +} + +int32_t +stripe_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + stripe_local_t *local = NULL; + stripe_fd_ctx_t *fctx = NULL; + int32_t op_errno = 1; + int32_t idx = 0; + int32_t offset_offset = 0; + int32_t remaining_size = 0; + off_t fill_size = 0; + uint64_t stripe_size = 0; + uint64_t tmp_fctx = 0; + off_t dest_offset = 0; + call_frame_t *fframe = NULL; + stripe_local_t *flocal = NULL; + VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); - fd_ctx_del (fd, this, &tmp_fctx); + inode_ctx_get (fd->inode, this, &tmp_fctx); if (!tmp_fctx) { + op_errno = EINVAL; goto err; } + fctx = (stripe_fd_ctx_t *)(long)tmp_fctx; + stripe_size = fctx->stripe_size; + + STRIPE_VALIDATE_FCTX (fctx, err); + remaining_size = len; + + local = mem_get0 (this->local_pool); + if (!local) { + op_errno = ENOMEM; + goto err; + } fctx = (stripe_fd_ctx_t *)(long)tmp_fctx; + stripe_size = fctx->stripe_size; - if (!fctx->static_array) - GF_FREE (fctx->xl_array); + STRIPE_VALIDATE_FCTX (fctx, err); - GF_FREE (fctx); + remaining_size = len; + + local = mem_get0 (this->local_pool); + if (!local) { + op_errno = ENOMEM; + goto 
err; + } + fctx = (stripe_fd_ctx_t *)(long)tmp_fctx; + stripe_size = fctx->stripe_size; + + STRIPE_VALIDATE_FCTX (fctx, err); + + remaining_size = len; + + local = mem_get0 (this->local_pool); + if (!local) { + op_errno = ENOMEM; + goto err; + } + frame->local = local; + local->stripe_size = stripe_size; + local->fctx = fctx; + + if (!stripe_size) { + gf_log (this->name, GF_LOG_DEBUG, + "Wrong stripe size for the file"); + op_errno = EINVAL; + goto err; + } + + while (1) { + fframe = copy_frame(frame); + flocal = mem_get0(this->local_pool); + if (!flocal) { + op_errno = ENOMEM; + goto err; + } + flocal->orig_frame = frame; + fframe->local = flocal; + + idx = (((offset + offset_offset) / + local->stripe_size) % fctx->stripe_count); + + fill_size = (local->stripe_size - + ((offset + offset_offset) % local->stripe_size)); + if (fill_size > remaining_size) + fill_size = remaining_size; + + remaining_size -= fill_size; + + local->wind_count++; + if (remaining_size == 0) + local->unwind = 1; + + dest_offset = offset + offset_offset; + if (fctx->stripe_coalesce) + dest_offset = coalesced_offset(dest_offset, + local->stripe_size, + fctx->stripe_count); + + STACK_WIND(fframe, stripe_zerofill_cbk, fctx->xl_array[idx], + fctx->xl_array[idx]->fops->zerofill, fd, + dest_offset, fill_size, xdata); + offset_offset += fill_size; + if (remaining_size == 0) + break; + } + return 0; err: + if (fframe) + STRIPE_STACK_DESTROY(fframe); + + STRIPE_STACK_UNWIND (zerofill, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + +int32_t +stripe_release (xlator_t *this, fd_t *fd) +{ return 0; } +int +stripe_forget (xlator_t *this, inode_t *inode) +{ + uint64_t tmp_fctx = 0; + stripe_fd_ctx_t *fctx = NULL; + + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (inode, err); + + (void) inode_ctx_del (inode, this, &tmp_fctx); + if (!tmp_fctx) { + goto err; + } + + fctx = (stripe_fd_ctx_t *)(long)tmp_fctx; + + if (!fctx->static_array) + GF_FREE (fctx->xl_array); + + GF_FREE (fctx); +err: + 
return 0; +} int32_t notify (xlator_t *this, int32_t event, void *data, ...) @@ -3598,6 +4327,7 @@ notify (xlator_t *this, int32_t event, void *data, ...) stripe_private_t *priv = NULL; int down_client = 0; int i = 0; + gf_boolean_t heard_from_all_children = _gf_false; if (!this) return 0; @@ -3609,30 +4339,34 @@ notify (xlator_t *this, int32_t event, void *data, ...) switch (event) { case GF_EVENT_CHILD_UP: - case GF_EVENT_CHILD_CONNECTING: { /* get an index number to set */ for (i = 0; i < priv->child_count; i++) { if (data == priv->xl_array[i]) break; } - priv->state[i] = 1; - for (i = 0; i < priv->child_count; i++) { - if (!priv->state[i]) - down_client++; + + if (priv->child_count == i) { + gf_log (this->name, GF_LOG_ERROR, + "got GF_EVENT_CHILD_UP bad subvolume %s", + data? ((xlator_t *)data)->name: NULL); + break; } LOCK (&priv->lock); { - priv->nodes_down = down_client; if (data == FIRST_CHILD (this)) priv->first_child_down = 0; - if (!priv->nodes_down) - default_notify (this, event, data); + priv->last_event[i] = event; } UNLOCK (&priv->lock); } break; + case GF_EVENT_CHILD_CONNECTING: + { + // 'CONNECTING' doesn't ensure its CHILD_UP, so do nothing + goto out; + } case GF_EVENT_CHILD_DOWN: { /* get an index number to set */ @@ -3640,20 +4374,19 @@ notify (xlator_t *this, int32_t event, void *data, ...) if (data == priv->xl_array[i]) break; } - priv->state[i] = 0; - for (i = 0; i < priv->child_count; i++) { - if (!priv->state[i]) - down_client++; + + if (priv->child_count == i) { + gf_log (this->name, GF_LOG_ERROR, + "got GF_EVENT_CHILD_DOWN bad subvolume %s", + data? ((xlator_t *)data)->name: NULL); + break; } LOCK (&priv->lock); { - priv->nodes_down = down_client; - if (data == FIRST_CHILD (this)) priv->first_child_down = 1; - if (priv->nodes_down) - default_notify (this, event, data); + priv->last_event[i] = event; } UNLOCK (&priv->lock); } @@ -3663,79 +4396,252 @@ notify (xlator_t *this, int32_t event, void *data, ...) 
{ /* */ default_notify (this, event, data); + goto out; } break; } + // Consider child as down if it's last_event is not CHILD_UP + for (i = 0, down_client = 0; i < priv->child_count; i++) + if (priv->last_event[i] != GF_EVENT_CHILD_UP) + down_client++; + + LOCK (&priv->lock); + { + priv->nodes_down = down_client; + } + UNLOCK (&priv->lock); + + heard_from_all_children = _gf_true; + for (i = 0; i < priv->child_count; i++) + if (!priv->last_event[i]) + heard_from_all_children = _gf_false; + + if (heard_from_all_children) + default_notify (this, event, data); +out: return 0; } int -set_stripe_block_size (xlator_t *this, stripe_private_t *priv, char *data) +stripe_setxattr_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, dict_t *xdata) { - int ret = -1; - char *tmp_str = NULL; - char *tmp_str1 = NULL; - char *dup_str = NULL; - char *stripe_str = NULL; - char *pattern = NULL; - char *num = NULL; - struct stripe_options *temp_stripeopt = NULL; - struct stripe_options *stripe_opt = NULL; - - if (!this || !priv || !data) - goto out; + int ret = -1; + int call_cnt = 0; + stripe_local_t *local = NULL; - /* Get the pattern for striping. - "option block-size *avi:10MB" etc */ - stripe_str = strtok_r (data, ",", &tmp_str); - while (stripe_str) { - dup_str = gf_strdup (stripe_str); - stripe_opt = CALLOC (1, sizeof (struct stripe_options)); - if (!stripe_opt) { - GF_FREE (dup_str); - goto out; - } + if (!frame || !frame->local || !this) { + gf_log ("", GF_LOG_ERROR, "Possible NULL deref"); + return ret; + } - pattern = strtok_r (dup_str, ":", &tmp_str1); - num = strtok_r (NULL, ":", &tmp_str1); - if (!num) { - num = pattern; - pattern = "*"; - } - if (gf_string2bytesize (num, &stripe_opt->block_size) != 0) { - gf_log (this->name, GF_LOG_ERROR, - "invalid number format \"%s\"", num); - goto out; - } + local = frame->local; - if (stripe_opt->block_size < 512) { - gf_log (this->name, GF_LOG_ERROR, "Invalid Block-size: " - "%s. 
Should be atleast 512 bytes", num); - goto out; + LOCK (&frame->lock); + { + call_cnt = --local->wind_count; + + /** + * We overwrite ->op_* values here for subsequent faliure + * conditions, hence we propogate the last errno down the + * stack. + */ + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; + goto unlock; } - if (stripe_opt->block_size % 512) { - gf_log (this->name, GF_LOG_ERROR, "Block-size: %s should" - " be a multiple of 512 bytes", num); - goto out; + } + + unlock: + UNLOCK (&frame->lock); + + if (!call_cnt) { + STRIPE_STACK_UNWIND (setxattr, frame, local->op_ret, + local->op_errno, xdata); + } + + return 0; +} + +#ifdef HAVE_BD_XLATOR +int +stripe_is_bd (dict_t *this, char *key, data_t *value, void *data) +{ + gf_boolean_t *is_bd = data; + + if (data == NULL) + return 0; + + if (XATTR_IS_BD (key)) + *is_bd = _gf_true; + + return 0; +} + +inline gf_boolean_t +stripe_setxattr_is_bd (dict_t *dict) +{ + gf_boolean_t is_bd = _gf_false; + + if (dict == NULL) + goto out; + + dict_foreach (dict, stripe_is_bd, &is_bd); +out: + return is_bd; +} +#else +#define stripe_setxattr_is_bd(dict) _gf_false +#endif + +int +stripe_setxattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, dict_t *dict, int flags, dict_t *xdata) +{ + int32_t op_errno = EINVAL; + xlator_list_t *trav = NULL; + stripe_private_t *priv = NULL; + stripe_local_t *local = NULL; + int i = 0; + gf_boolean_t is_bd = _gf_false; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + + GF_IF_INTERNAL_XATTR_GOTO ("trusted.*stripe*", dict, + op_errno, err); + + priv = this->private; + trav = this->children; + + local = mem_get0 (this->local_pool); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + frame->local = local; + local->wind_count = priv->child_count; + local->op_ret = local->op_errno = 0; + + is_bd = stripe_setxattr_is_bd (dict); + + /** + * Set xattrs for directories on all 
subvolumes. Additionally + * this power is only given to a special client. Bd xlator + * also needs xattrs for regular files (ie LVs) + */ + if (((frame->root->pid == GF_CLIENT_PID_GSYNCD) && + IA_ISDIR (loc->inode->ia_type)) || is_bd) { + for (i = 0; i < priv->child_count; i++, trav = trav->next) { + STACK_WIND (frame, stripe_setxattr_cbk, + trav->xlator, trav->xlator->fops->setxattr, + loc, dict, flags, xdata); } + } else { + local->wind_count = 1; + STACK_WIND (frame, stripe_setxattr_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, + loc, dict, flags, xdata); + } - memcpy (stripe_opt->path_pattern, pattern, strlen (pattern)); + return 0; +err: + STRIPE_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); + return 0; +} - gf_log (this->name, GF_LOG_DEBUG, - "block-size : pattern %s : size %"PRId64, - stripe_opt->path_pattern, stripe_opt->block_size); - if (!priv->pattern) { - priv->pattern = stripe_opt; - } else { - temp_stripeopt = priv->pattern; - while (temp_stripeopt->next) - temp_stripeopt = temp_stripeopt->next; - temp_stripeopt->next = stripe_opt; +int +stripe_fsetxattr_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, dict_t *xdata) +{ + STRIPE_STACK_UNWIND (fsetxattr, frame, op_ret, op_errno, xdata); + return 0; +} + + +int +stripe_is_special_key (dict_t *this, + char *key, + data_t *value, + void *data) +{ + gf_boolean_t *is_special = NULL; + + if (data == NULL) { + goto out; + } + + is_special = data; + + if (XATTR_IS_LOCKINFO (key) || XATTR_IS_BD (key)) + *is_special = _gf_true; + +out: + return 0; +} + +int32_t +stripe_fsetxattr_everyone_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *xdata) +{ + int call_count = 0; + stripe_local_t *local = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + call_count = --local->wind_count; + + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; } - stripe_str = strtok_r (NULL, ",", 
&tmp_str); - GF_FREE (dup_str); + } + UNLOCK (&frame->lock); + + if (call_count == 0) { + STRIPE_STACK_UNWIND (fsetxattr, frame, local->op_ret, + local->op_errno, NULL); + } + return 0; +} + +int +stripe_fsetxattr_to_everyone (call_frame_t *frame, xlator_t *this, fd_t *fd, + dict_t *dict, int flags, dict_t *xdata) +{ + xlator_list_t *trav = NULL; + stripe_private_t *priv = NULL; + int ret = -1; + stripe_local_t *local = NULL; + + priv = this->private; + + local = mem_get0 (this->local_pool); + if (local == NULL) { + goto out; + } + + frame->local = local; + + local->wind_count = priv->child_count; + + trav = this->children; + + while (trav) { + STACK_WIND (frame, stripe_fsetxattr_everyone_cbk, + trav->xlator, trav->xlator->fops->fsetxattr, + fd, dict, flags, xdata); + trav = trav->next; } ret = 0; @@ -3743,80 +4649,220 @@ out: return ret; } -int32_t -stripe_iatt_merge (struct iatt *from, struct iatt *to) +inline gf_boolean_t +stripe_fsetxattr_is_special (dict_t *dict) { - if (to->ia_size < from->ia_size) - to->ia_size = from->ia_size; - if (to->ia_mtime < from->ia_mtime) - to->ia_mtime = from->ia_mtime; - if (to->ia_ctime < from->ia_ctime) - to->ia_ctime = from->ia_ctime; - if (to->ia_atime < from->ia_atime) - to->ia_atime = from->ia_atime; - return 0; + gf_boolean_t is_spl = _gf_false; + + if (dict == NULL) { + goto out; + } + + dict_foreach (dict, stripe_is_special_key, &is_spl); + +out: + return is_spl; } -int32_t -stripe_readdirp_entry_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf) +int +stripe_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + dict_t *dict, int flags, dict_t *xdata) { - gf_dirent_t *entry = NULL; - stripe_local_t *local = NULL; - int32_t done = 0; + int32_t op_ret = -1, ret = -1, op_errno = EINVAL; + gf_boolean_t is_spl = _gf_false; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + GF_IF_INTERNAL_XATTR_GOTO 
("trusted.*stripe*", dict, + op_errno, err); + + is_spl = stripe_fsetxattr_is_special (dict); + if (is_spl) { + ret = stripe_fsetxattr_to_everyone (frame, this, fd, dict, + flags, xdata); + if (ret < 0) { + op_errno = ENOMEM; + goto err; + } - if (!this || !frame || !frame->local || !cookie) { - gf_log (this->name, GF_LOG_DEBUG, "possible NULL deref"); goto out; } - entry = cookie; + + STACK_WIND (frame, stripe_fsetxattr_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetxattr, + fd, dict, flags, xdata); +out: + return 0; +err: + STRIPE_STACK_UNWIND (fsetxattr, frame, op_ret, op_errno, NULL); + return 0; +} + +int +stripe_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + STRIPE_STACK_UNWIND (removexattr, frame, op_ret, op_errno, xdata); + return 0; +} + +int +stripe_removexattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *name, dict_t *xdata) +{ + int32_t op_errno = EINVAL; + + VALIDATE_OR_GOTO (this, err); + + GF_IF_NATIVE_XATTR_GOTO ("trusted.*stripe*", + name, op_errno, err); + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (loc, err); + + STACK_WIND (frame, stripe_removexattr_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->removexattr, + loc, name, xdata); + return 0; +err: + STRIPE_STACK_UNWIND (removexattr, frame, -1, op_errno, NULL); + return 0; +} + + +int +stripe_fremovexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + STRIPE_STACK_UNWIND (fremovexattr, frame, op_ret, op_errno, xdata); + return 0; +} + +int +stripe_fremovexattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *name, dict_t *xdata) +{ + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + GF_IF_NATIVE_XATTR_GOTO ("trusted.*stripe*", + name, op_errno, err); + + STACK_WIND (frame, stripe_fremovexattr_cbk, + 
FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fremovexattr, + fd, name, xdata); + return 0; + err: + STRIPE_STACK_UNWIND (fremovexattr, frame, op_ret, op_errno, xdata); + return 0; +} + +int32_t +stripe_readdirp_lookup_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, + inode_t *inode, struct iatt *stbuf, + dict_t *xattr, struct iatt *parent) +{ + stripe_local_t *local = NULL; + call_frame_t *main_frame = NULL; + stripe_local_t *main_local = NULL; + gf_dirent_t *entry = NULL; + call_frame_t *prev = NULL; + int done = 0; + local = frame->local; + prev = cookie; + + entry = local->dirent; + + main_frame = local->orig_frame; + main_local = main_frame->local; LOCK (&frame->lock); { - local->wind_count--; - if (!local->wind_count) + local->call_count--; + if (!local->call_count) done = 1; if (op_ret == -1) { local->op_errno = op_errno; local->op_ret = op_ret; goto unlock; } - stripe_iatt_merge (buf, &entry->d_stat); + + if (stripe_ctx_handle(this, prev, local, xattr)) + gf_log(this->name, GF_LOG_ERROR, + "Error getting fctx info from dict."); + + correct_file_size(stbuf, local->fctx, prev); + + stripe_iatt_merge (stbuf, &entry->d_stat); + local->stbuf_blocks += stbuf->ia_blocks; } unlock: UNLOCK(&frame->lock); if (done) { - frame->local = NULL; - STRIPE_STACK_UNWIND (readdir, frame, local->op_ret, - local->op_errno, &local->entries); + inode_ctx_put (entry->inode, this, + (uint64_t) (long)local->fctx); - gf_dirent_free (&local->entries); + done = 0; + LOCK (&main_frame->lock); + { + main_local->wind_count--; + if (!main_local->wind_count) + done = 1; + if (local->op_ret == -1) { + main_local->op_errno = local->op_errno; + main_local->op_ret = local->op_ret; + } + entry->d_stat.ia_blocks = local->stbuf_blocks; + } + UNLOCK (&main_frame->lock); + if (done) { + main_frame->local = NULL; + STRIPE_STACK_UNWIND (readdir, main_frame, + main_local->op_ret, + main_local->op_errno, + &main_local->entries, NULL); + gf_dirent_free 
(&main_local->entries); + stripe_local_wipe (main_local); + mem_put (main_local); + } + frame->local = NULL; stripe_local_wipe (local); - GF_FREE (local); + mem_put (local); + STRIPE_STACK_DESTROY (frame); } -out: - return 0; + return 0; } int32_t stripe_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, gf_dirent_t *orig_entries) + int32_t op_ret, int32_t op_errno, + gf_dirent_t *orig_entries, dict_t *xdata) { stripe_local_t *local = NULL; call_frame_t *prev = NULL; gf_dirent_t *local_entry = NULL; - int32_t ret = -1; gf_dirent_t *tmp_entry = NULL; xlator_list_t *trav = NULL; loc_t loc = {0, }; - inode_t *inode = NULL; - char *path; int32_t count = 0; stripe_private_t *priv = NULL; int32_t subvols = 0; + dict_t *xattrs = NULL; + call_frame_t *local_frame = NULL; + stripe_local_t *local_ent = NULL; if (!this || !frame || !frame->local || !cookie) { gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); @@ -3842,8 +4888,9 @@ stripe_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->op_ret = op_ret; list_splice_init (&orig_entries->list, &local->entries.list); - local->wind_count = op_ret * subvols; + local->wind_count = op_ret; } + } unlock: UNLOCK (&frame->lock); @@ -3851,8 +4898,10 @@ unlock: if (op_ret == -1) goto out; + xattrs = dict_new (); + if (xattrs) + (void) stripe_xattr_request_build (this, xattrs, 0, 0, 0, 0); count = op_ret; - ret = 0; list_for_each_entry_safe (local_entry, tmp_entry, (&local->entries.list), list) { @@ -3861,67 +4910,73 @@ unlock: if (!IA_ISREG (local_entry->d_stat.ia_type)) { LOCK (&frame->lock); { - local->wind_count -= subvols; + local->wind_count--; count = local->wind_count; } UNLOCK (&frame->lock); continue; } - inode = inode_new (local->fd->inode->table); - if (!inode) + local_frame = copy_frame (frame); + + if (!local_frame) { + op_errno = ENOMEM; + op_ret = -1; goto out; + } - loc.ino = inode->ino = local_entry->d_ino; - loc.inode = inode; - loc.parent = 
local->fd->inode; - ret = inode_path (local->fd->inode, local_entry->d_name, &path); - if (ret != -1) { - loc.path = path; - } else if (inode) { - ret = inode_path (inode, NULL, &path); - if (ret != -1) { - loc.path = path; - } else { - goto out; - } + local_ent = mem_get0 (this->local_pool); + if (!local_ent) { + op_errno = ENOMEM; + op_ret = -1; + goto out; } - loc.name = strrchr (loc.path, '/'); - loc.name++; + loc.inode = inode_ref (local_entry->inode); + + uuid_copy (loc.gfid, local_entry->d_stat.ia_gfid); + + local_ent->orig_frame = frame; + + local_ent->call_count = subvols; + + local_ent->dirent = local_entry; + + local_frame->local = local_ent; + trav = this->children; while (trav) { - STACK_WIND_COOKIE (frame, stripe_readdirp_entry_stat_cbk, - local_entry, trav->xlator, - trav->xlator->fops->stat, &loc); + STACK_WIND (local_frame, stripe_readdirp_lookup_cbk, + trav->xlator, trav->xlator->fops->lookup, + &loc, xattrs); trav = trav->next; } - inode_unref (loc.inode); + loc_wipe (&loc); } out: if (!count) { /* all entries are directories */ frame->local = NULL; STRIPE_STACK_UNWIND (readdir, frame, local->op_ret, - local->op_errno, &local->entries); + local->op_errno, &local->entries, NULL); gf_dirent_free (&local->entries); stripe_local_wipe (local); - GF_FREE (local); + mem_put (local); } - + if (xattrs) + dict_unref (xattrs); return 0; } int32_t stripe_readdirp (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t off) + fd_t *fd, size_t size, off_t off, dict_t *xdata) { stripe_local_t *local = NULL; stripe_private_t *priv = NULL; xlator_list_t *trav = NULL; int op_errno = -1; - VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (fd, err); @@ -3935,8 +4990,7 @@ stripe_readdirp (call_frame_t *frame, xlator_t *this, } /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; @@ -3956,15 
+5010,16 @@ stripe_readdirp (call_frame_t *frame, xlator_t *this, goto err; STACK_WIND (frame, stripe_readdirp_cbk, trav->xlator, - trav->xlator->fops->readdirp, fd, size, off); + trav->xlator->fops->readdirp, fd, size, off, xdata); return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; - STRIPE_STACK_UNWIND (readdir, frame, -1, op_errno, NULL); + STRIPE_STACK_UNWIND (readdir, frame, -1, op_errno, NULL, NULL); return 0; } + int32_t mem_acct_init (xlator_t *this) { @@ -3985,21 +5040,86 @@ out: return ret; } +static int +clear_pattern_list (stripe_private_t *priv) +{ + struct stripe_options *prev = NULL; + struct stripe_options *trav = NULL; + int ret = -1; + + GF_VALIDATE_OR_GOTO ("stripe", priv, out); + + trav = priv->pattern; + priv->pattern = NULL; + while (trav) { + prev = trav; + trav = trav->next; + GF_FREE (prev); + } + + ret = 0; + out: + return ret; + + +} + int reconfigure (xlator_t *this, dict_t *options) { - stripe_private_t *priv = NULL; - int ret = -1; + stripe_private_t *priv = NULL; + data_t *data = NULL; + int ret = -1; + volume_option_t *opt = NULL; + + GF_ASSERT (this); + GF_ASSERT (this->private); - priv = this->private; + priv = this->private; - GF_OPTION_RECONF ("block-size", priv->block_size, options, size, out); ret = 0; -out: - return ret; + LOCK (&priv->lock); + { + ret = clear_pattern_list (priv); + if (ret) + goto unlock; + + data = dict_get (options, "block-size"); + if (data) { + ret = set_stripe_block_size (this, priv, data->data); + if (ret) + goto unlock; + } else { + opt = xlator_volume_option_get (this, "block-size"); + if (!opt) { + gf_log (this->name, GF_LOG_WARNING, + "option 'block-size' not found"); + ret = -1; + goto unlock; + } + + if (gf_string2bytesize (opt->default_value, &priv->block_size)){ + gf_log (this->name, GF_LOG_ERROR, + "Unable to set default block-size "); + ret = -1; + goto unlock; + } + } + + GF_OPTION_RECONF("coalesce", priv->coalesce, options, bool, + unlock); + } + unlock: + UNLOCK (&priv->lock); + 
if (ret) + goto out; + + ret = 0; + out: + return ret; } @@ -4012,6 +5132,7 @@ int32_t init (xlator_t *this) { stripe_private_t *priv = NULL; + volume_option_t *opt = NULL; xlator_list_t *trav = NULL; data_t *data = NULL; int32_t count = 0; @@ -4055,9 +5176,9 @@ init (xlator_t *this) if (!priv->xl_array) goto out; - priv->state = GF_CALLOC (count, sizeof (int8_t), - gf_stripe_mt_int8_t); - if (!priv->state) + priv->last_event = GF_CALLOC (count, sizeof (int), + gf_stripe_mt_int32_t); + if (!priv->last_event) goto out; priv->child_count = count; @@ -4077,30 +5198,56 @@ init (xlator_t *this) goto out; } - - GF_OPTION_INIT ("block-size", priv->block_size, size, out); - - /* option stripe-pattern *avi:1GB,*pdf:4096 */ - data = dict_get (this->options, "block-size"); - if (data) { - ret = set_stripe_block_size (this, priv, data->data); - if (ret) - goto out; + ret = 0; + LOCK (&priv->lock); + { + opt = xlator_volume_option_get (this, "block-size"); + if (!opt) { + gf_log (this->name, GF_LOG_WARNING, + "option 'block-size' not found"); + ret = -1; + goto unlock; + } + if (gf_string2bytesize (opt->default_value, &priv->block_size)){ + gf_log (this->name, GF_LOG_ERROR, + "Unable to set default block-size "); + ret = -1; + goto unlock; + } + /* option stripe-pattern *avi:1GB,*pdf:16K */ + data = dict_get (this->options, "block-size"); + if (data) { + ret = set_stripe_block_size (this, priv, data->data); + if (ret) + goto unlock; + } } + unlock: + UNLOCK (&priv->lock); + if (ret) + goto out; GF_OPTION_INIT ("use-xattr", priv->xattr_supported, bool, out); - /* notify related */ priv->nodes_down = priv->child_count; - this->private = priv; + GF_OPTION_INIT("coalesce", priv->coalesce, bool, out); + + this->local_pool = mem_pool_new (stripe_local_t, 128); + if (!this->local_pool) { + ret = -1; + gf_log (this->name, GF_LOG_ERROR, + "failed to create local_t's memory pool"); + goto out; + } + + this->private = priv; ret = 0; out: if (ret) { if (priv) { - if (priv->xl_array) - 
GF_FREE (priv->xl_array); + GF_FREE (priv->xl_array); GF_FREE (priv); } } @@ -4124,15 +5271,15 @@ fini (xlator_t *this) priv = this->private; if (priv) { this->private = NULL; - if (priv->xl_array) - GF_FREE (priv->xl_array); + GF_FREE (priv->xl_array); trav = priv->pattern; while (trav) { prev = trav; trav = trav->next; - FREE (prev); + GF_FREE (prev); } + GF_FREE (priv->last_event); LOCK_DESTROY (&priv->lock); GF_FREE (priv); } @@ -4143,17 +5290,50 @@ out: int32_t stripe_getxattr_unwind (call_frame_t *frame, - int op_ret, int op_errno, dict_t *dict) + int op_ret, int op_errno, dict_t *dict, dict_t *xdata) { - STRIPE_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict); + STRIPE_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, xdata); return 0; } +int +stripe_internal_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, + dict_t *xdata) +{ + + char size_key[256] = {0,}; + char index_key[256] = {0,}; + char count_key[256] = {0,}; + char coalesce_key[256] = {0,}; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (frame->local, out); + + if (!xattr || (op_ret == -1)) + goto out; + + sprintf (size_key, "trusted.%s.stripe-size", this->name); + sprintf (count_key, "trusted.%s.stripe-count", this->name); + sprintf (index_key, "trusted.%s.stripe-index", this->name); + sprintf (coalesce_key, "trusted.%s.stripe-coalesce", this->name); + + dict_del (xattr, size_key); + dict_del (xattr, count_key); + dict_del (xattr, index_key); + dict_del (xattr, coalesce_key); + +out: + STRIPE_STACK_UNWIND (getxattr, frame, op_ret, op_errno, xattr, xdata); + + return 0; + +} int stripe_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, dict_t *xattr) + int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) { int call_cnt = 0; stripe_local_t *local = NULL; @@ -4183,92 +5363,39 @@ stripe_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, out: if (!call_cnt) { 
STRIPE_STACK_UNWIND (getxattr, frame, local->op_ret, op_errno, - local->xattr); + local->xattr, xdata); } return 0; } int32_t -stripe_pathinfo_aggregate (char *buffer, stripe_local_t *local, int32_t *total) -{ - int32_t i = 0; - int32_t ret = -1; - int32_t len = 0; - char *sbuf = NULL; - stripe_xattr_sort_t *xattr = NULL; - - if (!buffer || !local || !local->xattr_list) - goto out; - - sbuf = buffer; - - for (i = 0; i < local->nallocs; i++) { - xattr = local->xattr_list + i; - len = xattr->pathinfo_len; - - if (len && xattr && xattr->pathinfo) { - memcpy (buffer, xattr->pathinfo, len); - buffer += len; - *buffer++ = ' '; - } - } - - *--buffer = '\0'; - if (total) - *total = buffer - sbuf; - ret = 0; - - out: - return ret; -} - -int32_t -stripe_free_pathinfo_str (stripe_local_t *local) +stripe_vgetxattr_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) { - int32_t i = 0; - int32_t ret = -1; - stripe_xattr_sort_t *xattr = NULL; - - if (!local || !local->xattr_list) - goto out; - - for (i = 0; i < local->nallocs; i++) { - xattr = local->xattr_list + i; - - if (xattr && xattr->pathinfo) - GF_FREE (xattr->pathinfo); - } - - ret = 0; - out: - return ret; -} - -int32_t -stripe_getxattr_pathinfo_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *dict) { stripe_local_t *local = NULL; int32_t callcnt = 0; int32_t ret = -1; long cky = 0; - char *pathinfo = NULL; - char *pathinfo_serz = NULL; - int32_t padding = 0; - int32_t tlen = 0; - char stripe_size_str[20] = {0,}; + void *xattr_val = NULL; + void *xattr_serz = NULL; stripe_xattr_sort_t *xattr = NULL; dict_t *stripe_xattr = NULL; if (!frame || !frame->local || !this) { - gf_log (this->name, GF_LOG_ERROR, "Possible NULL deref"); + gf_log ("", GF_LOG_ERROR, "Possible NULL deref"); return ret; } local = frame->local; cky = (long) cookie; + if (local->xsel[0] == '\0') { + gf_log (this->name, GF_LOG_ERROR, 
"Empty xattr in cbk"); + return ret; + } + LOCK (&frame->lock); { callcnt = --local->wind_count; @@ -4277,23 +5404,26 @@ stripe_getxattr_pathinfo_cbk (call_frame_t *frame, void *cookie, goto out; if (!local->xattr_list) - local->xattr_list = (stripe_xattr_sort_t *) GF_CALLOC (local->nallocs, - sizeof (stripe_xattr_sort_t), - gf_stripe_mt_xattr_sort_t); + local->xattr_list = (stripe_xattr_sort_t *) + GF_CALLOC (local->nallocs, + sizeof (stripe_xattr_sort_t), + gf_stripe_mt_xattr_sort_t); if (local->xattr_list) { - ret = dict_get_str (dict, GF_XATTR_PATHINFO_KEY, &pathinfo); - if (ret) - goto out; - xattr = local->xattr_list + (int32_t) cky; - pathinfo = gf_strdup (pathinfo); + ret = dict_get_ptr_and_len (dict, local->xsel, + &xattr_val, + &xattr->xattr_len); + if (xattr->xattr_len == 0) + goto out; + xattr->pos = cky; - xattr->pathinfo = pathinfo; - xattr->pathinfo_len = strlen (pathinfo); + xattr->xattr_value = gf_memdup (xattr_val, + xattr->xattr_len); - local->xattr_total_len += strlen (pathinfo) + 1; + if (xattr->xattr_value != NULL) + local->xattr_total_len += xattr->xattr_len + 1; } } out: @@ -4307,41 +5437,36 @@ stripe_getxattr_pathinfo_cbk (call_frame_t *frame, void *cookie, if (!stripe_xattr) goto unwind; - snprintf (stripe_size_str, 20, "%ld", local->stripe_size); - - /* extra bytes for decorations (brackets and <>'s) */ - padding = strlen (this->name) + strlen (STRIPE_PATHINFO_HEADER) - + strlen (stripe_size_str) + 7; - local->xattr_total_len += (padding + 2); - - pathinfo_serz = GF_CALLOC (local->xattr_total_len, sizeof (char), - gf_common_mt_char); - if (!pathinfo_serz) - goto unwind; - - /* xlator info */ - sprintf (pathinfo_serz, "(<"STRIPE_PATHINFO_HEADER"%s:[%s]> ", this->name, stripe_size_str); - - ret = stripe_pathinfo_aggregate (pathinfo_serz + padding, local, &tlen); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Cannot aggregate pathinfo list"); + /* select filler based on ->xsel */ + if (XATTR_IS_PATHINFO (local->xsel)) + ret = 
stripe_fill_pathinfo_xattr (this, local, + (char **)&xattr_serz); + else if (XATTR_IS_LOCKINFO (local->xsel)) { + ret = stripe_fill_lockinfo_xattr (this, local, + &xattr_serz); + } else { + gf_log (this->name, GF_LOG_WARNING, + "Unknown xattr in xattr request"); goto unwind; } - *(pathinfo_serz + padding + tlen) = ')'; - *(pathinfo_serz + padding + tlen + 1) = '\0'; - - ret = dict_set_dynstr (stripe_xattr, GF_XATTR_PATHINFO_KEY, pathinfo_serz); - if (ret) - gf_log (this->name, GF_LOG_ERROR, "Cannot set pathinfo key in dict"); + if (!ret) { + ret = dict_set_dynptr (stripe_xattr, local->xsel, + xattr_serz, + local->xattr_total_len); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "Can't set %s key in dict", + local->xsel); + } unwind: - STRIPE_STACK_UNWIND (getxattr, frame, op_ret, op_errno, stripe_xattr); + STRIPE_STACK_UNWIND (getxattr, frame, op_ret, op_errno, + stripe_xattr, NULL); - ret = stripe_free_pathinfo_str (local); + ret = stripe_free_xattr_str (local); - if (local->xattr_list) - GF_FREE (local->xattr_list); + GF_FREE (local->xattr_list); if (stripe_xattr) dict_unref (stripe_xattr); @@ -4352,14 +5477,15 @@ stripe_getxattr_pathinfo_cbk (call_frame_t *frame, void *cookie, int32_t stripe_getxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name) + loc_t *loc, const char *name, dict_t *xdata) { - stripe_local_t *local = NULL; - xlator_list_t *trav = NULL; - stripe_private_t *priv = NULL; - int32_t op_errno = EINVAL; - int i = 0; - xlator_t **sub_volumes; + stripe_local_t *local = NULL; + xlator_list_t *trav = NULL; + stripe_private_t *priv = NULL; + int32_t op_errno = EINVAL; + int i = 0; + xlator_t **sub_volumes; + int ret = 0; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); @@ -4371,8 +5497,7 @@ stripe_getxattr (call_frame_t *frame, xlator_t *this, trav = this->children; /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + local = mem_get0 (this->local_pool); if (!local) { 
op_errno = ENOMEM; goto err; @@ -4383,7 +5508,7 @@ stripe_getxattr (call_frame_t *frame, xlator_t *this, if (name && (strcmp (GF_XATTR_MARKER_KEY, name) == 0) - && (-1 == frame->root->pid)) { + && (GF_CLIENT_PID_GSYNCD == frame->root->pid)) { local->marker.call_count = priv->child_count; sub_volumes = alloca ( priv->child_count * @@ -4398,7 +5523,8 @@ stripe_getxattr (call_frame_t *frame, xlator_t *this, if (cluster_getmarkerattr (frame, this, loc, name, local, stripe_getxattr_unwind, sub_volumes, priv->child_count, - MARKER_UUID_TYPE, priv->vol_uuid)) { + MARKER_UUID_TYPE, marker_uuid_default_gauge, + priv->vol_uuid)) { op_errno = EINVAL; goto err; } @@ -4414,25 +5540,39 @@ stripe_getxattr (call_frame_t *frame, xlator_t *this, trav = trav->next) { STACK_WIND (frame, stripe_getxattr_cbk, trav->xlator, trav->xlator->fops->getxattr, - loc, name); + loc, name, xdata); } return 0; } - if (name && (strncmp (name, GF_XATTR_PATHINFO_KEY, - strlen (GF_XATTR_PATHINFO_KEY)) == 0)) { - local->stripe_size = stripe_get_matching_bs (loc->path, - priv->pattern, - priv->block_size); + if (name && + ((strncmp (name, GF_XATTR_PATHINFO_KEY, + strlen (GF_XATTR_PATHINFO_KEY)) == 0))) { + if (IA_ISREG (loc->inode->ia_type)) { + ret = inode_ctx_get (loc->inode, this, + (uint64_t *) &local->fctx); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "stripe size unavailable from fctx" + " relying on pathinfo could lead to" + " wrong results"); + } + local->nallocs = local->wind_count = priv->child_count; + (void) strncpy (local->xsel, name, strlen (name)); + /** + * for xattrs that need info from all childs, fill ->xsel + * as above and call the filler function in cbk based on + * it + */ for (i = 0, trav = this->children; i < priv->child_count; i++, trav = trav->next) { - STACK_WIND_COOKIE (frame, stripe_getxattr_pathinfo_cbk, + STACK_WIND_COOKIE (frame, stripe_vgetxattr_cbk, (void *) (long) i, trav->xlator, trav->xlator->fops->getxattr, - loc, name); + loc, name, xdata); } return 0; @@ 
-4440,46 +5580,128 @@ stripe_getxattr (call_frame_t *frame, xlator_t *this, if (name &&(*priv->vol_uuid)) { if ((match_uuid_local (name, priv->vol_uuid) == 0) - && (-1 == frame->root->pid)) { - local->marker.call_count = priv->child_count; + && (GF_CLIENT_PID_GSYNCD == frame->root->pid)) { + + if (!IA_FILE_OR_DIR (loc->inode->ia_type)) + local->marker.call_count = 1; + else + local->marker.call_count = priv->child_count; - sub_volumes = alloca ( priv->child_count * - sizeof (xlator_t *)); - for (i = 0, trav = this->children; trav ; - trav = trav->next, i++) { + sub_volumes = alloca (local->marker.call_count * + sizeof (xlator_t *)); + for (i = 0, trav = this->children; + i < local->marker.call_count; + i++, trav = trav->next) { *(sub_volumes + i) = trav->xlator; } if (cluster_getmarkerattr (frame, this, loc, name, - local, stripe_getxattr_unwind, + local, + stripe_getxattr_unwind, sub_volumes, - priv->child_count, + local->marker.call_count, MARKER_XTIME_TYPE, + marker_xtime_default_gauge, priv->vol_uuid)) { op_errno = EINVAL; goto err; } + return 0; } } - STACK_WIND (frame, default_getxattr_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->getxattr, loc, name); + STACK_WIND (frame, stripe_internal_getxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getxattr, loc, name, xdata); + + return 0; + +err: + STRIPE_STACK_UNWIND (getxattr, frame, -1, op_errno, NULL, NULL); + return 0; +} + +inline gf_boolean_t +stripe_is_special_xattr (const char *name) +{ + gf_boolean_t is_spl = _gf_false; + + if (!name) { + goto out; + } + + if (!strncmp (name, GF_XATTR_LOCKINFO_KEY, + strlen (GF_XATTR_LOCKINFO_KEY)) + || !strncmp (name, GF_XATTR_PATHINFO_KEY, + strlen (GF_XATTR_PATHINFO_KEY))) + is_spl = _gf_true; +out: + return is_spl; +} + +int32_t +stripe_fgetxattr_from_everyone (call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata) +{ + stripe_local_t *local = NULL; + stripe_private_t *priv = NULL; + int32_t ret = -1, op_errno = 0; + int i = 0; + 
xlator_list_t *trav = NULL; + + priv = this->private; + + local = mem_get0 (this->local_pool); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->op_ret = -1; + frame->local = local; + + strncpy (local->xsel, name, strlen (name)); + local->nallocs = local->wind_count = priv->child_count; + + for (i = 0, trav = this->children; i < priv->child_count; i++, + trav = trav->next) { + STACK_WIND_COOKIE (frame, stripe_vgetxattr_cbk, + (void *) (long) i, trav->xlator, + trav->xlator->fops->fgetxattr, + fd, name, xdata); + } return 0; err: - STRIPE_STACK_UNWIND (getxattr, frame, -1, op_errno, NULL); + STACK_UNWIND_STRICT (fgetxattr, frame, -1, op_errno, NULL, NULL); + return ret; +} + +int32_t +stripe_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata) +{ + if (stripe_is_special_xattr (name)) { + stripe_fgetxattr_from_everyone (frame, this, fd, name, xdata); + goto out; + } + + STACK_WIND (frame, stripe_internal_getxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fgetxattr, fd, name, xdata); + +out: return 0; } + + int32_t stripe_priv_dump (xlator_t *this) { - char key_prefix[GF_DUMP_MAX_BUF_LEN]; char key[GF_DUMP_MAX_BUF_LEN]; int i = 0; stripe_private_t *priv = NULL; @@ -4497,39 +5719,26 @@ stripe_priv_dump (xlator_t *this) goto out; gf_proc_dump_add_section("xlator.cluster.stripe.%s.priv", this->name); - gf_proc_dump_build_key(key_prefix,"xlator.cluster.stripe","%s.priv", - this->name); - gf_proc_dump_build_key(key, key_prefix, "child_count"); - gf_proc_dump_write(key,"%d", priv->child_count); + gf_proc_dump_write("child_count","%d", priv->child_count); for (i = 0; i < priv->child_count; i++) { - gf_proc_dump_build_key (key, key_prefix, "subvolumes[%d]", i); + sprintf (key, "subvolumes[%d]", i); gf_proc_dump_write (key, "%s.%s", priv->xl_array[i]->type, priv->xl_array[i]->name); } options = priv->pattern; while (options != NULL) { - gf_proc_dump_build_key (key, key_prefix, "path_pattern"); - gf_proc_dump_write 
(key, "%s", priv->pattern->path_pattern); - - gf_proc_dump_build_key (key, key_prefix, "options_block_size"); - gf_proc_dump_write (key, "%ul", options->block_size); + gf_proc_dump_write ("path_pattern", "%s", priv->pattern->path_pattern); + gf_proc_dump_write ("options_block_size", "%ul", options->block_size); options = options->next; } - gf_proc_dump_build_key (key, key_prefix, "block_size"); - gf_proc_dump_write (key, "%ul", priv->block_size); - - gf_proc_dump_build_key (key, key_prefix, "nodes_down"); - gf_proc_dump_write (key, "%d", priv->nodes_down); - - gf_proc_dump_build_key (key, key_prefix, "first_child_down"); - gf_proc_dump_write (key, "%d", priv->first_child_down); - - gf_proc_dump_build_key (key, key_prefix, "xatter_supported"); - gf_proc_dump_write (key, "%d", priv->xattr_supported); + gf_proc_dump_write ("block_size", "%ul", priv->block_size); + gf_proc_dump_write ("nodes-down", "%d", priv->nodes_down); + gf_proc_dump_write ("first-child_down", "%d", priv->first_child_down); + gf_proc_dump_write ("xattr_supported", "%d", priv->xattr_supported); UNLOCK (&priv->lock); @@ -4538,36 +5747,44 @@ out: } struct xlator_fops fops = { - .stat = stripe_stat, - .unlink = stripe_unlink, - .rename = stripe_rename, - .link = stripe_link, - .truncate = stripe_truncate, - .create = stripe_create, - .open = stripe_open, - .readv = stripe_readv, - .writev = stripe_writev, - .statfs = stripe_statfs, - .flush = stripe_flush, - .fsync = stripe_fsync, - .ftruncate = stripe_ftruncate, - .fstat = stripe_fstat, - .mkdir = stripe_mkdir, - .rmdir = stripe_rmdir, - .lk = stripe_lk, - .opendir = stripe_opendir, - .fsyncdir = stripe_fsyncdir, - .setattr = stripe_setattr, - .fsetattr = stripe_fsetattr, - .lookup = stripe_lookup, - .mknod = stripe_mknod, - - .getxattr = stripe_getxattr, - .readdirp = stripe_readdirp, + .stat = stripe_stat, + .unlink = stripe_unlink, + .rename = stripe_rename, + .link = stripe_link, + .truncate = stripe_truncate, + .create = stripe_create, + .open = 
stripe_open, + .readv = stripe_readv, + .writev = stripe_writev, + .statfs = stripe_statfs, + .flush = stripe_flush, + .fsync = stripe_fsync, + .ftruncate = stripe_ftruncate, + .fstat = stripe_fstat, + .mkdir = stripe_mkdir, + .rmdir = stripe_rmdir, + .lk = stripe_lk, + .opendir = stripe_opendir, + .fsyncdir = stripe_fsyncdir, + .setattr = stripe_setattr, + .fsetattr = stripe_fsetattr, + .lookup = stripe_lookup, + .mknod = stripe_mknod, + .setxattr = stripe_setxattr, + .fsetxattr = stripe_fsetxattr, + .getxattr = stripe_getxattr, + .fgetxattr = stripe_fgetxattr, + .removexattr = stripe_removexattr, + .fremovexattr = stripe_fremovexattr, + .readdirp = stripe_readdirp, + .fallocate = stripe_fallocate, + .discard = stripe_discard, + .zerofill = stripe_zerofill, }; struct xlator_cbks cbks = { .release = stripe_release, + .forget = stripe_forget, }; struct xlator_dumpops dumpops = { @@ -4576,8 +5793,9 @@ struct xlator_dumpops dumpops = { struct volume_options options[] = { { .key = {"block-size"}, - .type = GF_OPTION_TYPE_ANY, + .type = GF_OPTION_TYPE_SIZE_LIST, .default_value = "128KB", + .min = STRIPE_MIN_BLOCK_SIZE, .description = "Size of the stripe unit that would be read " "from or written to the striped servers." }, @@ -4585,5 +5803,12 @@ struct volume_options options[] = { .type = GF_OPTION_TYPE_BOOL, .default_value = "true" }, + { .key = {"coalesce"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "true", + .description = "Enable/Disable coalesce mode to flatten striped " + "files as stored on the server (i.e., eliminate holes " + "caused by the traditional format)." + }, { .key = {NULL} }, }; diff --git a/xlators/cluster/stripe/src/stripe.h b/xlators/cluster/stripe/src/stripe.h index 8d43a960e..5673d18f3 100644 --- a/xlators/cluster/stripe/src/stripe.h +++ b/xlators/cluster/stripe/src/stripe.h @@ -1,20 +1,11 @@ /* - Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. 
<http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ @@ -38,36 +29,53 @@ #include <signal.h> #define STRIPE_PATHINFO_HEADER "STRIPE:" - +#define STRIPE_MIN_BLOCK_SIZE (16*GF_UNIT_KB) #define STRIPE_STACK_UNWIND(fop, frame, params ...) 
do { \ stripe_local_t *__local = NULL; \ - if (frame) { \ - __local = frame->local; \ - frame->local = NULL; \ - } \ - STACK_UNWIND_STRICT (fop, frame, params); \ + if (frame) { \ + __local = frame->local; \ + frame->local = NULL; \ + } \ + STACK_UNWIND_STRICT (fop, frame, params); \ + if (__local) { \ + stripe_local_wipe(__local); \ + mem_put (__local); \ + } \ + } while (0) + +#define STRIPE_STACK_DESTROY(frame) do { \ + stripe_local_t *__local = NULL; \ + __local = frame->local; \ + frame->local = NULL; \ + STACK_DESTROY (frame->root); \ if (__local) { \ - stripe_local_wipe(__local); \ - GF_FREE (__local); \ + stripe_local_wipe (__local); \ + mem_put (__local); \ } \ } while (0) -#define STRIPE_STACK_DESTROY(frame) do { \ - stripe_local_t *__local = NULL; \ - __local = frame->local; \ - frame->local = NULL; \ - STACK_DESTROY (frame->root); \ - if (__local) { \ - stripe_local_wipe (__local); \ - GF_FREE (__local); \ - } \ - } while (0) +#define STRIPE_VALIDATE_FCTX(fctx, label) do { \ + int idx = 0; \ + if (!fctx) { \ + op_errno = EINVAL; \ + goto label; \ + } \ + for (idx = 0; idx < fctx->stripe_count; idx++) { \ + if (!fctx->xl_array[idx]) { \ + gf_log (this->name, GF_LOG_ERROR, \ + "fctx->xl_array[%d] is NULL", \ + idx); \ + op_errno = ESTALE; \ + goto label; \ + } \ + } \ + } while (0) typedef struct stripe_xattr_sort { - int32_t pos; - int32_t pathinfo_len; - char *pathinfo; + int pos; + int xattr_len; + char *xattr_value; } stripe_xattr_sort_t; /** @@ -90,16 +98,17 @@ struct stripe_private { gf_lock_t lock; uint8_t nodes_down; int8_t first_child_down; + int *last_event; int8_t child_count; - int8_t *state; /* Current state of child node */ gf_boolean_t xattr_supported; /* default yes */ + gf_boolean_t coalesce; char vol_uuid[UUID_SIZE + 1]; }; /** - * Used to keep info about the replies received from fops->readv calls + * Used to keep info about the replies received from readv/writev calls */ -struct readv_replies { +struct stripe_replies { struct iovec 
*vector; int32_t count; //count of vector int32_t op_ret; //op_ret of readv @@ -111,6 +120,7 @@ struct readv_replies { typedef struct _stripe_fd_ctx { off_t stripe_size; int stripe_count; + int stripe_coalesce; int static_array; xlator_t **xl_array; } stripe_fd_ctx_t; @@ -146,7 +156,7 @@ struct stripe_local { blkcnt_t preparent_blocks; blkcnt_t postparent_blocks; - struct readv_replies *replies; + struct stripe_replies *replies; struct statvfs statvfs_buf; dir_entry_t *entry; @@ -170,12 +180,15 @@ struct stripe_local { loc_t loc; loc_t loc2; + mode_t mode; + dev_t rdev; /* For File I/O fops */ - dict_t *dict; + dict_t *xdata; stripe_xattr_sort_t *xattr_list; int32_t xattr_total_len; int32_t nallocs; + char xsel[256]; struct marker_str marker; @@ -192,12 +205,84 @@ struct stripe_local { void *value; struct iobref *iobref; gf_dirent_t entries; + gf_dirent_t *dirent; dict_t *xattr; uuid_t ia_gfid; + + int xflag; + mode_t umask; }; typedef struct stripe_local stripe_local_t; typedef struct stripe_private stripe_private_t; +/* + * Determine the stripe index of a particular frame based on the translator. 
+ */ +static inline int32_t stripe_get_frame_index(stripe_fd_ctx_t *fctx, + call_frame_t *prev) +{ + int32_t i, idx = -1; + + for (i = 0; i < fctx->stripe_count; i++) { + if (fctx->xl_array[i] == prev->this) { + idx = i; + break; + } + } + + return idx; +} + +static inline void stripe_copy_xl_array(xlator_t **dst, xlator_t **src, + int count) +{ + int i; + + for (i = 0; i < count; i++) + dst[i] = src[i]; +} + +void stripe_local_wipe (stripe_local_t *local); +int32_t stripe_ctx_handle (xlator_t *this, call_frame_t *prev, + stripe_local_t *local, dict_t *dict); +void stripe_aggregate_xattr (dict_t *dst, dict_t *src); +int32_t stripe_xattr_request_build (xlator_t *this, dict_t *dict, + uint64_t stripe_size, uint32_t stripe_count, + uint32_t stripe_index, + uint32_t stripe_coalesce); +int32_t stripe_get_matching_bs (const char *path, stripe_private_t *priv); +int set_stripe_block_size (xlator_t *this, stripe_private_t *priv, char *data); +int32_t stripe_iatt_merge (struct iatt *from, struct iatt *to); +int32_t stripe_fill_pathinfo_xattr (xlator_t *this, stripe_local_t *local, + char **xattr_serz); +int32_t stripe_free_xattr_str (stripe_local_t *local); +int32_t stripe_xattr_aggregate (char *buffer, stripe_local_t *local, + int32_t *total); +off_t coalesced_offset(off_t offset, uint64_t stripe_size, int stripe_count); +off_t uncoalesced_size(off_t size, uint64_t stripe_size, int stripe_count, + int stripe_index); +int32_t +stripe_fill_lockinfo_xattr (xlator_t *this, stripe_local_t *local, + void **xattr_serz); + +/* + * Adjust the size attribute for files if coalesce is enabled. 
+ */ +static inline void correct_file_size(struct iatt *buf, stripe_fd_ctx_t *fctx, + call_frame_t *prev) +{ + int index; + + if (!IA_ISREG(buf->ia_type)) + return; + + if (!fctx || !fctx->stripe_coalesce) + return; + + index = stripe_get_frame_index(fctx, prev); + buf->ia_size = uncoalesced_size(buf->ia_size, fctx->stripe_size, + fctx->stripe_count, index); +} #endif /* _STRIPE_H_ */ diff --git a/xlators/cluster/unify/Makefile.am b/xlators/cluster/unify/Makefile.am deleted file mode 100644 index d471a3f92..000000000 --- a/xlators/cluster/unify/Makefile.am +++ /dev/null @@ -1,3 +0,0 @@ -SUBDIRS = src - -CLEANFILES = diff --git a/xlators/cluster/unify/src/Makefile.am b/xlators/cluster/unify/src/Makefile.am deleted file mode 100644 index 2a1fe8372..000000000 --- a/xlators/cluster/unify/src/Makefile.am +++ /dev/null @@ -1,16 +0,0 @@ - -xlator_LTLIBRARIES = unify.la -xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/legacy/cluster - -unify_la_LDFLAGS = -module -avoidversion - -unify_la_SOURCES = unify.c unify-self-heal.c -unify_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la - -noinst_HEADERS = unify.h - -AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ - -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) - -CLEANFILES = - diff --git a/xlators/cluster/unify/src/unify-mem-types.h b/xlators/cluster/unify/src/unify-mem-types.h deleted file mode 100644 index 13c9cc1f7..000000000 --- a/xlators/cluster/unify/src/unify-mem-types.h +++ /dev/null @@ -1,41 +0,0 @@ - -/* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. 
- - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ - - -#ifndef __UNIFY_MEM_TYPES_H__ -#define __UNIFY_MEM_TYPES_H__ - -#include "mem-types.h" - -enum gf_unify_mem_types_ { - gf_unify_mt_char = gf_common_mt_end + 1, - gf_unify_mt_int16_t, - gf_unify_mt_xlator_t, - gf_unify_mt_unify_private_t, - gf_unify_mt_xlator_list_t, - gf_unify_mt_dir_entry_t, - gf_unify_mt_off_t, - gf_unify_mt_int, - gf_unify_mt_unify_self_heal_struct, - gf_unify_mt_unify_local_t, - gf_unify_mt_end -}; -#endif - diff --git a/xlators/cluster/unify/src/unify-self-heal.c b/xlators/cluster/unify/src/unify-self-heal.c deleted file mode 100644 index f99e4c7c3..000000000 --- a/xlators/cluster/unify/src/unify-self-heal.c +++ /dev/null @@ -1,1239 +0,0 @@ -/* - Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. 
-*/ - -/** - * unify-self-heal.c : - * This file implements few functions which enables 'unify' translator - * to be consistent in its behaviour when - * > a node fails, - * > a node gets added, - * > a failed node comes back - * > a new namespace server is added (ie, an fresh namespace server). - * - * This functionality of 'unify' will enable glusterfs to support storage - * system failure, and maintain consistancy. This works both ways, ie, when - * an entry (either file or directory) is found on namespace server, and not - * on storage nodes, its created in storage nodes and vica-versa. - * - * The two fops, where it can be implemented are 'getdents ()' and 'lookup ()' - * - */ - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "unify.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "common-utils.h" - -int32_t -unify_sh_getdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dir_entry_t *entry, - int32_t count); - -int32_t -unify_sh_ns_getdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dir_entry_t *entry, - int32_t count); - -int32_t -unify_bgsh_getdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dir_entry_t *entry, - int32_t count); - -int32_t -unify_bgsh_ns_getdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dir_entry_t *entry, - int32_t count); - -/** - * unify_local_wipe - free all the extra allocation of local->* here. 
- */ -static void -unify_local_wipe (unify_local_t *local) -{ - /* Free the strdup'd variables in the local structure */ - if (local->name) { - GF_FREE (local->name); - } - - if (local->sh_struct) { - if (local->sh_struct->offset_list) - GF_FREE (local->sh_struct->offset_list); - - if (local->sh_struct->entry_list) - GF_FREE (local->sh_struct->entry_list); - - if (local->sh_struct->count_list) - GF_FREE (local->sh_struct->count_list); - - GF_FREE (local->sh_struct); - } - - loc_wipe (&local->loc1); - loc_wipe (&local->loc2); -} - -int32_t -unify_sh_setdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - int32_t callcnt = -1; - unify_local_t *local = frame->local; - inode_t *inode = NULL; - dict_t *tmp_dict = NULL; - dir_entry_t *prev, *entry, *trav; - - LOCK (&frame->lock); - { - /* if local->call_count == 0, that means, setdents on - * storagenodes is still pending. - */ - if (local->call_count) - callcnt = --local->call_count; - } - UNLOCK (&frame->lock); - - if (callcnt == 0) { - if (local->sh_struct->entry_list[0]) { - prev = entry = local->sh_struct->entry_list[0]; - if (!entry) - return 0; - trav = entry->next; - while (trav) { - prev->next = trav->next; - GF_FREE (trav->name); - if (IA_ISLNK (trav->buf.ia_type)) - GF_FREE (trav->link); - GF_FREE (trav); - trav = prev->next; - } - GF_FREE (entry); - } - - if (!local->flags) { - if (local->sh_struct->count_list[0] >= - UNIFY_SELF_HEAL_GETDENTS_COUNT) { - /* count == size, that means, there are more entries - to read from */ - //local->call_count = 0; - local->sh_struct->offset_list[0] += - UNIFY_SELF_HEAL_GETDENTS_COUNT; - STACK_WIND (frame, - unify_sh_ns_getdents_cbk, - NS(this), - NS(this)->fops->getdents, - local->fd, - UNIFY_SELF_HEAL_GETDENTS_COUNT, - local->sh_struct->offset_list[0], - GF_GET_DIR_ONLY); - } - } else { - inode = local->loc1.inode; - fd_unref (local->fd); - tmp_dict = local->dict; - - unify_local_wipe (local); - - STACK_UNWIND (frame, 
local->op_ret, local->op_errno, - inode, &local->stbuf, local->dict, - &local->oldpostparent); - if (tmp_dict) - dict_unref (tmp_dict); - } - } - - return 0; -} - - -int32_t -unify_sh_ns_getdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dir_entry_t *entry, - int32_t count) -{ - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - long index = 0; - unsigned long final = 0; - dir_entry_t *tmp = GF_CALLOC (1, sizeof (dir_entry_t), - gf_unify_mt_dir_entry_t); - - local->sh_struct->entry_list[0] = tmp; - local->sh_struct->count_list[0] = count; - if (entry) { - tmp->next = entry->next; - entry->next = NULL; - } - - if ((count < UNIFY_SELF_HEAL_GETDENTS_COUNT) || !entry) { - final = 1; - } - - LOCK (&frame->lock); - { - /* local->call_count will be '0' till now. make it 1 so, it - can be UNWIND'ed for the last call. */ - local->call_count = priv->child_count; - if (final) - local->flags = 1; - } - UNLOCK (&frame->lock); - - for (index = 0; index < priv->child_count; index++) - { - STACK_WIND_COOKIE (frame, - unify_sh_setdents_cbk, - (void *)index, - priv->xl_array[index], - priv->xl_array[index]->fops->setdents, - local->fd, GF_SET_DIR_ONLY, - local->sh_struct->entry_list[0], count); - } - - return 0; -} - -int32_t -unify_sh_ns_setdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - int32_t callcnt = -1; - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - long index = (long)cookie; - dir_entry_t *prev, *entry, *trav; - - LOCK (&frame->lock); - { - if (local->sh_struct->entry_list[index]) { - prev = entry = local->sh_struct->entry_list[index]; - trav = entry->next; - while (trav) { - prev->next = trav->next; - GF_FREE (trav->name); - if (IA_ISLNK (trav->buf.ia_type)) - GF_FREE (trav->link); - GF_FREE (trav); - trav = prev->next; - } - GF_FREE (entry); - } - } - UNLOCK (&frame->lock); - - if 
(local->sh_struct->count_list[index] < - UNIFY_SELF_HEAL_GETDENTS_COUNT) { - LOCK (&frame->lock); - { - callcnt = --local->call_count; - } - UNLOCK (&frame->lock); - } else { - /* count == size, that means, there are more entries - to read from */ - local->sh_struct->offset_list[index] += - UNIFY_SELF_HEAL_GETDENTS_COUNT; - STACK_WIND_COOKIE (frame, - unify_sh_getdents_cbk, - cookie, - priv->xl_array[index], - priv->xl_array[index]->fops->getdents, - local->fd, - UNIFY_SELF_HEAL_GETDENTS_COUNT, - local->sh_struct->offset_list[index], - GF_GET_ALL); - - gf_log (this->name, GF_LOG_DEBUG, - "readdir on (%s) with offset %"PRId64"", - priv->xl_array[index]->name, - local->sh_struct->offset_list[index]); - } - - if (!callcnt) { - /* All storage nodes have done unified setdents on NS node. - * Now, do getdents from NS and do setdents on storage nodes. - */ - - /* sh_struct->offset_list is no longer required for - storage nodes now */ - local->sh_struct->offset_list[0] = 0; /* reset */ - - STACK_WIND (frame, - unify_sh_ns_getdents_cbk, - NS(this), - NS(this)->fops->getdents, - local->fd, - UNIFY_SELF_HEAL_GETDENTS_COUNT, - 0, /* In this call, do send '0' as offset */ - GF_GET_DIR_ONLY); - } - - return 0; -} - - -/** - * unify_sh_getdents_cbk - - */ -int32_t -unify_sh_getdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dir_entry_t *entry, - int32_t count) -{ - int32_t callcnt = -1; - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - long index = (long)cookie; - dir_entry_t *tmp = NULL; - - if (op_ret >= 0 && count > 0) { - /* There is some dentry found, just send the dentry to NS */ - tmp = GF_CALLOC (1, sizeof (dir_entry_t), - gf_unify_mt_dir_entry_t); - local->sh_struct->entry_list[index] = tmp; - local->sh_struct->count_list[index] = count; - if (entry) { - tmp->next = entry->next; - entry->next = NULL; - } - STACK_WIND_COOKIE (frame, - unify_sh_ns_setdents_cbk, - cookie, - NS(this), - 
NS(this)->fops->setdents, - local->fd, - GF_SET_IF_NOT_PRESENT, - local->sh_struct->entry_list[index], - count); - return 0; - } - - if (count < UNIFY_SELF_HEAL_GETDENTS_COUNT) { - LOCK (&frame->lock); - { - callcnt = --local->call_count; - } - UNLOCK (&frame->lock); - } else { - /* count == size, that means, there are more entries - to read from */ - local->sh_struct->offset_list[index] += - UNIFY_SELF_HEAL_GETDENTS_COUNT; - STACK_WIND_COOKIE (frame, - unify_sh_getdents_cbk, - cookie, - priv->xl_array[index], - priv->xl_array[index]->fops->getdents, - local->fd, - UNIFY_SELF_HEAL_GETDENTS_COUNT, - local->sh_struct->offset_list[index], - GF_GET_ALL); - - gf_log (this->name, GF_LOG_DEBUG, - "readdir on (%s) with offset %"PRId64"", - priv->xl_array[index]->name, - local->sh_struct->offset_list[index]); - } - - if (!callcnt) { - /* All storage nodes have done unified setdents on NS node. - * Now, do getdents from NS and do setdents on storage nodes. - */ - - /* sh_struct->offset_list is no longer required for - storage nodes now */ - local->sh_struct->offset_list[0] = 0; /* reset */ - - STACK_WIND (frame, - unify_sh_ns_getdents_cbk, - NS(this), - NS(this)->fops->getdents, - local->fd, - UNIFY_SELF_HEAL_GETDENTS_COUNT, - 0, /* In this call, do send '0' as offset */ - GF_GET_DIR_ONLY); - } - - return 0; -} - -/** - * unify_sh_opendir_cbk - - * - * @cookie: - */ -int32_t -unify_sh_opendir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - fd_t *fd) -{ - int32_t callcnt = 0; - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - int16_t index = 0; - inode_t *inode = NULL; - dict_t *tmp_dict = NULL; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret >= 0) { - local->op_ret = op_ret; - } else { - gf_log (this->name, GF_LOG_WARNING, "failed"); - local->failed = 1; - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - local->call_count = priv->child_count + 1; - - if 
(!local->failed) { - /* send getdents() namespace after finishing - storage nodes */ - local->call_count--; - - fd_bind (fd); - - if (local->call_count) { - /* Used as the offset index. This list keeps - * track of offset sent to each node during - * STACK_WIND. - */ - local->sh_struct->offset_list = - GF_CALLOC (priv->child_count, - sizeof (off_t), - gf_unify_mt_off_t); - ERR_ABORT (local->sh_struct->offset_list); - - local->sh_struct->entry_list = - GF_CALLOC (priv->child_count, - sizeof (dir_entry_t *), - gf_unify_mt_dir_entry_t); - ERR_ABORT (local->sh_struct->entry_list); - - local->sh_struct->count_list = - GF_CALLOC (priv->child_count, - sizeof (int), - gf_unify_mt_int); - ERR_ABORT (local->sh_struct->count_list); - - /* Send getdents on all the fds */ - for (index = 0; - index < priv->child_count; index++) { - STACK_WIND_COOKIE (frame, - unify_sh_getdents_cbk, - (void *)(long)index, - priv->xl_array[index], - priv->xl_array[index]->fops->getdents, - local->fd, - UNIFY_SELF_HEAL_GETDENTS_COUNT, - 0, /* In this call, do send '0' as offset */ - GF_GET_ALL); - } - - /* did stack wind, so no need to unwind here */ - return 0; - } /* (local->call_count) */ - } /* (!local->failed) */ - - /* Opendir failed on one node. */ - inode = local->loc1.inode; - fd_unref (local->fd); - tmp_dict = local->dict; - - unify_local_wipe (local); - /* Only 'self-heal' failed, lookup() was successful. */ - local->op_ret = 0; - - /* This is lookup_cbk ()'s UNWIND. */ - STACK_UNWIND (frame, local->op_ret, local->op_errno, inode, - &local->stbuf, local->dict, &local->oldpostparent); - if (tmp_dict) - dict_unref (tmp_dict); - } - - return 0; -} - -/** - * gf_sh_checksum_cbk - - * - * @frame: frame used in lookup. get a copy of it, and use that copy. - * @this: pointer to unify xlator. - * @inode: pointer to inode, for which the consistency check is required. 
- * - */ -int32_t -unify_sh_checksum_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - uint8_t *file_checksum, - uint8_t *dir_checksum) -{ - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - int16_t index = 0; - int32_t callcnt = 0; - inode_t *inode = NULL; - dict_t *tmp_dict = NULL; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - if (op_ret >= 0) { - if (NS(this) == (xlator_t *)cookie) { - memcpy (local->sh_struct->ns_file_checksum, - file_checksum, NAME_MAX); - memcpy (local->sh_struct->ns_dir_checksum, - dir_checksum, NAME_MAX); - } else { - if (local->entry_count == 0) { - /* Initialize the dir_checksum to be - * used for comparision with other - * storage nodes. Should be done for - * the first successful call *only*. - */ - /* Using 'entry_count' as a flag */ - local->entry_count = 1; - memcpy (local->sh_struct->dir_checksum, - dir_checksum, NAME_MAX); - } - - /* Reply from the storage nodes */ - for (index = 0; - index < NAME_MAX; index++) { - /* Files should be present in - only one node */ - local->sh_struct->file_checksum[index] ^= file_checksum[index]; - - /* directory structure should be - same accross */ - if (local->sh_struct->dir_checksum[index] != dir_checksum[index]) - local->failed = 1; - } - } - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - for (index = 0; index < NAME_MAX ; index++) { - if (local->sh_struct->file_checksum[index] != - local->sh_struct->ns_file_checksum[index]) { - local->failed = 1; - break; - } - if (local->sh_struct->dir_checksum[index] != - local->sh_struct->ns_dir_checksum[index]) { - local->failed = 1; - break; - } - } - - if (local->failed) { - /* Log it, it should be a rare event */ - gf_log (this->name, GF_LOG_WARNING, - "Self-heal triggered on directory %s", - local->loc1.path); - - /* Any self heal will be done at directory level */ - local->call_count = 0; - local->op_ret = -1; - local->failed = 0; - - local->fd = 
fd_create (local->loc1.inode, - frame->root->pid); - - local->call_count = priv->child_count + 1; - - for (index = 0; - index < (priv->child_count + 1); index++) { - STACK_WIND_COOKIE (frame, - unify_sh_opendir_cbk, - priv->xl_array[index]->name, - priv->xl_array[index], - priv->xl_array[index]->fops->opendir, - &local->loc1, - local->fd); - } - /* opendir can be done on the directory */ - return 0; - } - - /* no mismatch */ - inode = local->loc1.inode; - tmp_dict = local->dict; - - unify_local_wipe (local); - - /* This is lookup_cbk ()'s UNWIND. */ - STACK_UNWIND (frame, - local->op_ret, - local->op_errno, - inode, - &local->stbuf, - local->dict, &local->oldpostparent); - if (tmp_dict) - dict_unref (tmp_dict); - } - - return 0; -} - -/* Foreground self-heal part over */ - -/* Background self-heal part */ - -int32_t -unify_bgsh_setdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - int32_t callcnt = -1; - unify_local_t *local = frame->local; - dir_entry_t *prev, *entry, *trav; - - LOCK (&frame->lock); - { - /* if local->call_count == 0, that means, setdents - on storagenodes is still pending. 
*/ - if (local->call_count) - callcnt = --local->call_count; - } - UNLOCK (&frame->lock); - - - if (callcnt == 0) { - if (local->sh_struct->entry_list[0]) { - prev = entry = local->sh_struct->entry_list[0]; - trav = entry->next; - while (trav) { - prev->next = trav->next; - GF_FREE (trav->name); - if (IA_ISLNK (trav->buf.ia_type)) - GF_FREE (trav->link); - GF_FREE (trav); - trav = prev->next; - } - GF_FREE (entry); - } - - if (!local->flags) { - if (local->sh_struct->count_list[0] >= - UNIFY_SELF_HEAL_GETDENTS_COUNT) { - /* count == size, that means, there are more - entries to read from */ - //local->call_count = 0; - local->sh_struct->offset_list[0] += - UNIFY_SELF_HEAL_GETDENTS_COUNT; - STACK_WIND (frame, - unify_bgsh_ns_getdents_cbk, - NS(this), - NS(this)->fops->getdents, - local->fd, - UNIFY_SELF_HEAL_GETDENTS_COUNT, - local->sh_struct->offset_list[0], - GF_GET_DIR_ONLY); - } - } else { - fd_unref (local->fd); - unify_local_wipe (local); - STACK_DESTROY (frame->root); - } - } - - return 0; -} - - -int32_t -unify_bgsh_ns_getdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dir_entry_t *entry, - int32_t count) -{ - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - long index = 0; - unsigned long final = 0; - dir_entry_t *tmp = GF_CALLOC (1, sizeof (dir_entry_t), - gf_unify_mt_dir_entry_t); - - local->sh_struct->entry_list[0] = tmp; - local->sh_struct->count_list[0] = count; - if (entry) { - tmp->next = entry->next; - entry->next = NULL; - } - - if ((count < UNIFY_SELF_HEAL_GETDENTS_COUNT) || !entry) { - final = 1; - } - - LOCK (&frame->lock); - { - /* local->call_count will be '0' till now. make it 1 so, - it can be UNWIND'ed for the last call. 
*/ - local->call_count = priv->child_count; - if (final) - local->flags = 1; - } - UNLOCK (&frame->lock); - - for (index = 0; index < priv->child_count; index++) - { - STACK_WIND_COOKIE (frame, - unify_bgsh_setdents_cbk, - (void *)index, - priv->xl_array[index], - priv->xl_array[index]->fops->setdents, - local->fd, GF_SET_DIR_ONLY, - local->sh_struct->entry_list[0], count); - } - - return 0; -} - -int32_t -unify_bgsh_ns_setdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - int32_t callcnt = -1; - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - long index = (long)cookie; - dir_entry_t *prev, *entry, *trav; - - if (local->sh_struct->entry_list[index]) { - prev = entry = local->sh_struct->entry_list[index]; - if (!entry) - return 0; - trav = entry->next; - while (trav) { - prev->next = trav->next; - GF_FREE (trav->name); - if (IA_ISLNK (trav->buf.ia_type)) - GF_FREE (trav->link); - GF_FREE (trav); - trav = prev->next; - } - GF_FREE (entry); - } - - if (local->sh_struct->count_list[index] < - UNIFY_SELF_HEAL_GETDENTS_COUNT) { - LOCK (&frame->lock); - { - callcnt = --local->call_count; - } - UNLOCK (&frame->lock); - } else { - /* count == size, that means, there are more entries - to read from */ - local->sh_struct->offset_list[index] += - UNIFY_SELF_HEAL_GETDENTS_COUNT; - STACK_WIND_COOKIE (frame, - unify_bgsh_getdents_cbk, - cookie, - priv->xl_array[index], - priv->xl_array[index]->fops->getdents, - local->fd, - UNIFY_SELF_HEAL_GETDENTS_COUNT, - local->sh_struct->offset_list[index], - GF_GET_ALL); - - gf_log (this->name, GF_LOG_DEBUG, - "readdir on (%s) with offset %"PRId64"", - priv->xl_array[index]->name, - local->sh_struct->offset_list[index]); - } - - if (!callcnt) { - /* All storage nodes have done unified setdents on NS node. - * Now, do getdents from NS and do setdents on storage nodes. 
- */ - - /* sh_struct->offset_list is no longer required for - storage nodes now */ - local->sh_struct->offset_list[0] = 0; /* reset */ - - STACK_WIND (frame, - unify_bgsh_ns_getdents_cbk, - NS(this), - NS(this)->fops->getdents, - local->fd, - UNIFY_SELF_HEAL_GETDENTS_COUNT, - 0, /* In this call, do send '0' as offset */ - GF_GET_DIR_ONLY); - } - - return 0; -} - - -/** - * unify_bgsh_getdents_cbk - - */ -int32_t -unify_bgsh_getdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dir_entry_t *entry, - int32_t count) -{ - int32_t callcnt = -1; - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - long index = (long)cookie; - dir_entry_t *tmp = NULL; - - if (op_ret >= 0 && count > 0) { - /* There is some dentry found, just send the dentry to NS */ - tmp = GF_CALLOC (1, sizeof (dir_entry_t), - gf_unify_mt_dir_entry_t); - local->sh_struct->entry_list[index] = tmp; - local->sh_struct->count_list[index] = count; - if (entry) { - tmp->next = entry->next; - entry->next = NULL; - } - STACK_WIND_COOKIE (frame, - unify_bgsh_ns_setdents_cbk, - cookie, - NS(this), - NS(this)->fops->setdents, - local->fd, - GF_SET_IF_NOT_PRESENT, - local->sh_struct->entry_list[index], - count); - return 0; - } - - if (count < UNIFY_SELF_HEAL_GETDENTS_COUNT) { - LOCK (&frame->lock); - { - callcnt = --local->call_count; - } - UNLOCK (&frame->lock); - } else { - /* count == size, that means, there are more entries to read from */ - local->sh_struct->offset_list[index] += - UNIFY_SELF_HEAL_GETDENTS_COUNT; - - STACK_WIND_COOKIE (frame, - unify_bgsh_getdents_cbk, - cookie, - priv->xl_array[index], - priv->xl_array[index]->fops->getdents, - local->fd, - UNIFY_SELF_HEAL_GETDENTS_COUNT, - local->sh_struct->offset_list[index], - GF_GET_ALL); - - gf_log (this->name, GF_LOG_DEBUG, - "readdir on (%s) with offset %"PRId64"", - priv->xl_array[index]->name, - local->sh_struct->offset_list[index]); - } - - if (!callcnt) { - /* All 
storage nodes have done unified setdents on NS node. - * Now, do getdents from NS and do setdents on storage nodes. - */ - - /* sh_struct->offset_list is no longer required for - storage nodes now */ - local->sh_struct->offset_list[0] = 0; /* reset */ - - STACK_WIND (frame, - unify_bgsh_ns_getdents_cbk, - NS(this), - NS(this)->fops->getdents, - local->fd, - UNIFY_SELF_HEAL_GETDENTS_COUNT, - 0, /* In this call, do send '0' as offset */ - GF_GET_DIR_ONLY); - } - - return 0; -} - -/** - * unify_bgsh_opendir_cbk - - * - * @cookie: - */ -int32_t -unify_bgsh_opendir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - fd_t *fd) -{ - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - int32_t callcnt = 0; - int16_t index = 0; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret >= 0) { - local->op_ret = op_ret; - } else { - local->failed = 1; - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - local->call_count = priv->child_count + 1; - - if (!local->failed) { - /* send getdents() namespace after finishing - storage nodes */ - local->call_count--; - callcnt = local->call_count; - - fd_bind (fd); - - if (local->call_count) { - /* Used as the offset index. This list keeps - track of offset sent to each node during - STACK_WIND. 
*/ - local->sh_struct->offset_list = - GF_CALLOC (priv->child_count, - sizeof (off_t), - gf_unify_mt_off_t); - ERR_ABORT (local->sh_struct->offset_list); - - local->sh_struct->entry_list = - GF_CALLOC (priv->child_count, - sizeof (dir_entry_t *), - gf_unify_mt_dir_entry_t); - ERR_ABORT (local->sh_struct->entry_list); - - local->sh_struct->count_list = - GF_CALLOC (priv->child_count, - sizeof (int), - gf_unify_mt_int); - ERR_ABORT (local->sh_struct->count_list); - - /* Send getdents on all the fds */ - for (index = 0; - index < priv->child_count; index++) { - STACK_WIND_COOKIE (frame, - unify_bgsh_getdents_cbk, - (void *)(long)index, - priv->xl_array[index], - priv->xl_array[index]->fops->getdents, - local->fd, - UNIFY_SELF_HEAL_GETDENTS_COUNT, - 0, /* In this call, do send '0' as offset */ - GF_GET_ALL); - } - /* did a stack wind, so no need to unwind here */ - return 0; - } /* (local->call_count) */ - } /* (!local->failed) */ - - /* Opendir failed on one node. */ - fd_unref (local->fd); - - unify_local_wipe (local); - STACK_DESTROY (frame->root); - } - - return 0; -} - -/** - * gf_bgsh_checksum_cbk - - * - * @frame: frame used in lookup. get a copy of it, and use that copy. - * @this: pointer to unify xlator. - * @inode: pointer to inode, for which the consistency check is required. 
- * - */ -int32_t -unify_bgsh_checksum_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - uint8_t *file_checksum, - uint8_t *dir_checksum) -{ - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - int16_t index = 0; - int32_t callcnt = 0; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - if (op_ret >= 0) { - if (NS(this) == (xlator_t *)cookie) { - memcpy (local->sh_struct->ns_file_checksum, - file_checksum, NAME_MAX); - memcpy (local->sh_struct->ns_dir_checksum, - dir_checksum, NAME_MAX); - } else { - if (local->entry_count == 0) { - /* Initialize the dir_checksum to be - * used for comparision with other - * storage nodes. Should be done for - * the first successful call *only*. - */ - /* Using 'entry_count' as a flag */ - local->entry_count = 1; - memcpy (local->sh_struct->dir_checksum, - dir_checksum, NAME_MAX); - } - - /* Reply from the storage nodes */ - for (index = 0; - index < NAME_MAX; index++) { - /* Files should be present in only - one node */ - local->sh_struct->file_checksum[index] ^= file_checksum[index]; - - /* directory structure should be same - accross */ - if (local->sh_struct->dir_checksum[index] != dir_checksum[index]) - local->failed = 1; - } - } - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - for (index = 0; index < NAME_MAX ; index++) { - if (local->sh_struct->file_checksum[index] != - local->sh_struct->ns_file_checksum[index]) { - local->failed = 1; - break; - } - if (local->sh_struct->dir_checksum[index] != - local->sh_struct->ns_dir_checksum[index]) { - local->failed = 1; - break; - } - } - - if (local->failed) { - /* Log it, it should be a rare event */ - gf_log (this->name, GF_LOG_WARNING, - "Self-heal triggered on directory %s", - local->loc1.path); - - /* Any self heal will be done at the directory level */ - local->op_ret = -1; - local->failed = 0; - - local->fd = fd_create (local->loc1.inode, - frame->root->pid); - local->call_count = 
priv->child_count + 1; - - for (index = 0; - index < (priv->child_count + 1); index++) { - STACK_WIND_COOKIE (frame, - unify_bgsh_opendir_cbk, - priv->xl_array[index]->name, - priv->xl_array[index], - priv->xl_array[index]->fops->opendir, - &local->loc1, - local->fd); - } - - /* opendir can be done on the directory */ - return 0; - } - - /* no mismatch */ - unify_local_wipe (local); - STACK_DESTROY (frame->root); - } - - return 0; -} - -/* Background self-heal part over */ - - - - -/** - * zr_unify_self_heal - - * - * @frame: frame used in lookup. get a copy of it, and use that copy. - * @this: pointer to unify xlator. - * @inode: pointer to inode, for which the consistency check is required. - * - */ -int32_t -zr_unify_self_heal (call_frame_t *frame, - xlator_t *this, - unify_local_t *local) -{ - unify_private_t *priv = this->private; - call_frame_t *bg_frame = NULL; - unify_local_t *bg_local = NULL; - inode_t *tmp_inode = NULL; - dict_t *tmp_dict = NULL; - int16_t index = 0; - - if (local->inode_generation < priv->inode_generation) { - /* Any self heal will be done at the directory level */ - /* Update the inode's generation to the current generation - value. 
*/ - local->inode_generation = priv->inode_generation; - inode_ctx_put (local->loc1.inode, this, - (uint64_t)(long)local->inode_generation); - - if (priv->self_heal == ZR_UNIFY_FG_SELF_HEAL) { - local->op_ret = 0; - local->failed = 0; - local->call_count = priv->child_count + 1; - local->sh_struct = - GF_CALLOC (1, sizeof (struct unify_self_heal_struct), - gf_unify_mt_unify_self_heal_struct); - - /* +1 is for NS */ - for (index = 0; - index < (priv->child_count + 1); index++) { - STACK_WIND_COOKIE (frame, - unify_sh_checksum_cbk, - priv->xl_array[index], - priv->xl_array[index], - priv->xl_array[index]->fops->checksum, - &local->loc1, - 0); - } - - /* Self-heal in foreground, hence no need - to UNWIND here */ - return 0; - } - - /* Self Heal done in background */ - bg_frame = copy_frame (frame); - INIT_LOCAL (bg_frame, bg_local); - loc_copy (&bg_local->loc1, &local->loc1); - bg_local->op_ret = 0; - bg_local->failed = 0; - bg_local->call_count = priv->child_count + 1; - bg_local->sh_struct = - GF_CALLOC (1, sizeof (struct unify_self_heal_struct), - gf_unify_mt_unify_self_heal_struct); - - /* +1 is for NS */ - for (index = 0; index < (priv->child_count + 1); index++) { - STACK_WIND_COOKIE (bg_frame, - unify_bgsh_checksum_cbk, - priv->xl_array[index], - priv->xl_array[index], - priv->xl_array[index]->fops->checksum, - &bg_local->loc1, - 0); - } - } - - /* generation number matches, self heal already done or - * self heal done in background: just do STACK_UNWIND - */ - tmp_inode = local->loc1.inode; - tmp_dict = local->dict; - - unify_local_wipe (local); - - /* This is lookup_cbk ()'s UNWIND. 
*/ - STACK_UNWIND (frame, - local->op_ret, - local->op_errno, - tmp_inode, - &local->stbuf, - local->dict, - &local->oldpostparent); - - if (tmp_dict) - dict_unref (tmp_dict); - - return 0; -} - diff --git a/xlators/cluster/unify/src/unify.c b/xlators/cluster/unify/src/unify.c deleted file mode 100644 index 6dc93083d..000000000 --- a/xlators/cluster/unify/src/unify.c +++ /dev/null @@ -1,4589 +0,0 @@ -/* - Copyright (c) 2006-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ - -/** - * xlators/cluster/unify: - * - This xlator is one of the main translator in GlusterFS, which - * actually does the clustering work of the file system. One need to - * understand that, unify assumes file to be existing in only one of - * the child node, and directories to be present on all the nodes. - * - * NOTE: - * Now, unify has support for global namespace, which is used to keep a - * global view of fs's namespace tree. The stat for directories are taken - * just from the namespace, where as for files, just 'ia_ino' is taken from - * Namespace node, and other stat info is taken from the actual storage node. - * Also Namespace node helps to keep consistant inode for files across - * glusterfs (re-)mounts. 
- */ - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "unify.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "defaults.h" -#include "common-utils.h" -#include <signal.h> -#include <libgen.h> -#include "compat-errno.h" -#include "compat.h" - -#define UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR(_loc) do { \ - if (!(_loc && _loc->inode)) { \ - STACK_UNWIND (frame, -1, EINVAL, NULL, NULL, NULL); \ - return 0; \ - } \ -} while(0) - - -#define UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR(_fd) do { \ - if (!(_fd && !fd_ctx_get (_fd, this, NULL))) { \ - STACK_UNWIND (frame, -1, EBADFD, NULL, NULL); \ - return 0; \ - } \ -} while(0) - -#define UNIFY_CHECK_FD_AND_UNWIND_ON_ERR(_fd) do { \ - if (!_fd) { \ - STACK_UNWIND (frame, -1, EBADFD, NULL, NULL); \ - return 0; \ - } \ -} while(0) - -/** - * unify_local_wipe - free all the extra allocation of local->* here. - */ -static void -unify_local_wipe (unify_local_t *local) -{ - /* Free the strdup'd variables in the local structure */ - if (local->name) { - GF_FREE (local->name); - } - loc_wipe (&local->loc1); - loc_wipe (&local->loc2); -} - - - -/* - * unify_normalize_stats - - */ -void -unify_normalize_stats (struct statvfs *buf, - unsigned long bsize, - unsigned long frsize) -{ - double factor; - - if (buf->f_bsize != bsize) { - factor = ((double) buf->f_bsize) / bsize; - buf->f_bsize = bsize; - buf->f_bfree = (fsblkcnt_t) (factor * buf->f_bfree); - buf->f_bavail = (fsblkcnt_t) (factor * buf->f_bavail); - } - - if (buf->f_frsize != frsize) { - factor = ((double) buf->f_frsize) / frsize; - buf->f_frsize = frsize; - buf->f_blocks = (fsblkcnt_t) (factor * buf->f_blocks); - } -} - - -xlator_t * -unify_loc_subvol (loc_t *loc, xlator_t *this) -{ - unify_private_t *priv = NULL; - xlator_t *subvol = NULL; - int16_t *list = NULL; - long index = 0; - xlator_t *subvol_i = NULL; - int ret = 0; - uint64_t tmp_list = 0; - - priv = 
this->private; - subvol = NS (this); - - if (!IA_ISDIR (loc->inode->ia_type)) { - ret = inode_ctx_get (loc->inode, this, &tmp_list); - list = (int16_t *)(long)tmp_list; - if (!list) - goto out; - - for (index = 0; list[index] != -1; index++) { - subvol_i = priv->xl_array[list[index]]; - if (subvol_i != NS (this)) { - subvol = subvol_i; - break; - } - } - } -out: - return subvol; -} - - - -/** - * unify_statfs_cbk - - */ -int32_t -unify_statfs_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct statvfs *stbuf) -{ - int32_t callcnt = 0; - struct statvfs *dict_buf = NULL; - unsigned long bsize; - unsigned long frsize; - unify_local_t *local = (unify_local_t *)frame->local; - call_frame_t *prev_frame = cookie; - - LOCK (&frame->lock); - { - if (op_ret >= 0) { - /* when a call is successfull, add it to local->dict */ - dict_buf = &local->statvfs_buf; - - if (dict_buf->f_bsize != 0) { - bsize = max (dict_buf->f_bsize, - stbuf->f_bsize); - - frsize = max (dict_buf->f_frsize, - stbuf->f_frsize); - unify_normalize_stats(dict_buf, bsize, frsize); - unify_normalize_stats(stbuf, bsize, frsize); - } else { - dict_buf->f_bsize = stbuf->f_bsize; - dict_buf->f_frsize = stbuf->f_frsize; - } - - dict_buf->f_blocks += stbuf->f_blocks; - dict_buf->f_bfree += stbuf->f_bfree; - dict_buf->f_bavail += stbuf->f_bavail; - dict_buf->f_files += stbuf->f_files; - dict_buf->f_ffree += stbuf->f_ffree; - dict_buf->f_favail += stbuf->f_favail; - dict_buf->f_fsid = stbuf->f_fsid; - dict_buf->f_flag = stbuf->f_flag; - dict_buf->f_namemax = stbuf->f_namemax; - local->op_ret = op_ret; - } else { - /* fop on storage node has failed due to some error */ - if (op_errno != ENOTCONN) { - gf_log (this->name, GF_LOG_ERROR, - "child(%s): %s", - prev_frame->this->name, - strerror (op_errno)); - } - local->op_errno = op_errno; - } - callcnt = --local->call_count; - } - UNLOCK (&frame->lock); - - if (!callcnt) { - STACK_UNWIND (frame, local->op_ret, 
local->op_errno, - &local->statvfs_buf); - } - - return 0; -} - -/** - * unify_statfs - - */ -int32_t -unify_statfs (call_frame_t *frame, - xlator_t *this, - loc_t *loc) -{ - unify_local_t *local = NULL; - xlator_list_t *trav = this->children; - - INIT_LOCAL (frame, local); - local->call_count = ((unify_private_t *)this->private)->child_count; - - while(trav) { - STACK_WIND (frame, - unify_statfs_cbk, - trav->xlator, - trav->xlator->fops->statfs, - loc); - trav = trav->next; - } - - return 0; -} - -/** - * unify_buf_cbk - - */ -int32_t -unify_buf_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *buf) -{ - int32_t callcnt = 0; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - call_frame_t *prev_frame = cookie; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "%s(): child(%s): path(%s): %s", - gf_fop_list[frame->root->op], - prev_frame->this->name, - (local->loc1.path)?local->loc1.path:"", - strerror (op_errno)); - - local->op_errno = op_errno; - if ((op_errno == ENOENT) && priv->optimist) - local->op_ret = 0; - } - - if (op_ret >= 0) { - local->op_ret = 0; - - if (NS (this) == prev_frame->this) { - local->ia_ino = buf->ia_ino; - /* If the entry is directory, get the stat - from NS node */ - if (IA_ISDIR (buf->ia_type) || - !local->stbuf.ia_blksize) { - local->stbuf = *buf; - } - } - - if ((!IA_ISDIR (buf->ia_type)) && - (NS (this) != prev_frame->this)) { - /* If file, take the stat info from Storage - node. 
*/ - local->stbuf = *buf; - } - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - /* If the inode number is not filled, operation should - fail */ - if (!local->ia_ino) - local->op_ret = -1; - - local->stbuf.ia_ino = local->ia_ino; - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, local->op_errno, - &local->stbuf); - } - - return 0; -} - -#define check_if_dht_linkfile(s) \ - ((st_mode_from_ia (s->ia_prot, s->ia_type) & ~S_IFMT) == S_ISVTX) - -/** - * unify_lookup_cbk - - */ -int32_t -unify_lookup_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - dict_t *dict, - struct iatt *postparent) -{ - int32_t callcnt = 0; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - inode_t *tmp_inode = NULL; - dict_t *local_dict = NULL; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret == -1) { - if (local->revalidate && - (op_errno == ESTALE)) { - /* ESTALE takes priority */ - local->op_errno = op_errno; - local->failed = 1; - } - - if ((op_errno != ENOTCONN) - && (op_errno != ENOENT) - && (local->op_errno != ESTALE)) { - /* if local->op_errno is already ESTALE, then - * ESTALE has to propogated to the parent first. - * do not enter here. - */ - gf_log (this->name, GF_LOG_ERROR, - "child(%s): path(%s): %s", - priv->xl_array[(long)cookie]->name, - local->loc1.path, strerror (op_errno)); - local->op_errno = op_errno; - local->failed = 1; - - } else if (local->revalidate && - (local->op_errno != ESTALE) && - !(priv->optimist && (op_errno == ENOENT))) { - - gf_log (this->name, - (op_errno == ENOTCONN) ? 
- GF_LOG_DEBUG:GF_LOG_ERROR, - "child(%s): path(%s): %s", - priv->xl_array[(long)cookie]->name, - local->loc1.path, strerror (op_errno)); - local->op_errno = op_errno; - local->failed = 1; - } - } - - if (op_ret == 0) { - local->op_ret = 0; - - if (check_if_dht_linkfile(buf)) { - gf_log (this->name, GF_LOG_CRITICAL, - "file %s may be DHT link file on %s, " - "make sure the backend is not shared " - "between unify and DHT", - local->loc1.path, - priv->xl_array[(long)cookie]->name); - } - - if (local->stbuf.ia_type && local->stbuf.ia_blksize) { - /* make sure we already have a stbuf - stored in local->stbuf */ - if (IA_ISDIR (local->stbuf.ia_type) && - !IA_ISDIR (buf->ia_type)) { - gf_log (this->name, GF_LOG_CRITICAL, - "[CRITICAL] '%s' is directory " - "on namespace, non-directory " - "on node '%s', returning EIO", - local->loc1.path, - priv->xl_array[(long)cookie]->name); - local->return_eio = 1; - } - if (!IA_ISDIR (local->stbuf.ia_type) && - IA_ISDIR (buf->ia_type)) { - gf_log (this->name, GF_LOG_CRITICAL, - "[CRITICAL] '%s' is directory " - "on node '%s', non-directory " - "on namespace, returning EIO", - local->loc1.path, - priv->xl_array[(long)cookie]->name); - local->return_eio = 1; - } - } - - if (!local->revalidate && !IA_ISDIR (buf->ia_type)) { - /* This is the first time lookup on file*/ - if (!local->list) { - /* list is not allocated, allocate - the max possible range */ - local->list = GF_CALLOC (1, 2 * (priv->child_count + 2), - gf_unify_mt_int16_t); - if (!local->list) { - gf_log (this->name, - GF_LOG_CRITICAL, - "Not enough memory"); - STACK_UNWIND (frame, -1, - ENOMEM, inode, - NULL, NULL, NULL); - return 0; - } - } - /* update the index of the list */ - local->list [local->index++] = - (int16_t)(long)cookie; - } - - if (!local->revalidate && IA_ISDIR (buf->ia_type)) { - /* fresh lookup of a directory */ - inode_ctx_put (local->loc1.inode, this, - priv->inode_generation); - } - - if ((!local->dict) && dict && - (priv->xl_array[(long)cookie] != 
NS(this))) { - local->dict = dict_ref (dict); - } - - /* index of NS node is == total child count */ - if (priv->child_count == (int16_t)(long)cookie) { - /* Take the inode number from namespace */ - local->ia_ino = buf->ia_ino; - if (IA_ISDIR (buf->ia_type) || - !(local->stbuf.ia_blksize)) { - local->stbuf = *buf; - local->oldpostparent = *postparent; - } - } else if (!IA_ISDIR (buf->ia_type)) { - /* If file, then get the stat from - storage node */ - local->stbuf = *buf; - } - - if (local->ia_nlink < buf->ia_nlink) { - local->ia_nlink = buf->ia_nlink; - } - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - local_dict = local->dict; - if (local->return_eio) { - gf_log (this->name, GF_LOG_CRITICAL, - "[CRITICAL] Unable to fix the path (%s) with " - "self-heal, try manual verification. " - "returning EIO.", local->loc1.path); - unify_local_wipe (local); - STACK_UNWIND (frame, -1, EIO, inode, NULL, NULL); - if (local_dict) { - dict_unref (local_dict); - } - return 0; - } - - if (!local->stbuf.ia_blksize) { - /* Inode not present */ - local->op_ret = -1; - } else { - if (!local->revalidate && - !IA_ISDIR (local->stbuf.ia_type)) { - /* If its a file, big array is useless, - allocate the smaller one */ - int16_t *list = NULL; - list = GF_CALLOC (1, 2 * (local->index + 1), - gf_unify_mt_int16_t); - ERR_ABORT (list); - memcpy (list, local->list, 2 * local->index); - /* Make the end of the list as -1 */ - GF_FREE (local->list); - local->list = list; - local->list [local->index] = -1; - /* Update the inode's ctx with proper array */ - /* TODO: log on failure */ - inode_ctx_put (local->loc1.inode, this, - (uint64_t)(long)local->list); - } - - if (IA_ISDIR(local->loc1.inode->ia_type)) { - /* lookup is done for directory */ - if (local->failed && priv->self_heal) { - /* Triggering self-heal */ - /* means, self-heal required for this - inode */ - local->inode_generation = 0; - priv->inode_generation++; - } - } else { - local->stbuf.ia_ino = local->ia_ino; - } - - 
local->stbuf.ia_nlink = local->ia_nlink; - } - if (local->op_ret == -1) { - if (!local->revalidate && local->list) - GF_FREE (local->list); - } - - if ((local->op_ret >= 0) && local->failed && - local->revalidate) { - /* Done revalidate, but it failed */ - if ((op_errno != ENOTCONN) - && (local->op_errno != ESTALE)) { - gf_log (this->name, GF_LOG_ERROR, - "Revalidate failed for path(%s): %s", - local->loc1.path, strerror (op_errno)); - } - local->op_ret = -1; - } - - if ((priv->self_heal && !priv->optimist) && - (!local->revalidate && (local->op_ret == 0) && - IA_ISDIR(local->stbuf.ia_type))) { - /* Let the self heal be done here */ - zr_unify_self_heal (frame, this, local); - local_dict = NULL; - } else { - if (local->failed) { - /* NOTE: directory lookup is sent to all - * subvolumes and success from a subvolume - * might set local->op_ret to 0 (zero) */ - local->op_ret = -1; - } - - /* either no self heal, or op_ret == -1 (failure) */ - tmp_inode = local->loc1.inode; - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, local->op_errno, - tmp_inode, &local->stbuf, local->dict, - &local->oldpostparent); - } - if (local_dict) { - dict_unref (local_dict); - } - } - - return 0; -} - -/** - * unify_lookup - - */ -int32_t -unify_lookup (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - dict_t *xattr_req) -{ - unify_local_t *local = NULL; - unify_private_t *priv = this->private; - int16_t *list = NULL; - long index = 0; - - if (!(loc && loc->inode)) { - gf_log (this->name, GF_LOG_ERROR, - "%s: Argument not right", loc?loc->path:"(null)"); - STACK_UNWIND (frame, -1, EINVAL, NULL, NULL, NULL, NULL); - return 0; - } - - /* Initialization */ - INIT_LOCAL (frame, local); - loc_copy (&local->loc1, loc); - if (local->loc1.path == NULL) { - gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O"); - STACK_UNWIND (frame, -1, ENOMEM, loc->inode, NULL, NULL, NULL); - return 0; - } - - if (inode_ctx_get (loc->inode, this, NULL) - && IA_ISDIR 
(loc->inode->ia_type)) { - local->revalidate = 1; - } - - if (!inode_ctx_get (loc->inode, this, NULL) && - loc->inode->ia_type && - !IA_ISDIR (loc->inode->ia_type)) { - uint64_t tmp_list = 0; - /* check if revalidate or fresh lookup */ - inode_ctx_get (loc->inode, this, &tmp_list); - local->list = (int16_t *)(long)tmp_list; - } - - if (local->list) { - list = local->list; - for (index = 0; list[index] != -1; index++); - if (index != 2) { - if (index < 2) { - gf_log (this->name, GF_LOG_ERROR, - "returning ESTALE for %s: file " - "count is %ld", loc->path, index); - /* Print where all the file is present */ - for (index = 0; - local->list[index] != -1; index++) { - gf_log (this->name, GF_LOG_ERROR, - "%s: found on %s", loc->path, - priv->xl_array[list[index]]->name); - } - unify_local_wipe (local); - STACK_UNWIND (frame, -1, ESTALE, - NULL, NULL, NULL, NULL); - return 0; - } else { - /* There are more than 2 presences */ - /* Just log and continue */ - gf_log (this->name, GF_LOG_ERROR, - "%s: file count is %ld", - loc->path, index); - /* Print where all the file is present */ - for (index = 0; - local->list[index] != -1; index++) { - gf_log (this->name, GF_LOG_ERROR, - "%s: found on %s", loc->path, - priv->xl_array[list[index]]->name); - } - } - } - - /* is revalidate */ - local->revalidate = 1; - - for (index = 0; list[index] != -1; index++) - local->call_count++; - - for (index = 0; list[index] != -1; index++) { - char need_break = (list[index+1] == -1); - STACK_WIND_COOKIE (frame, - unify_lookup_cbk, - (void *)(long)list[index], //cookie - priv->xl_array [list[index]], - priv->xl_array [list[index]]->fops->lookup, - loc, - xattr_req); - if (need_break) - break; - } - } else { - if (loc->inode->ia_type) { - if (inode_ctx_get (loc->inode, this, NULL)) { - inode_ctx_get (loc->inode, this, - &local->inode_generation); - } - } - /* This is first call, there is no list */ - /* call count should be all child + 1 namespace */ - local->call_count = priv->child_count + 1; - 
- for (index = 0; index <= priv->child_count; index++) { - STACK_WIND_COOKIE (frame, - unify_lookup_cbk, - (void *)index, //cookie - priv->xl_array[index], - priv->xl_array[index]->fops->lookup, - loc, - xattr_req); - } - } - - return 0; -} - -/** - * unify_stat - if directory, get the stat directly from NameSpace child. - * if file, check for a hint and send it only there (also to NS). - * if its a fresh stat, then do it on all the nodes. - * - * NOTE: for all the call, sending cookie as xlator pointer, which will be - * used in cbk. - */ -int32_t -unify_stat (call_frame_t *frame, - xlator_t *this, - loc_t *loc) -{ - unify_local_t *local = NULL; - unify_private_t *priv = this->private; - int16_t index = 0; - int16_t *list = NULL; - uint64_t tmp_list = 0; - - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); - - /* Initialization */ - INIT_LOCAL (frame, local); - loc_copy (&local->loc1, loc); - if (local->loc1.path == NULL) { - gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O"); - STACK_UNWIND (frame, -1, ENOMEM, NULL); - return 0; - } - local->ia_ino = loc->inode->ino; - if (IA_ISDIR (loc->inode->ia_type)) { - /* Directory */ - local->call_count = 1; - STACK_WIND (frame, unify_buf_cbk, NS(this), - NS(this)->fops->stat, loc); - } else { - /* File */ - inode_ctx_get (loc->inode, this, &tmp_list); - list = (int16_t *)(long)tmp_list; - - for (index = 0; list[index] != -1; index++) - local->call_count++; - - for (index = 0; list[index] != -1; index++) { - char need_break = (list[index+1] == -1); - STACK_WIND (frame, - unify_buf_cbk, - priv->xl_array[list[index]], - priv->xl_array[list[index]]->fops->stat, - loc); - if (need_break) - break; - } - } - - return 0; -} - -/** - * unify_access_cbk - - */ -int32_t -unify_access_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - STACK_UNWIND (frame, op_ret, op_errno); - return 0; -} - - -/** - * unify_access - Send request to only namespace, which has all the - * 
attributes set for the file. - */ -int32_t -unify_access (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - int32_t mask) -{ - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); - - STACK_WIND (frame, - unify_access_cbk, - NS(this), - NS(this)->fops->access, - loc, - mask); - - return 0; -} - -int32_t -unify_mkdir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) -{ - int32_t callcnt = 0; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - inode_t *tmp_inode = NULL; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if ((op_ret == -1) && !(priv->optimist && - (op_errno == ENOENT || - op_errno == EEXIST))) { - /* TODO: Decrement the inode_generation of - * this->inode's parent inode, hence the missing - * directory is created properly by self-heal. - * Currently, there is no way to get the parent - * inode directly. 
- */ - gf_log (this->name, GF_LOG_ERROR, - "child(%s): path(%s): %s", - priv->xl_array[(long)cookie]->name, - local->loc1.path, strerror (op_errno)); - if (op_errno != EEXIST) - local->failed = 1; - local->op_errno = op_errno; - } - - if (op_ret >= 0) - local->op_ret = 0; - - } - UNLOCK (&frame->lock); - - if (!callcnt) { - if (!local->failed) { - inode_ctx_put (local->loc1.inode, this, - priv->inode_generation); - } - - tmp_inode = local->loc1.inode; - unify_local_wipe (local); - - STACK_UNWIND (frame, local->op_ret, local->op_errno, - tmp_inode, &local->stbuf, - &local->oldpreparent, &local->oldpostparent); - } - - return 0; -} - -/** - * unify_ns_mkdir_cbk - - */ -int32_t -unify_ns_mkdir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) -{ - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - long index = 0; - - if (op_ret == -1) { - /* No need to send mkdir request to other servers, - * as namespace action failed - */ - gf_log (this->name, GF_LOG_ERROR, - "namespace: path(%s): %s", - local->name, strerror (op_errno)); - unify_local_wipe (local); - STACK_UNWIND (frame, op_ret, op_errno, inode, NULL, - NULL, NULL); - return 0; - } - - /* Create one inode for this entry */ - local->op_ret = 0; - local->stbuf = *buf; - - local->oldpreparent = *preparent; - local->oldpostparent = *postparent; - - local->call_count = priv->child_count; - - /* Send mkdir request to all the nodes now */ - for (index = 0; index < priv->child_count; index++) { - STACK_WIND_COOKIE (frame, - unify_mkdir_cbk, - (void *)index, //cookie - priv->xl_array[index], - priv->xl_array[index]->fops->mkdir, - &local->loc1, - local->mode); - } - - return 0; -} - - -/** - * unify_mkdir - - */ -int32_t -unify_mkdir (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - mode_t mode) -{ - unify_local_t *local = NULL; - - /* Initialization */ 
- INIT_LOCAL (frame, local); - local->mode = mode; - - loc_copy (&local->loc1, loc); - - if (local->loc1.path == NULL) { - gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O"); - STACK_UNWIND (frame, -1, ENOMEM, NULL, NULL); - return 0; - } - - STACK_WIND (frame, - unify_ns_mkdir_cbk, - NS(this), - NS(this)->fops->mkdir, - loc, - mode); - return 0; -} - -/** - * unify_rmdir_cbk - - */ -int32_t -unify_rmdir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *preparent, - struct iatt *postparent) -{ - int32_t callcnt = 0; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - if (op_ret == 0 || (priv->optimist && (op_errno == ENOENT))) - local->op_ret = 0; - if (op_ret == -1) - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - if (!callcnt) { - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, local->op_errno, - &local->oldpreparent, &local->oldpostparent); - } - - return 0; -} - -/** - * unify_ns_rmdir_cbk - - */ -int32_t -unify_ns_rmdir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *preparent, - struct iatt *postparent) -{ - int16_t index = 0; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - - if (op_ret == -1) { - /* No need to send rmdir request to other servers, - * as namespace action failed - */ - gf_log (this->name, - ((op_errno != ENOTEMPTY) ? 
- GF_LOG_ERROR : GF_LOG_DEBUG), - "namespace: path(%s): %s", - local->loc1.path, strerror (op_errno)); - unify_local_wipe (local); - STACK_UNWIND (frame, op_ret, op_errno, NULL, NULL); - return 0; - } - - local->call_count = priv->child_count; - - local->oldpreparent = *preparent; - local->oldpostparent = *postparent; - - for (index = 0; index < priv->child_count; index++) { - STACK_WIND (frame, - unify_rmdir_cbk, - priv->xl_array[index], - priv->xl_array[index]->fops->rmdir, - &local->loc1); - } - - return 0; -} - -/** - * unify_rmdir - - */ -int32_t -unify_rmdir (call_frame_t *frame, - xlator_t *this, - loc_t *loc) -{ - unify_local_t *local = NULL; - - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); - - /* Initialization */ - INIT_LOCAL (frame, local); - - loc_copy (&local->loc1, loc); - if (local->loc1.path == NULL) { - gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O"); - STACK_UNWIND (frame, -1, ENOMEM, NULL, NULL); - return 0; - } - - STACK_WIND (frame, - unify_ns_rmdir_cbk, - NS(this), - NS(this)->fops->rmdir, - loc); - - return 0; -} - -/** - * unify_open_cbk - - */ -int32_t -unify_open_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - fd_t *fd) -{ - int32_t callcnt = 0; - unify_local_t *local = frame->local; - - LOCK (&frame->lock); - { - if (op_ret >= 0) { - local->op_ret = op_ret; - if (NS(this) != (xlator_t *)cookie) { - /* Store child node's ptr, used in - all the f*** / FileIO calls */ - fd_ctx_set (fd, this, (uint64_t)(long)cookie); - } - } - if (op_ret == -1) { - local->op_errno = op_errno; - local->failed = 1; - } - callcnt = --local->call_count; - } - UNLOCK (&frame->lock); - - if (!callcnt) { - if ((local->failed == 1) && (local->op_ret >= 0)) { - local->call_count = 1; - /* return -1 to user */ - local->op_ret = -1; - //local->op_errno = EIO; - - if (!fd_ctx_get (local->fd, this, NULL)) { - gf_log (this->name, GF_LOG_ERROR, - "Open success on child node, " - "failed on namespace"); - } 
else { - gf_log (this->name, GF_LOG_ERROR, - "Open success on namespace, " - "failed on child node"); - } - } - - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, - local->op_errno, local->fd); - } - - return 0; -} - -#ifdef GF_DARWIN_HOST_OS -/** - * unify_create_lookup_cbk - - */ -int32_t -unify_open_lookup_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - dict_t *dict, - struct iatt *postparent) -{ - int32_t callcnt = 0; - int16_t index = 0; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - if ((op_ret == -1) && (op_errno != ENOENT)) { - gf_log (this->name, GF_LOG_ERROR, - "child(%s): path(%s): %s", - priv->xl_array[(long)cookie]->name, - local->loc1.path, strerror (op_errno)); - local->op_errno = op_errno; - } - - if (op_ret >= 0) { - local->op_ret = op_ret; - local->index++; - if (NS(this) == priv->xl_array[(long)cookie]) { - local->list[0] = (int16_t)(long)cookie; - } else { - local->list[1] = (int16_t)(long)cookie; - } - if (IA_ISDIR (buf->ia_type)) - local->failed = 1; - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - int16_t file_list[3] = {0,}; - local->op_ret = -1; - - file_list[0] = local->list[0]; - file_list[1] = local->list[1]; - file_list[2] = -1; - - if (local->index != 2) { - /* Lookup failed, can't do open */ - gf_log (this->name, GF_LOG_ERROR, - "%s: present on %d nodes", - local->name, local->index); - - if (local->index < 2) { - unify_local_wipe (local); - gf_log (this->name, GF_LOG_ERROR, - "returning as file found on less " - "than 2 nodes"); - STACK_UNWIND (frame, local->op_ret, - local->op_errno, local->fd); - return 0; - } - } - - if (local->failed) { - /* Open on directory, return EISDIR */ - unify_local_wipe (local); - STACK_UNWIND (frame, -1, EISDIR, local->fd); - return 0; - } - - /* Everything is perfect :) */ - local->call_count = 2; 
- - for (index = 0; file_list[index] != -1; index++) { - char need_break = (file_list[index+1] == -1); - STACK_WIND_COOKIE (frame, - unify_open_cbk, - priv->xl_array[file_list[index]], - priv->xl_array[file_list[index]], - priv->xl_array[file_list[index]]->fops->open, - &local->loc1, - local->flags, - local->fd, local->wbflags); - if (need_break) - break; - } - } - - return 0; -} - - -int32_t -unify_open_readlink_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - const char *path, - struct iatt *sbuf) -{ - int16_t index = 0; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - - if (op_ret == -1) { - STACK_UNWIND (frame, -1, ENOENT); - return 0; - } - - if (path[0] == '/') { - local->name = gf_strdup (path); - ERR_ABORT (local->name); - } else { - char *tmp_str = gf_strdup (local->loc1.path); - char *tmp_base = dirname (tmp_str); - local->name = GF_CALLOC (1, ZR_PATH_MAX, gf_unify_mt_char); - strcpy (local->name, tmp_base); - strncat (local->name, "/", 1); - strcat (local->name, path); - GF_FREE (tmp_str); - } - - local->list = GF_CALLOC (1, sizeof (int16_t) * 3, - gf_unify_mt_int16_t); - ERR_ABORT (local->list); - local->call_count = priv->child_count + 1; - local->op_ret = -1; - for (index = 0; index <= priv->child_count; index++) { - /* Send the lookup to all the nodes including namespace */ - STACK_WIND_COOKIE (frame, - unify_open_lookup_cbk, - (void *)(long)index, - priv->xl_array[index], - priv->xl_array[index]->fops->lookup, - &local->loc1, - NULL); - } - - return 0; -} -#endif /* GF_DARWIN_HOST_OS */ - -/** - * unify_open - - */ -int32_t -unify_open (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - int32_t flags, - fd_t *fd, - int32_t wbflags) -{ - unify_private_t *priv = this->private; - unify_local_t *local = NULL; - int16_t *list = NULL; - int16_t index = 0; - int16_t file_list[3] = {0,}; - uint64_t tmp_list = 0; - - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); - - /* 
Init */ - INIT_LOCAL (frame, local); - loc_copy (&local->loc1, loc); - local->fd = fd; - local->flags = flags; - local->wbflags = wbflags; - inode_ctx_get (loc->inode, this, &tmp_list); - list = (int16_t *)(long)tmp_list; - - local->list = list; - file_list[0] = priv->child_count; /* Thats namespace */ - file_list[2] = -1; - for (index = 0; list[index] != -1; index++) { - local->call_count++; - if (list[index] != priv->child_count) - file_list[1] = list[index]; - } - - if (local->call_count != 2) { - /* If the lookup was done for file */ - gf_log (this->name, GF_LOG_ERROR, - "%s: entry_count is %d", - loc->path, local->call_count); - for (index = 0; local->list[index] != -1; index++) - gf_log (this->name, GF_LOG_ERROR, "%s: found on %s", - loc->path, priv->xl_array[list[index]]->name); - - if (local->call_count < 2) { - gf_log (this->name, GF_LOG_ERROR, - "returning EIO as file found on onlyone node"); - STACK_UNWIND (frame, -1, EIO, fd); - return 0; - } - } - -#ifdef GF_DARWIN_HOST_OS - /* Handle symlink here */ - if (IA_ISLNK (loc->inode->ia_type)) { - /* Callcount doesn't matter here */ - STACK_WIND (frame, - unify_open_readlink_cbk, - NS(this), - NS(this)->fops->readlink, - loc, ZR_PATH_MAX); - return 0; - } -#endif /* GF_DARWIN_HOST_OS */ - - local->call_count = 2; - for (index = 0; file_list[index] != -1; index++) { - char need_break = (file_list[index+1] == -1); - STACK_WIND_COOKIE (frame, - unify_open_cbk, - priv->xl_array[file_list[index]], //cookie - priv->xl_array[file_list[index]], - priv->xl_array[file_list[index]]->fops->open, - loc, - flags, - fd, wbflags); - if (need_break) - break; - } - - return 0; -} - - -int32_t -unify_create_unlink_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *preparent, - struct iatt *postparent) -{ - unify_local_t *local = frame->local; - inode_t *inode = local->loc1.inode; - - unify_local_wipe (local); - - STACK_UNWIND (frame, local->op_ret, local->op_errno, 
local->fd, - inode, &local->stbuf, - &local->oldpreparent, &local->oldpostparent); - - return 0; -} - -/** - * unify_create_open_cbk - - */ -int32_t -unify_create_open_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - fd_t *fd) -{ - int ret = 0; - int32_t callcnt = 0; - unify_local_t *local = frame->local; - inode_t *inode = NULL; - xlator_t *child = NULL; - uint64_t tmp_value = 0; - - LOCK (&frame->lock); - { - if (op_ret >= 0) { - local->op_ret = op_ret; - if (NS(this) != (xlator_t *)cookie) { - /* Store child node's ptr, used in all - the f*** / FileIO calls */ - /* TODO: log on failure */ - ret = fd_ctx_get (fd, this, &tmp_value); - cookie = (void *)(long)tmp_value; - } else { - /* NOTE: open successful on namespace. - * fd's ctx can be used to identify open - * failure on storage subvolume. cool - * ide ;) */ - local->failed = 0; - } - } else { - gf_log (this->name, GF_LOG_ERROR, - "child(%s): path(%s): %s", - ((xlator_t *)cookie)->name, - local->loc1.path, strerror (op_errno)); - local->op_errno = op_errno; - local->failed = 1; - } - callcnt = --local->call_count; - } - UNLOCK (&frame->lock); - - if (!callcnt) { - if (local->failed == 1 && (local->op_ret >= 0)) { - local->call_count = 1; - /* return -1 to user */ - local->op_ret = -1; - local->op_errno = EIO; - local->fd = fd; - local->call_count = 1; - - if (!fd_ctx_get (local->fd, this, &tmp_value)) { - child = (xlator_t *)(long)tmp_value; - - gf_log (this->name, GF_LOG_ERROR, - "Create success on child node, " - "failed on namespace"); - - STACK_WIND (frame, - unify_create_unlink_cbk, - child, - child->fops->unlink, - &local->loc1); - } else { - gf_log (this->name, GF_LOG_ERROR, - "Create success on namespace, " - "failed on child node"); - - STACK_WIND (frame, - unify_create_unlink_cbk, - NS(this), - NS(this)->fops->unlink, - &local->loc1); - } - return 0; - } - inode = local->loc1.inode; - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, 
local->op_errno, fd, - inode, &local->stbuf, - &local->oldpreparent, &local->oldpostparent); - } - return 0; -} - -/** - * unify_create_lookup_cbk - - */ -int32_t -unify_create_lookup_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - dict_t *dict, - struct iatt *postparent) -{ - int32_t callcnt = 0; - int16_t index = 0; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "child(%s): path(%s): %s", - priv->xl_array[(long)cookie]->name, - local->loc1.path, strerror (op_errno)); - local->op_errno = op_errno; - local->failed = 1; - } - - if (op_ret >= 0) { - local->op_ret = op_ret; - local->list[local->index++] = (int16_t)(long)cookie; - if (NS(this) == priv->xl_array[(long)cookie]) { - local->ia_ino = buf->ia_ino; - } else { - local->stbuf = *buf; - } - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - int16_t *list = local->list; - int16_t file_list[3] = {0,}; - local->op_ret = -1; - - local->list [local->index] = -1; - file_list[0] = list[0]; - file_list[1] = list[1]; - file_list[2] = -1; - - local->stbuf.ia_ino = local->ia_ino; - /* TODO: log on failure */ - inode_ctx_put (local->loc1.inode, this, - (uint64_t)(long)local->list); - - if (local->index != 2) { - /* Lookup failed, can't do open */ - gf_log (this->name, GF_LOG_ERROR, - "%s: present on %d nodes", - local->loc1.path, local->index); - file_list[0] = priv->child_count; - for (index = 0; list[index] != -1; index++) { - gf_log (this->name, GF_LOG_ERROR, - "%s: found on %s", local->loc1.path, - priv->xl_array[list[index]]->name); - if (list[index] != priv->child_count) - file_list[1] = list[index]; - } - - if (local->index < 2) { - unify_local_wipe (local); - gf_log (this->name, GF_LOG_ERROR, - "returning EIO as file found on " - "only one node"); - STACK_UNWIND (frame, -1, 
EIO, - local->fd, inode, NULL, - NULL, NULL); - return 0; - } - } - /* Everything is perfect :) */ - local->call_count = 2; - - for (index = 0; file_list[index] != -1; index++) { - char need_break = (file_list[index+1] == -1); - STACK_WIND_COOKIE (frame, - unify_create_open_cbk, - priv->xl_array[file_list[index]], - priv->xl_array[file_list[index]], - priv->xl_array[file_list[index]]->fops->open, - &local->loc1, - local->flags, - local->fd, 0); - if (need_break) - break; - } - } - - return 0; -} - - -/** - * unify_create_cbk - - */ -int32_t -unify_create_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - fd_t *fd, - inode_t *inode, - struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) -{ - int ret = 0; - unify_local_t *local = frame->local; - call_frame_t *prev_frame = cookie; - inode_t *tmp_inode = NULL; - - if (op_ret == -1) { - /* send unlink () on Namespace */ - local->op_errno = op_errno; - local->op_ret = -1; - local->call_count = 1; - gf_log (this->name, GF_LOG_ERROR, - "create failed on %s (file %s, error %s), " - "sending unlink to namespace", - prev_frame->this->name, - local->loc1.path, strerror (op_errno)); - - STACK_WIND (frame, - unify_create_unlink_cbk, - NS(this), - NS(this)->fops->unlink, - &local->loc1); - - return 0; - } - - if (op_ret >= 0) { - local->op_ret = op_ret; - local->stbuf = *buf; - /* Just inode number should be from NS node */ - local->stbuf.ia_ino = local->ia_ino; - - /* TODO: log on failure */ - ret = fd_ctx_set (fd, this, (uint64_t)(long)prev_frame->this); - } - - tmp_inode = local->loc1.inode; - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, local->op_errno, local->fd, - tmp_inode, &local->stbuf, - &local->oldpreparent, &local->oldpostparent); - - return 0; -} - -/** - * unify_ns_create_cbk - - * - */ -int32_t -unify_ns_create_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - fd_t *fd, - inode_t 
*inode, - struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) -{ - struct sched_ops *sched_ops = NULL; - xlator_t *sched_xl = NULL; - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - int16_t *list = NULL; - int16_t index = 0; - - if (op_ret == -1) { - /* No need to send create request to other servers, as - namespace action failed. Handle exclusive create here. */ - if ((op_errno != EEXIST) || - ((op_errno == EEXIST) && - ((local->flags & O_EXCL) == O_EXCL))) { - /* If its just a create call without O_EXCL, - don't do this */ - gf_log (this->name, GF_LOG_ERROR, - "namespace: path(%s): %s", - local->loc1.path, strerror (op_errno)); - unify_local_wipe (local); - STACK_UNWIND (frame, op_ret, op_errno, fd, inode, buf, - preparent, postparent); - return 0; - } - } - - if (op_ret >= 0) { - /* Get the inode number from the NS node */ - local->ia_ino = buf->ia_ino; - - local->oldpreparent = *preparent; - local->oldpostparent = *postparent; - - local->op_ret = -1; - - /* Start the mapping list */ - list = GF_CALLOC (1, sizeof (int16_t) * 3, - gf_unify_mt_int16_t); - ERR_ABORT (list); - inode_ctx_put (inode, this, (uint64_t)(long)list); - list[0] = priv->child_count; - list[2] = -1; - - /* This means, file doesn't exist anywhere in the Filesystem */ - sched_ops = priv->sched_ops; - - /* Send create request to the scheduled node now */ - sched_xl = sched_ops->schedule (this, local->loc1.path); - if (sched_xl == NULL) - { - /* send unlink () on Namespace */ - local->op_errno = ENOTCONN; - local->op_ret = -1; - local->call_count = 1; - gf_log (this->name, GF_LOG_ERROR, - "no node online to schedule create:(file %s) " - "sending unlink to namespace", - (local->loc1.path)?local->loc1.path:""); - - STACK_WIND (frame, - unify_create_unlink_cbk, - NS(this), - NS(this)->fops->unlink, - &local->loc1); - - return 0; - } - - for (index = 0; index < priv->child_count; index++) - if (sched_xl == priv->xl_array[index]) - break; - list[1] = 
index; - - STACK_WIND (frame, unify_create_cbk, - sched_xl, sched_xl->fops->create, - &local->loc1, local->flags, local->mode, fd); - } else { - /* File already exists, and there is no O_EXCL flag */ - - gf_log (this->name, GF_LOG_DEBUG, - "File(%s) already exists on namespace, sending " - "open instead", local->loc1.path); - - local->list = GF_CALLOC (1, sizeof (int16_t) * 3, - gf_unify_mt_int16_t); - ERR_ABORT (local->list); - local->call_count = priv->child_count + 1; - local->op_ret = -1; - for (index = 0; index <= priv->child_count; index++) { - /* Send lookup() to all nodes including namespace */ - STACK_WIND_COOKIE (frame, - unify_create_lookup_cbk, - (void *)(long)index, - priv->xl_array[index], - priv->xl_array[index]->fops->lookup, - &local->loc1, - NULL); - } - } - return 0; -} - -/** - * unify_create - create a file in global namespace first, so other - * clients can see them. Create the file in storage nodes in background. - */ -int32_t -unify_create (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - int32_t flags, - mode_t mode, - fd_t *fd) -{ - unify_local_t *local = NULL; - - /* Initialization */ - INIT_LOCAL (frame, local); - local->mode = mode; - local->flags = flags; - local->fd = fd; - - loc_copy (&local->loc1, loc); - if (local->loc1.path == NULL) { - gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O"); - STACK_UNWIND (frame, -1, ENOMEM, fd, loc->inode, NULL, - NULL, NULL); - return 0; - } - - STACK_WIND (frame, - unify_ns_create_cbk, - NS(this), - NS(this)->fops->create, - loc, - flags | O_EXCL, - mode, - fd); - - return 0; -} - - -/** - * unify_opendir_cbk - - */ -int32_t -unify_opendir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - fd_t *fd) -{ - STACK_UNWIND (frame, op_ret, op_errno, fd); - - return 0; -} - -/** - * unify_opendir - - */ -int32_t -unify_opendir (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - fd_t *fd) -{ - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR 
(loc); - - STACK_WIND (frame, unify_opendir_cbk, - NS(this), NS(this)->fops->opendir, loc, fd); - - return 0; -} - - -int32_t -unify_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *statpre, - struct iatt *statpost) -{ - int32_t callcnt = 0; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - call_frame_t *prev_frame = cookie; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "%s(): child(%s): path(%s): %s", - gf_fop_list[frame->root->op], - prev_frame->this->name, - (local->loc1.path)?local->loc1.path:"", - strerror (op_errno)); - - local->op_errno = op_errno; - if ((op_errno == ENOENT) && priv->optimist) - local->op_ret = 0; - } - - if (op_ret >= 0) { - local->op_ret = 0; - - if (NS (this) == prev_frame->this) { - local->ia_ino = statpost->ia_ino; - /* If the entry is directory, get the stat - from NS node */ - if (IA_ISDIR (statpost->ia_type) || - !local->stpost.ia_blksize) { - local->stpre = *statpre; - local->stpost = *statpost; - } - } - - if ((!IA_ISDIR (statpost->ia_type)) && - (NS (this) != prev_frame->this)) { - /* If file, take the stat info from Storage - node. 
*/ - local->stpre = *statpre; - local->stpost = *statpost; - } - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - /* If the inode number is not filled, operation should - fail */ - if (!local->ia_ino) - local->op_ret = -1; - - local->stpre.ia_ino = local->ia_ino; - local->stpost.ia_ino = local->ia_ino; - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, local->op_errno, - &local->stpre, &local->stpost); - } - - return 0; -} - - -int32_t -unify_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - struct iatt *stbuf, int32_t valid) -{ - unify_local_t *local = NULL; - unify_private_t *priv = this->private; - int32_t index = 0; - int32_t callcnt = 0; - uint64_t tmp_list = 0; - - if (!(loc && loc->inode)) { - STACK_UNWIND (frame, -1, EINVAL, NULL, NULL); - return 0; - } - - /* Initialization */ - INIT_LOCAL (frame, local); - loc_copy (&local->loc1, loc); - - if (IA_ISDIR (loc->inode->ia_type)) { - local->call_count = 1; - - STACK_WIND (frame, - unify_setattr_cbk, - NS (this), - NS (this)->fops->setattr, - loc, stbuf, valid); - } else { - inode_ctx_get (loc->inode, this, &tmp_list); - local->list = (int16_t *)(long)tmp_list; - - for (index = 0; local->list[index] != -1; index++) { - local->call_count++; - callcnt++; - } - - for (index = 0; local->list[index] != -1; index++) { - STACK_WIND (frame, - unify_setattr_cbk, - priv->xl_array[local->list[index]], - priv->xl_array[local->list[index]]->fops->setattr, - loc, stbuf, valid); - - if (!--callcnt) - break; - } - } - - return 0; -} - - -int32_t -unify_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, - struct iatt *stbuf, int32_t valid) -{ - unify_local_t *local = NULL; - xlator_t *child = NULL; - uint64_t tmp_child = 0; - - UNIFY_CHECK_FD_AND_UNWIND_ON_ERR(fd); - - /* Initialization */ - INIT_LOCAL (frame, local); - - if (!fd_ctx_get (fd, this, &tmp_child)) { - /* If its set, then its file */ - child = (xlator_t *)(long)tmp_child; - - local->call_count = 2; - - STACK_WIND (frame, 
unify_setattr_cbk, child, - child->fops->fsetattr, fd, stbuf, valid); - - STACK_WIND (frame, unify_setattr_cbk, NS(this), - NS(this)->fops->fsetattr, fd, stbuf, valid); - } else { - local->call_count = 1; - - STACK_WIND (frame, unify_setattr_cbk, - NS(this), NS(this)->fops->fsetattr, - fd, stbuf, valid); - } - - return 0; -} - - -/** - * unify_truncate_cbk - - */ -int32_t -unify_truncate_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *prebuf, - struct iatt *postbuf) -{ - int32_t callcnt = 0; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - call_frame_t *prev_frame = cookie; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "child(%s): path(%s): %s", - prev_frame->this->name, - (local->loc1.path)?local->loc1.path:"", - strerror (op_errno)); - local->op_errno = op_errno; - if (!((op_errno == ENOENT) && priv->optimist)) - local->op_ret = -1; - } - - if (op_ret >= 0) { - if (NS (this) == prev_frame->this) { - local->ia_ino = postbuf->ia_ino; - /* If the entry is directory, get the - stat from NS node */ - if (IA_ISDIR (postbuf->ia_type) || - !local->stbuf.ia_blksize) { - local->stbuf = *prebuf; - local->poststbuf = *postbuf; - } - } - - if ((!IA_ISDIR (postbuf->ia_type)) && - (NS (this) != prev_frame->this)) { - /* If file, take the stat info from - Storage node. 
*/ - local->stbuf = *prebuf; - local->poststbuf = *postbuf; - } - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - if (local->ia_ino) { - local->stbuf.ia_ino = local->ia_ino; - local->poststbuf.ia_ino = local->ia_ino; - } else { - local->op_ret = -1; - } - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, local->op_errno, - &local->stbuf, &local->poststbuf); - } - - return 0; -} - - -/** - * unify_truncate - - */ -int32_t -unify_truncate (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - off_t offset) -{ - unify_local_t *local = NULL; - unify_private_t *priv = this->private; - int32_t index = 0; - int32_t callcnt = 0; - uint64_t tmp_list = 0; - - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); - - /* Initialization */ - INIT_LOCAL (frame, local); - loc_copy (&local->loc1, loc); - local->ia_ino = loc->inode->ino; - - if (IA_ISDIR (loc->inode->ia_type)) { - local->call_count = 1; - - STACK_WIND (frame, - unify_truncate_cbk, - NS(this), - NS(this)->fops->truncate, - loc, - 0); - } else { - local->op_ret = 0; - inode_ctx_get (loc->inode, this, &tmp_list); - local->list = (int16_t *)(long)tmp_list; - - for (index = 0; local->list[index] != -1; index++) { - local->call_count++; - callcnt++; - } - - /* Don't send offset to NS truncate */ - STACK_WIND (frame, unify_truncate_cbk, NS(this), - NS(this)->fops->truncate, loc, 0); - callcnt--; - - for (index = 0; local->list[index] != -1; index++) { - if (NS(this) != priv->xl_array[local->list[index]]) { - STACK_WIND (frame, - unify_truncate_cbk, - priv->xl_array[local->list[index]], - priv->xl_array[local->list[index]]->fops->truncate, - loc, - offset); - if (!--callcnt) - break; - } - } - } - - return 0; -} - -/** - * unify_readlink_cbk - - */ -int32_t -unify_readlink_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - const char *path, - struct iatt *sbuf) -{ - STACK_UNWIND (frame, op_ret, op_errno, path, sbuf); - return 0; -} - -/** - * unify_readlink - 
Read the link only from the storage node. - */ -int32_t -unify_readlink (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - size_t size) -{ - unify_private_t *priv = this->private; - int32_t entry_count = 0; - int16_t *list = NULL; - int16_t index = 0; - uint64_t tmp_list = 0; - - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); - - inode_ctx_get (loc->inode, this, &tmp_list); - list = (int16_t *)(long)tmp_list; - - for (index = 0; list[index] != -1; index++) - entry_count++; - - if (entry_count >= 2) { - for (index = 0; list[index] != -1; index++) { - if (priv->xl_array[list[index]] != NS(this)) { - STACK_WIND (frame, - unify_readlink_cbk, - priv->xl_array[list[index]], - priv->xl_array[list[index]]->fops->readlink, - loc, - size); - break; - } - } - } else { - gf_log (this->name, GF_LOG_ERROR, - "returning ENOENT, no softlink files found " - "on storage node"); - STACK_UNWIND (frame, -1, ENOENT, NULL); - } - - return 0; -} - - -/** - * unify_unlink_cbk - - */ -int32_t -unify_unlink_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *preparent, - struct iatt *postparent) -{ - int32_t callcnt = 0; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - if (op_ret == 0 || ((op_errno == ENOENT) && priv->optimist)) - local->op_ret = 0; - if (op_ret == -1) - local->op_errno = op_errno; - - if (((call_frame_t *)cookie)->this == NS(this)) { - local->oldpreparent = *preparent; - local->oldpostparent = *postparent; - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, local->op_errno, - &local->oldpreparent, &local->oldpostparent); - } - - return 0; -} - - -/** - * unify_unlink - - */ -int32_t -unify_unlink (call_frame_t *frame, - xlator_t *this, - loc_t *loc) -{ - unify_private_t *priv = this->private; - unify_local_t *local = NULL; - int16_t *list = NULL; - 
int16_t index = 0; - uint64_t tmp_list = 0; - - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); - - /* Initialization */ - INIT_LOCAL (frame, local); - loc_copy (&local->loc1, loc); - - inode_ctx_get (loc->inode, this, &tmp_list); - list = (int16_t *)(long)tmp_list; - - for (index = 0; list[index] != -1; index++) - local->call_count++; - - if (local->call_count) { - for (index = 0; list[index] != -1; index++) { - char need_break = (list[index+1] == -1); - STACK_WIND (frame, - unify_unlink_cbk, - priv->xl_array[list[index]], - priv->xl_array[list[index]]->fops->unlink, - loc); - if (need_break) - break; - } - } else { - gf_log (this->name, GF_LOG_ERROR, - "%s: returning ENOENT", loc->path); - STACK_UNWIND (frame, -1, ENOENT, NULL, NULL); - } - - return 0; -} - - -/** - * unify_readv_cbk - - */ -int32_t -unify_readv_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iovec *vector, - int32_t count, - struct iatt *stbuf, - struct iobref *iobref) -{ - STACK_UNWIND (frame, op_ret, op_errno, vector, count, stbuf, iobref); - return 0; -} - -/** - * unify_readv - - */ -int32_t -unify_readv (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - size_t size, - off_t offset) -{ - UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd); - xlator_t *child = NULL; - uint64_t tmp_child = 0; - - fd_ctx_get (fd, this, &tmp_child); - child = (xlator_t *)(long)tmp_child; - - STACK_WIND (frame, - unify_readv_cbk, - child, - child->fops->readv, - fd, - size, - offset); - - - return 0; -} - -/** - * unify_writev_cbk - - */ -int32_t -unify_writev_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *prebuf, - struct iatt *postbuf) -{ - unify_local_t *local = NULL; - - local = frame->local; - - local->stbuf = *prebuf; - local->stbuf.ia_ino = local->ia_ino; - - local->poststbuf = *postbuf; - local->poststbuf.ia_ino = local->ia_ino; - - STACK_UNWIND (frame, op_ret, op_errno, - &local->stbuf, 
&local->poststbuf); - return 0; -} - -/** - * unify_writev - - */ -int32_t -unify_writev (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - struct iovec *vector, - int32_t count, - off_t off, - struct iobref *iobref) -{ - UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd); - xlator_t *child = NULL; - uint64_t tmp_child = 0; - unify_local_t *local = NULL; - - INIT_LOCAL (frame, local); - local->ia_ino = fd->inode->ino; - - fd_ctx_get (fd, this, &tmp_child); - child = (xlator_t *)(long)tmp_child; - - STACK_WIND (frame, - unify_writev_cbk, - child, - child->fops->writev, - fd, - vector, - count, - off, - iobref); - - return 0; -} - -/** - * unify_ftruncate - - */ -int32_t -unify_ftruncate (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - off_t offset) -{ - xlator_t *child = NULL; - unify_local_t *local = NULL; - uint64_t tmp_child = 0; - - UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR(fd); - - /* Initialization */ - INIT_LOCAL (frame, local); - local->op_ret = 0; - - fd_ctx_get (fd, this, &tmp_child); - child = (xlator_t *)(long)tmp_child; - - local->call_count = 2; - - STACK_WIND (frame, unify_truncate_cbk, - child, child->fops->ftruncate, - fd, offset); - - STACK_WIND (frame, unify_truncate_cbk, - NS(this), NS(this)->fops->ftruncate, - fd, 0); - - return 0; -} - - -/** - * unify_flush_cbk - - */ -int32_t -unify_flush_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - STACK_UNWIND (frame, op_ret, op_errno); - return 0; -} - -/** - * unify_flush - - */ -int32_t -unify_flush (call_frame_t *frame, - xlator_t *this, - fd_t *fd) -{ - UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd); - xlator_t *child = NULL; - uint64_t tmp_child = 0; - - fd_ctx_get (fd, this, &tmp_child); - child = (xlator_t *)(long)tmp_child; - - STACK_WIND (frame, unify_flush_cbk, child, - child->fops->flush, fd); - - return 0; -} - - -/** - * unify_fsync_cbk - - */ -int32_t -unify_fsync_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - 
int32_t op_errno, - struct iatt *prebuf, - struct iatt *postbuf) -{ - STACK_UNWIND (frame, op_ret, op_errno, prebuf, postbuf); - return 0; -} - -/** - * unify_fsync - - */ -int32_t -unify_fsync (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - int32_t flags) -{ - UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd); - xlator_t *child = NULL; - uint64_t tmp_child = 0; - - fd_ctx_get (fd, this, &tmp_child); - child = (xlator_t *)(long)tmp_child; - - STACK_WIND (frame, unify_fsync_cbk, child, - child->fops->fsync, fd, flags); - - return 0; -} - -/** - * unify_fstat - Send fstat FOP to Namespace only if its directory, and to - * both namespace and the storage node if its a file. - */ -int32_t -unify_fstat (call_frame_t *frame, - xlator_t *this, - fd_t *fd) -{ - unify_local_t *local = NULL; - xlator_t *child = NULL; - uint64_t tmp_child = 0; - - UNIFY_CHECK_FD_AND_UNWIND_ON_ERR(fd); - - INIT_LOCAL (frame, local); - local->ia_ino = fd->inode->ino; - - if (!fd_ctx_get (fd, this, &tmp_child)) { - /* If its set, then its file */ - child = (xlator_t *)(long)tmp_child; - local->call_count = 2; - - STACK_WIND (frame, unify_buf_cbk, child, - child->fops->fstat, fd); - - STACK_WIND (frame, unify_buf_cbk, NS(this), - NS(this)->fops->fstat, fd); - - } else { - /* this is an directory */ - local->call_count = 1; - STACK_WIND (frame, unify_buf_cbk, NS(this), - NS(this)->fops->fstat, fd); - } - - return 0; -} - -/** - * unify_getdents_cbk - - */ -int32_t -unify_getdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dir_entry_t *entry, - int32_t count) -{ - STACK_UNWIND (frame, op_ret, op_errno, entry, count); - return 0; -} - -/** - * unify_getdents - send the FOP request to all the nodes. 
- */ -int32_t -unify_getdents (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - size_t size, - off_t offset, - int32_t flag) -{ - UNIFY_CHECK_FD_AND_UNWIND_ON_ERR (fd); - - STACK_WIND (frame, unify_getdents_cbk, NS(this), - NS(this)->fops->getdents, fd, size, offset, flag); - - return 0; -} - - -/** - * unify_readdir_cbk - - */ -int32_t -unify_readdir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - gf_dirent_t *buf) -{ - STACK_UNWIND (frame, op_ret, op_errno, buf); - - return 0; -} - -/** - * unify_readdir - send the FOP request to all the nodes. - */ -int32_t -unify_readdir (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - size_t size, - off_t offset) -{ - UNIFY_CHECK_FD_AND_UNWIND_ON_ERR (fd); - - STACK_WIND (frame, unify_readdir_cbk, NS(this), - NS(this)->fops->readdir, fd, size, offset); - - return 0; -} - - -int32_t -unify_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, gf_dirent_t *buf) -{ - STACK_UNWIND (frame, op_ret, op_errno, buf); - - return 0; -} - - -int32_t -unify_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset) -{ - UNIFY_CHECK_FD_AND_UNWIND_ON_ERR (fd); - - STACK_WIND (frame, unify_readdirp_cbk, NS(this), - NS(this)->fops->readdirp, fd, size, offset); - - return 0; -} - - -/** - * unify_fsyncdir_cbk - - */ -int32_t -unify_fsyncdir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - STACK_UNWIND (frame, op_ret, op_errno); - - return 0; -} - -/** - * unify_fsyncdir - - */ -int32_t -unify_fsyncdir (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - int32_t flags) -{ - UNIFY_CHECK_FD_AND_UNWIND_ON_ERR (fd); - - STACK_WIND (frame, unify_fsyncdir_cbk, - NS(this), NS(this)->fops->fsyncdir, fd, flags); - - return 0; -} - -/** - * unify_lk_cbk - UNWIND frame with the proper return arguments. 
- */ -int32_t -unify_lk_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct gf_flock *lock) -{ - STACK_UNWIND (frame, op_ret, op_errno, lock); - return 0; -} - -/** - * unify_lk - Send it to all the storage nodes, (should be 1) which has file. - */ -int32_t -unify_lk (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - int32_t cmd, - struct gf_flock *lock) -{ - UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd); - xlator_t *child = NULL; - uint64_t tmp_child = 0; - - fd_ctx_get (fd, this, &tmp_child); - child = (xlator_t *)(long)tmp_child; - - STACK_WIND (frame, unify_lk_cbk, child, - child->fops->lk, fd, cmd, lock); - - return 0; -} - - -int32_t -unify_setxattr_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno); - -static int32_t -unify_setxattr_file_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - unify_private_t *private = this->private; - unify_local_t *local = frame->local; - xlator_t *sched_xl = NULL; - struct sched_ops *sched_ops = NULL; - - if (op_ret == -1) { - if (!ENOTSUP) - gf_log (this->name, GF_LOG_ERROR, - "setxattr with XATTR_CREATE on ns: " - "path(%s) key(%s): %s", - local->loc1.path, local->name, - strerror (op_errno)); - unify_local_wipe (local); - STACK_UNWIND (frame, op_ret, op_errno); - return 0; - } - - LOCK (&frame->lock); - { - local->failed = 0; - local->op_ret = 0; - local->op_errno = 0; - local->call_count = 1; - } - UNLOCK (&frame->lock); - - /* schedule XATTR_CREATE on one of the child node */ - sched_ops = private->sched_ops; - - /* Send create request to the scheduled node now */ - sched_xl = sched_ops->schedule (this, local->name); - if (!sched_xl) { - STACK_UNWIND (frame, -1, ENOTCONN); - return 0; - } - - STACK_WIND (frame, - unify_setxattr_cbk, - sched_xl, - sched_xl->fops->setxattr, - &local->loc1, - local->dict, - local->flags); - return 0; -} - -/** - * unify_setxattr_cbk - When 
all the child nodes return, UNWIND frame. - */ -int32_t -unify_setxattr_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - int32_t callcnt = 0; - unify_local_t *local = frame->local; - call_frame_t *prev_frame = cookie; - dict_t *dict = NULL; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret == -1) { - gf_log (this->name, (((op_errno == ENOENT) || - (op_errno == ENOTSUP))? - GF_LOG_DEBUG : GF_LOG_ERROR), - "child(%s): path(%s): %s", - prev_frame->this->name, - (local->loc1.path)?local->loc1.path:"", - strerror (op_errno)); - if (local->failed == -1) { - local->failed = 1; - } - local->op_errno = op_errno; - } else { - local->failed = 0; - local->op_ret = op_ret; - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - if (local->failed && local->name && - ZR_FILE_CONTENT_REQUEST(local->name)) { - dict = get_new_dict (); - dict_set (dict, local->dict->members_list->key, - data_from_dynptr(NULL, 0)); - dict_ref (dict); - - local->call_count = 1; - - STACK_WIND (frame, - unify_setxattr_file_cbk, - NS(this), - NS(this)->fops->setxattr, - &local->loc1, - dict, - XATTR_CREATE); - - dict_unref (dict); - return 0; - } - - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, local->op_errno); - } - - return 0; -} - -/** - * unify_sexattr - This function should be sent to all the storage nodes, - * which contains the file, (excluding namespace). 
- */ -int32_t -unify_setxattr (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - dict_t *dict, - int32_t flags) -{ - unify_private_t *priv = this->private; - unify_local_t *local = NULL; - int16_t *list = NULL; - int16_t index = 0; - int32_t call_count = 0; - uint64_t tmp_list = 0; - data_pair_t *trav = dict->members_list; - - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); - - /* Initialization */ - INIT_LOCAL (frame, local); - local->failed = -1; - loc_copy (&local->loc1, loc); - - if (IA_ISDIR (loc->inode->ia_type)) { - - if (trav && trav->key && ZR_FILE_CONTENT_REQUEST(trav->key)) { - /* direct the storage xlators to change file - content only if file exists */ - local->flags = flags; - local->dict = dict; - local->name = gf_strdup (trav->key); - flags |= XATTR_REPLACE; - } - - local->call_count = priv->child_count; - for (index = 0; index < priv->child_count; index++) { - STACK_WIND (frame, - unify_setxattr_cbk, - priv->xl_array[index], - priv->xl_array[index]->fops->setxattr, - loc, dict, flags); - } - return 0; - } - - inode_ctx_get (loc->inode, this, &tmp_list); - list = (int16_t *)(long)tmp_list; - - for (index = 0; list[index] != -1; index++) { - if (NS(this) != priv->xl_array[list[index]]) { - local->call_count++; - call_count++; - } - } - - if (local->call_count) { - for (index = 0; list[index] != -1; index++) { - if (priv->xl_array[list[index]] != NS(this)) { - STACK_WIND (frame, - unify_setxattr_cbk, - priv->xl_array[list[index]], - priv->xl_array[list[index]]->fops->setxattr, - loc, - dict, - flags); - if (!--call_count) - break; - } - } - return 0; - } - - /* No entry in storage nodes */ - gf_log (this->name, GF_LOG_DEBUG, - "returning ENOENT, file not found on storage node."); - STACK_UNWIND (frame, -1, ENOENT); - - return 0; -} - - -/** - * unify_getxattr_cbk - This function is called from only one child, so, no - * need of any lock or anything else, just send it to above layer - */ -int32_t -unify_getxattr_cbk (call_frame_t *frame, - void 
*cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dict_t *value) -{ - int32_t callcnt = 0; - dict_t *local_value = NULL; - unify_local_t *local = frame->local; - call_frame_t *prev_frame = cookie; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret == -1) { - local->op_errno = op_errno; - gf_log (this->name, - (((op_errno == ENOENT) || - (op_errno == ENODATA) || - (op_errno == ENOTSUP)) ? - GF_LOG_DEBUG : GF_LOG_ERROR), - "child(%s): path(%s): %s", - prev_frame->this->name, - (local->loc1.path)?local->loc1.path:"", - strerror (op_errno)); - } else { - if (!local->dict) - local->dict = dict_ref (value); - local->op_ret = op_ret; - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - local_value = local->dict; - local->dict = NULL; - - STACK_UNWIND (frame, local->op_ret, local->op_errno, - local_value); - - if (local_value) - dict_unref (local_value); - } - - return 0; -} - - -/** - * unify_getxattr - This FOP is sent to only the storage node. - */ -int32_t -unify_getxattr (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - const char *name) -{ - unify_private_t *priv = this->private; - int16_t *list = NULL; - int16_t index = 0; - int16_t count = 0; - unify_local_t *local = NULL; - uint64_t tmp_list = 0; - - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); - INIT_LOCAL (frame, local); - - if (IA_ISDIR (loc->inode->ia_type)) { - local->call_count = priv->child_count; - for (index = 0; index < priv->child_count; index++) - STACK_WIND (frame, - unify_getxattr_cbk, - priv->xl_array[index], - priv->xl_array[index]->fops->getxattr, - loc, - name); - return 0; - } - - inode_ctx_get (loc->inode, this, &tmp_list); - list = (int16_t *)(long)tmp_list; - - for (index = 0; list[index] != -1; index++) { - if (NS(this) != priv->xl_array[list[index]]) { - local->call_count++; - count++; - } - } - - if (count) { - for (index = 0; list[index] != -1; index++) { - if (priv->xl_array[list[index]] != NS(this)) { - STACK_WIND (frame, - 
unify_getxattr_cbk, - priv->xl_array[list[index]], - priv->xl_array[list[index]]->fops->getxattr, - loc, - name); - if (!--count) - break; - } - } - } else { - dict_t *tmp_dict = get_new_dict (); - gf_log (this->name, GF_LOG_DEBUG, - "%s: returning ENODATA, no file found on storage node", - loc->path); - STACK_UNWIND (frame, -1, ENODATA, tmp_dict); - dict_destroy (tmp_dict); - } - - return 0; -} - -/** - * unify_removexattr_cbk - Wait till all the child node returns the call - * and then UNWIND to above layer. - */ -int32_t -unify_removexattr_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - int32_t callcnt = 0; - unify_local_t *local = frame->local; - call_frame_t *prev_frame = cookie; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - if (op_ret == -1) { - local->op_errno = op_errno; - if (op_errno != ENOTSUP) - gf_log (this->name, GF_LOG_ERROR, - "child(%s): path(%s): %s", - prev_frame->this->name, - local->loc1.path, strerror (op_errno)); - } else { - local->op_ret = op_ret; - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - STACK_UNWIND (frame, local->op_ret, local->op_errno); - } - - return 0; -} - -/** - * unify_removexattr - Send it to all the child nodes which has the files. 
- */ -int32_t -unify_removexattr (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - const char *name) -{ - unify_private_t *priv = this->private; - unify_local_t *local = NULL; - int16_t *list = NULL; - int16_t index = 0; - int32_t call_count = 0; - uint64_t tmp_list = 0; - - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); - - /* Initialization */ - INIT_LOCAL (frame, local); - - if (IA_ISDIR (loc->inode->ia_type)) { - local->call_count = priv->child_count; - for (index = 0; index < priv->child_count; index++) - STACK_WIND (frame, - unify_removexattr_cbk, - priv->xl_array[index], - priv->xl_array[index]->fops->removexattr, - loc, - name); - - return 0; - } - - inode_ctx_get (loc->inode, this, &tmp_list); - list = (int16_t *)(long)tmp_list; - - for (index = 0; list[index] != -1; index++) { - if (NS(this) != priv->xl_array[list[index]]) { - local->call_count++; - call_count++; - } - } - - if (local->call_count) { - for (index = 0; list[index] != -1; index++) { - if (priv->xl_array[list[index]] != NS(this)) { - STACK_WIND (frame, - unify_removexattr_cbk, - priv->xl_array[list[index]], - priv->xl_array[list[index]]->fops->removexattr, - loc, - name); - if (!--call_count) - break; - } - } - return 0; - } - - gf_log (this->name, GF_LOG_DEBUG, - "%s: returning ENOENT, not found on storage node.", loc->path); - STACK_UNWIND (frame, -1, ENOENT); - - return 0; -} - - -int32_t -unify_mknod_unlink_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *preparent, - struct iatt *postparent) -{ - unify_local_t *local = frame->local; - - if (op_ret == -1) - gf_log (this->name, GF_LOG_ERROR, - "%s: %s", local->loc1.path, strerror (op_errno)); - - unify_local_wipe (local); - /* No log required here as this -1 is for mknod call */ - STACK_UNWIND (frame, -1, local->op_errno, NULL, NULL); - return 0; -} - -/** - * unify_mknod_cbk - - */ -int32_t -unify_mknod_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - 
int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) -{ - unify_local_t *local = frame->local; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "mknod failed on storage node, sending unlink to " - "namespace"); - local->op_errno = op_errno; - STACK_WIND (frame, - unify_mknod_unlink_cbk, - NS(this), - NS(this)->fops->unlink, - &local->loc1); - return 0; - } - - local->stbuf = *buf; - local->stbuf.ia_ino = local->ia_ino; - unify_local_wipe (local); - STACK_UNWIND (frame, op_ret, op_errno, inode, &local->stbuf, - &local->oldpreparent, &local->oldpostparent); - return 0; -} - -/** - * unify_ns_mknod_cbk - - */ -int32_t -unify_ns_mknod_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) -{ - struct sched_ops *sched_ops = NULL; - xlator_t *sched_xl = NULL; - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - int16_t *list = NULL; - int16_t index = 0; - call_frame_t *prev_frame = cookie; - - if (op_ret == -1) { - /* No need to send mknod request to other servers, - * as namespace action failed - */ - gf_log (this->name, GF_LOG_ERROR, - "child(%s): path(%s): %s", - prev_frame->this->name, local->loc1.path, - strerror (op_errno)); - unify_local_wipe (local); - STACK_UNWIND (frame, op_ret, op_errno, inode, buf, - preparent, postparent); - return 0; - } - - /* Create one inode for this entry */ - local->op_ret = 0; - local->stbuf = *buf; - local->ia_ino = buf->ia_ino; - - local->oldpreparent = *preparent; - local->oldpostparent = *postparent; - - list = GF_CALLOC (1, sizeof (int16_t) * 3, gf_unify_mt_int16_t); - ERR_ABORT (list); - list[0] = priv->child_count; - list[2] = -1; - inode_ctx_put (inode, this, (uint64_t)(long)list); - - sched_ops = priv->sched_ops; - - /* Send mknod request to scheduled node now */ - sched_xl = 
sched_ops->schedule (this, local->loc1.path); - if (!sched_xl) { - gf_log (this->name, GF_LOG_ERROR, - "mknod failed on storage node, no node online " - "at the moment, sending unlink to NS"); - local->op_errno = ENOTCONN; - STACK_WIND (frame, - unify_mknod_unlink_cbk, - NS(this), - NS(this)->fops->unlink, - &local->loc1); - - return 0; - } - - for (index = 0; index < priv->child_count; index++) - if (sched_xl == priv->xl_array[index]) - break; - list[1] = index; - - STACK_WIND (frame, unify_mknod_cbk, - sched_xl, sched_xl->fops->mknod, - &local->loc1, local->mode, local->dev); - - return 0; -} - -/** - * unify_mknod - Create a device on namespace first, and later create on - * the storage node. - */ -int32_t -unify_mknod (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - mode_t mode, - dev_t rdev) -{ - unify_local_t *local = NULL; - - /* Initialization */ - INIT_LOCAL (frame, local); - local->mode = mode; - local->dev = rdev; - loc_copy (&local->loc1, loc); - if (local->loc1.path == NULL) { - gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O"); - STACK_UNWIND (frame, -1, ENOMEM, loc->inode, NULL); - return 0; - } - - STACK_WIND (frame, - unify_ns_mknod_cbk, - NS(this), - NS(this)->fops->mknod, - loc, - mode, - rdev); - - return 0; -} - -int32_t -unify_symlink_unlink_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *preparent, - struct iatt *postparent) -{ - unify_local_t *local = frame->local; - if (op_ret == -1) - gf_log (this->name, GF_LOG_ERROR, - "%s: %s", local->loc1.path, strerror (op_errno)); - - unify_local_wipe (local); - STACK_UNWIND (frame, -1, local->op_errno, NULL, NULL); - return 0; -} - -/** - * unify_symlink_cbk - - */ -int32_t -unify_symlink_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) -{ - unify_local_t *local = frame->local; - - 
if (op_ret == -1) { - /* Symlink on storage node failed, hence send unlink - to the NS node */ - local->op_errno = op_errno; - gf_log (this->name, GF_LOG_ERROR, - "symlink on storage node failed, sending unlink " - "to namespace"); - - STACK_WIND (frame, - unify_symlink_unlink_cbk, - NS(this), - NS(this)->fops->unlink, - &local->loc1); - - return 0; - } - - local->stbuf = *buf; - local->stbuf.ia_ino = local->ia_ino; - unify_local_wipe (local); - STACK_UNWIND (frame, op_ret, op_errno, inode, &local->stbuf, - &local->oldpreparent, &local->oldpostparent); - - return 0; -} - -/** - * unify_ns_symlink_cbk - - */ -int32_t -unify_ns_symlink_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) -{ - - struct sched_ops *sched_ops = NULL; - xlator_t *sched_xl = NULL; - int16_t *list = NULL; - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - int16_t index = 0; - - if (op_ret == -1) { - /* No need to send symlink request to other servers, - * as namespace action failed - */ - gf_log (this->name, GF_LOG_ERROR, - "namespace: path(%s): %s", - local->loc1.path, strerror (op_errno)); - unify_local_wipe (local); - STACK_UNWIND (frame, op_ret, op_errno, NULL, buf, - preparent, postparent); - return 0; - } - - /* Create one inode for this entry */ - local->op_ret = 0; - local->ia_ino = buf->ia_ino; - - local->oldpreparent = *preparent; - local->oldpostparent = *postparent; - - /* Start the mapping list */ - - list = GF_CALLOC (1, sizeof (int16_t) * 3, gf_unify_mt_int16_t); - ERR_ABORT (list); - list[0] = priv->child_count; //namespace's index - list[2] = -1; - inode_ctx_put (inode, this, (uint64_t)(long)list); - - sched_ops = priv->sched_ops; - - /* Send symlink request to all the nodes now */ - sched_xl = sched_ops->schedule (this, local->loc1.path); - if (!sched_xl) { - /* Symlink on storage node failed, hence send 
unlink - to the NS node */ - local->op_errno = ENOTCONN; - gf_log (this->name, GF_LOG_ERROR, - "symlink on storage node failed, no node online, " - "sending unlink to namespace"); - - STACK_WIND (frame, - unify_symlink_unlink_cbk, - NS(this), - NS(this)->fops->unlink, - &local->loc1); - - return 0; - } - - for (index = 0; index < priv->child_count; index++) - if (sched_xl == priv->xl_array[index]) - break; - list[1] = index; - - STACK_WIND (frame, - unify_symlink_cbk, - sched_xl, - sched_xl->fops->symlink, - local->name, - &local->loc1); - - return 0; -} - -/** - * unify_symlink - - */ -int32_t -unify_symlink (call_frame_t *frame, - xlator_t *this, - const char *linkpath, - loc_t *loc) -{ - unify_local_t *local = NULL; - - /* Initialization */ - INIT_LOCAL (frame, local); - loc_copy (&local->loc1, loc); - local->name = gf_strdup (linkpath); - - if ((local->name == NULL) || - (local->loc1.path == NULL)) { - gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O"); - STACK_UNWIND (frame, -1, ENOMEM, loc->inode, NULL); - return 0; - } - - STACK_WIND (frame, - unify_ns_symlink_cbk, - NS(this), - NS(this)->fops->symlink, - linkpath, - loc); - - return 0; -} - - -int32_t -unify_rename_unlink_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *preparent, - struct iatt *postparent) -{ - int32_t callcnt = 0; - unify_local_t *local = frame->local; - call_frame_t *prev_frame = cookie; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "child(%s): path(%s -> %s): %s", - prev_frame->this->name, - local->loc1.path, local->loc2.path, - strerror (op_errno)); - - } - LOCK (&frame->lock); - { - callcnt = --local->call_count; - } - UNLOCK (&frame->lock); - - if (!callcnt) { - local->stbuf.ia_ino = local->ia_ino; - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, local->op_errno, - &local->stbuf); - } - return 0; -} - -int32_t -unify_ns_rename_undo_cbk (call_frame_t *frame, - void *cookie, - 
xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *buf, - struct iatt *preoldparent, - struct iatt *postoldparent, - struct iatt *prenewparent, - struct iatt *postnewparent) -{ - unify_local_t *local = frame->local; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "namespace: path(%s -> %s): %s", - local->loc1.path, local->loc2.path, - strerror (op_errno)); - } - - local->stbuf.ia_ino = local->ia_ino; - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, local->op_errno, &local->stbuf); - return 0; -} - -int32_t -unify_rename_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *buf, - struct iatt *preoldparent, - struct iatt *postoldparent, - struct iatt *prenewparent, - struct iatt *postnewparent) -{ - int32_t index = 0; - int32_t callcnt = 0; - int16_t *list = NULL; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - call_frame_t *prev_frame = cookie; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - if (op_ret >= 0) { - if (!IA_ISDIR (buf->ia_type)) - local->stbuf = *buf; - local->op_ret = op_ret; - } else { - gf_log (this->name, GF_LOG_ERROR, - "child(%s): path(%s -> %s): %s", - prev_frame->this->name, - local->loc1.path, local->loc2.path, - strerror (op_errno)); - local->op_errno = op_errno; - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - local->stbuf.ia_ino = local->ia_ino; - if (IA_ISDIR (local->loc1.inode->ia_type)) { - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, local->op_errno, - &local->stbuf, &local->oldpreparent, - &local->oldpostparent, &local->newpreparent, - &local->newpostparent); - return 0; - } - - if (local->op_ret == -1) { - /* TODO: check this logic */ - - /* Rename failed in storage node, successful on NS, - * hence, rename back the entries in NS */ - /* NOTE: this will be done only if the destination - * doesn't exists, if the destination exists, the - * job of 
correcting NS is left to self-heal - */ - if (!local->index) { - loc_t tmp_oldloc = { - /* its actual 'newloc->path' */ - .path = local->loc2.path, - .inode = local->loc1.inode, - .parent = local->loc2.parent - }; - - loc_t tmp_newloc = { - /* Actual 'oldloc->path' */ - .path = local->loc1.path, - .parent = local->loc1.parent - }; - - gf_log (this->name, GF_LOG_ERROR, - "rename succussful on namespace, on " - "stroage node failed, reverting back"); - - STACK_WIND (frame, - unify_ns_rename_undo_cbk, - NS(this), - NS(this)->fops->rename, - &tmp_oldloc, - &tmp_newloc); - return 0; - } - } else { - /* Rename successful on storage nodes */ - - int32_t idx = 0; - int16_t *tmp_list = NULL; - uint64_t tmp_list_int64 = 0; - if (local->loc2.inode) { - inode_ctx_get (local->loc2.inode, - this, &tmp_list_int64); - list = (int16_t *)(long)tmp_list_int64; - - } - - if (list) { - for (index = 0; list[index] != -1; index++); - tmp_list = GF_CALLOC (1, index * 2, - gf_unify_mt_int16_t); - memcpy (tmp_list, list, index * 2); - - for (index = 0; list[index] != -1; index++) { - /* TODO: Check this logic. 
*/ - /* If the destination file exists in - * the same storage node where we sent - * 'rename' call, no need to send - * unlink - */ - for (idx = 0; - local->list[idx] != -1; idx++) { - if (tmp_list[index] == local->list[idx]) { - tmp_list[index] = priv->child_count; - continue; - } - } - - if (NS(this) != priv->xl_array[tmp_list[index]]) { - local->call_count++; - callcnt++; - } - } - - if (local->call_count) { - if (callcnt > 1) - gf_log (this->name, - GF_LOG_ERROR, - "%s->%s: more (%d) " - "subvolumes have the " - "newloc entry", - local->loc1.path, - local->loc2.path, - callcnt); - - for (index=0; - tmp_list[index] != -1; index++) { - if (NS(this) != priv->xl_array[tmp_list[index]]) { - STACK_WIND (frame, - unify_rename_unlink_cbk, - priv->xl_array[tmp_list[index]], - priv->xl_array[tmp_list[index]]->fops->unlink, - &local->loc2); - if (!--callcnt) - break; - } - } - - GF_FREE (tmp_list); - return 0; - } - if (tmp_list) - GF_FREE (tmp_list); - } - } - - /* Need not send 'unlink' to storage node */ - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, - local->op_errno, &local->stbuf, - &local->oldpreparent, &local->oldpostparent, - &local->newpreparent, &local->newpostparent); - } - - return 0; -} - -int32_t -unify_ns_rename_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *buf, - struct iatt *preoldparent, - struct iatt *postoldparent, - struct iatt *prenewparent, - struct iatt *postnewparent) -{ - int32_t index = 0; - int32_t callcnt = 0; - int16_t *list = NULL; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - - if (op_ret == -1) { - /* Free local->new_inode */ - gf_log (this->name, GF_LOG_ERROR, - "namespace: path(%s -> %s): %s", - local->loc1.path, local->loc2.path, - strerror (op_errno)); - - unify_local_wipe (local); - STACK_UNWIND (frame, op_ret, op_errno, buf, - preoldparent, postoldparent, - prenewparent, postnewparent); - return 0; - } - - 
local->stbuf = *buf; - local->ia_ino = buf->ia_ino; - - local->oldpreparent = *preoldparent; - local->oldpostparent = *postoldparent; - local->newpreparent = *prenewparent; - local->newpostparent = *postnewparent; - - /* Everything is fine. */ - if (IA_ISDIR (buf->ia_type)) { - local->call_count = priv->child_count; - for (index=0; index < priv->child_count; index++) { - STACK_WIND (frame, - unify_rename_cbk, - priv->xl_array[index], - priv->xl_array[index]->fops->rename, - &local->loc1, - &local->loc2); - } - - return 0; - } - - local->call_count = 0; - /* send rename */ - list = local->list; - for (index=0; list[index] != -1; index++) { - if (NS(this) != priv->xl_array[list[index]]) { - local->call_count++; - callcnt++; - } - } - - if (local->call_count) { - for (index=0; list[index] != -1; index++) { - if (NS(this) != priv->xl_array[list[index]]) { - STACK_WIND (frame, - unify_rename_cbk, - priv->xl_array[list[index]], - priv->xl_array[list[index]]->fops->rename, - &local->loc1, - &local->loc2); - if (!--callcnt) - break; - } - } - } else { - /* file doesn't seem to be present in storage nodes */ - gf_log (this->name, GF_LOG_CRITICAL, - "CRITICAL: source file not in storage node, " - "rename successful on namespace :O"); - unify_local_wipe (local); - STACK_UNWIND (frame, -1, EIO, NULL, - NULL, NULL, /* preoldparent, postoldparent */ - NULL, NULL); /* prenewparent, postnewparent */ - } - return 0; -} - - -/** - * unify_rename - One of the tricky function. 
The deadliest of all :O - */ -int32_t -unify_rename (call_frame_t *frame, - xlator_t *this, - loc_t *oldloc, - loc_t *newloc) -{ - unify_local_t *local = NULL; - uint64_t tmp_list = 0; - - /* Initialization */ - INIT_LOCAL (frame, local); - loc_copy (&local->loc1, oldloc); - loc_copy (&local->loc2, newloc); - - if ((local->loc1.path == NULL) || - (local->loc2.path == NULL)) { - gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O"); - STACK_UNWIND (frame, -1, ENOMEM, NULL, - NULL, NULL, /* preoldparent, postoldparent */ - NULL, NULL); /* prenewparent, postnewparent */ - return 0; - } - - inode_ctx_get (oldloc->inode, this, &tmp_list); - local->list = (int16_t *)(long)tmp_list; - - STACK_WIND (frame, - unify_ns_rename_cbk, - NS(this), - NS(this)->fops->rename, - oldloc, - newloc); - return 0; -} - -/** - * unify_link_cbk - - */ -int32_t -unify_link_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) -{ - unify_local_t *local = frame->local; - - if (op_ret >= 0) - local->stbuf = *buf; - local->stbuf.ia_ino = local->ia_ino; - - unify_local_wipe (local); - STACK_UNWIND (frame, op_ret, op_errno, inode, &local->stbuf, - &local->oldpreparent, &local->oldpostparent); - - return 0; -} - -/** - * unify_ns_link_cbk - - */ -int32_t -unify_ns_link_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) -{ - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - int16_t *list = local->list; - int16_t index = 0; - - if (op_ret == -1) { - /* No need to send link request to other servers, - * as namespace action failed - */ - gf_log (this->name, GF_LOG_ERROR, - "namespace: path(%s -> %s): %s", - local->loc1.path, local->loc2.path, - strerror (op_errno)); - unify_local_wipe (local); - 
STACK_UNWIND (frame, op_ret, op_errno, inode, buf, - preparent, postparent); - return 0; - } - - /* Update inode for this entry */ - local->op_ret = 0; - local->ia_ino = buf->ia_ino; - - local->oldpreparent = *preparent; - local->oldpostparent = *postparent; - - /* Send link request to the node now */ - for (index = 0; list[index] != -1; index++) { - char need_break = (list[index+1] == -1); - if (priv->xl_array[list[index]] != NS (this)) { - STACK_WIND (frame, - unify_link_cbk, - priv->xl_array[list[index]], - priv->xl_array[list[index]]->fops->link, - &local->loc1, - &local->loc2); - break; - } - if (need_break) - break; - } - - return 0; -} - -/** - * unify_link - - */ -int32_t -unify_link (call_frame_t *frame, - xlator_t *this, - loc_t *oldloc, - loc_t *newloc) -{ - unify_local_t *local = NULL; - uint64_t tmp_list = 0; - - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (oldloc); - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (newloc); - - /* Initialization */ - INIT_LOCAL (frame, local); - - loc_copy (&local->loc1, oldloc); - loc_copy (&local->loc2, newloc); - - inode_ctx_get (oldloc->inode, this, &tmp_list); - local->list = (int16_t *)(long)tmp_list; - - STACK_WIND (frame, - unify_ns_link_cbk, - NS(this), - NS(this)->fops->link, - oldloc, - newloc); - - return 0; -} - - -/** - * unify_checksum_cbk - - */ -int32_t -unify_checksum_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - uint8_t *fchecksum, - uint8_t *dchecksum) -{ - STACK_UNWIND (frame, op_ret, op_errno, fchecksum, dchecksum); - - return 0; -} - -/** - * unify_checksum - - */ -int32_t -unify_checksum (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - int32_t flag) -{ - STACK_WIND (frame, - unify_checksum_cbk, - NS(this), - NS(this)->fops->checksum, - loc, - flag); - - return 0; -} - - -/** - * unify_finodelk_cbk - - */ -int -unify_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - STACK_UNWIND (frame, 
op_ret, op_errno); - return 0; -} - -/** - * unify_finodelk - */ -int -unify_finodelk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, int cmd, struct gf_flock *flock) -{ - UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd); - xlator_t *child = NULL; - uint64_t tmp_child = 0; - - fd_ctx_get (fd, this, &tmp_child); - child = (xlator_t *)(long)tmp_child; - - STACK_WIND (frame, unify_finodelk_cbk, - child, child->fops->finodelk, - volume, fd, cmd, flock); - - return 0; -} - - - -/** - * unify_fentrylk_cbk - - */ -int -unify_fentrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - STACK_UNWIND (frame, op_ret, op_errno); - return 0; -} - -/** - * unify_fentrylk - */ -int -unify_fentrylk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, const char *basename, - entrylk_cmd cmd, entrylk_type type) - -{ - UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd); - xlator_t *child = NULL; - uint64_t tmp_child = 0; - - fd_ctx_get (fd, this, &tmp_child); - child = (xlator_t *)(long)tmp_child; - - STACK_WIND (frame, unify_fentrylk_cbk, - child, child->fops->fentrylk, - volume, fd, basename, cmd, type); - - return 0; -} - - - -/** - * unify_fxattrop_cbk - - */ -int -unify_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xattr) -{ - STACK_UNWIND (frame, op_ret, op_errno, xattr); - return 0; -} - -/** - * unify_fxattrop - */ -int -unify_fxattrop (call_frame_t *frame, xlator_t *this, - fd_t *fd, gf_xattrop_flags_t optype, dict_t *xattr) -{ - UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd); - xlator_t *child = NULL; - uint64_t tmp_child = 0; - - fd_ctx_get (fd, this, &tmp_child); - child = (xlator_t *)(long)tmp_child; - - STACK_WIND (frame, unify_fxattrop_cbk, - child, child->fops->fxattrop, - fd, optype, xattr); - - return 0; -} - - -/** - * unify_inodelk_cbk - - */ -int -unify_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, 
int32_t op_errno) -{ - STACK_UNWIND (frame, op_ret, op_errno); - return 0; -} - - -/** - * unify_inodelk - */ -int -unify_inodelk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, int cmd, struct gf_flock *flock) -{ - xlator_t *child = NULL; - - child = unify_loc_subvol (loc, this); - - STACK_WIND (frame, unify_inodelk_cbk, - child, child->fops->inodelk, - volume, loc, cmd, flock); - - return 0; -} - - - -/** - * unify_entrylk_cbk - - */ -int -unify_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - STACK_UNWIND (frame, op_ret, op_errno); - return 0; -} - -/** - * unify_entrylk - */ -int -unify_entrylk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, const char *basename, - entrylk_cmd cmd, entrylk_type type) - -{ - xlator_t *child = NULL; - - child = unify_loc_subvol (loc, this); - - STACK_WIND (frame, unify_entrylk_cbk, - child, child->fops->entrylk, - volume, loc, basename, cmd, type); - - return 0; -} - - - -/** - * unify_xattrop_cbk - - */ -int -unify_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xattr) -{ - STACK_UNWIND (frame, op_ret, op_errno, xattr); - return 0; -} - -/** - * unify_xattrop - */ -int -unify_xattrop (call_frame_t *frame, xlator_t *this, - loc_t *loc, gf_xattrop_flags_t optype, dict_t *xattr) -{ - xlator_t *child = NULL; - - child = unify_loc_subvol (loc, this); - - STACK_WIND (frame, unify_xattrop_cbk, - child, child->fops->xattrop, - loc, optype, xattr); - - return 0; -} - -int -unify_forget (xlator_t *this, - inode_t *inode) -{ - int16_t *list = NULL; - uint64_t tmp_list = 0; - - if (inode->ia_type && (!IA_ISDIR(inode->ia_type))) { - inode_ctx_get (inode, this, &tmp_list); - if (tmp_list) { - list = (int16_t *)(long)tmp_list; - GF_FREE (list); - } - } - - return 0; -} - -/** - * notify - */ -int32_t -notify (xlator_t *this, - int32_t event, - void *data, - ...) 
-{ - unify_private_t *priv = this->private; - struct sched_ops *sched = NULL; - - if (!priv) { - return 0; - } - - sched = priv->sched_ops; - if (!sched) { - gf_log (this->name, GF_LOG_CRITICAL, "No scheduler :O"); - raise (SIGTERM); - return 0; - } - if (priv->namespace == data) { - if (event == GF_EVENT_CHILD_UP) { - sched->notify (this, event, data); - } - return 0; - } - - switch (event) - { - case GF_EVENT_CHILD_UP: - { - /* Call scheduler's update () to enable it for scheduling */ - sched->notify (this, event, data); - - LOCK (&priv->lock); - { - /* Increment the inode's generation, which is - used for self_heal */ - ++priv->inode_generation; - ++priv->num_child_up; - } - UNLOCK (&priv->lock); - - if (!priv->is_up) { - default_notify (this, event, data); - priv->is_up = 1; - } - } - break; - case GF_EVENT_CHILD_DOWN: - { - /* Call scheduler's update () to disable the child node - * for scheduling - */ - sched->notify (this, event, data); - LOCK (&priv->lock); - { - --priv->num_child_up; - } - UNLOCK (&priv->lock); - - if (priv->num_child_up == 0) { - /* Send CHILD_DOWN to upper layer */ - default_notify (this, event, data); - priv->is_up = 0; - } - } - break; - - default: - { - default_notify (this, event, data); - } - break; - } - - return 0; -} - -int32_t -mem_acct_init (xlator_t *this) -{ - int ret = -1; - - if (!this) - return ret; - - ret = xlator_mem_acct_init (this, gf_unify_mt_end + 1); - - if (ret != 0) { - gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" - "failed"); - return ret; - } - - return ret; -} - -/** - * init - This function is called first in the xlator, while initializing. - * All the config file options are checked and appropriate flags are set. 
- * - * @this - - */ -int32_t -init (xlator_t *this) -{ - int32_t ret = 0; - int32_t count = 0; - data_t *scheduler = NULL; - data_t *data = NULL; - xlator_t *ns_xl = NULL; - xlator_list_t *trav = NULL; - xlator_list_t *xlparent = NULL; - xlator_list_t *parent = NULL; - unify_private_t *_private = NULL; - - - /* Check for number of child nodes, if there is no child nodes, exit */ - if (!this->children) { - gf_log (this->name, GF_LOG_ERROR, - "No child nodes specified. check \"subvolumes \" " - "option in volfile"); - return -1; - } - - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. check volfile "); - } - - /* Check for 'scheduler' in volume */ - scheduler = dict_get (this->options, "scheduler"); - if (!scheduler) { - gf_log (this->name, GF_LOG_ERROR, - "\"option scheduler <x>\" is missing in volfile"); - return -1; - } - - /* Setting "option namespace <node>" */ - data = dict_get (this->options, "namespace"); - if(!data) { - gf_log (this->name, GF_LOG_CRITICAL, - "namespace option not specified, Exiting"); - return -1; - } - /* Search namespace in the child node, if found, exit */ - trav = this->children; - while (trav) { - if (strcmp (trav->xlator->name, data->data) == 0) - break; - trav = trav->next; - } - if (trav) { - gf_log (this->name, GF_LOG_CRITICAL, - "namespace node used as a subvolume, Exiting"); - return -1; - } - - /* Search for the namespace node, if found, continue */ - ns_xl = this->next; - while (ns_xl) { - if (strcmp (ns_xl->name, data->data) == 0) - break; - ns_xl = ns_xl->next; - } - if (!ns_xl) { - gf_log (this->name, GF_LOG_CRITICAL, - "namespace node not found in volfile, Exiting"); - return -1; - } - - gf_log (this->name, GF_LOG_DEBUG, - "namespace node specified as %s", data->data); - - _private = GF_CALLOC (1, sizeof (*_private), - gf_unify_mt_unify_private_t); - ERR_ABORT (_private); - _private->sched_ops = get_scheduler (this, scheduler->data); - if (!_private->sched_ops) { - gf_log (this->name, 
GF_LOG_CRITICAL, - "Error while loading scheduler. Exiting"); - GF_FREE (_private); - return -1; - } - - if (ns_xl->parents) { - gf_log (this->name, GF_LOG_CRITICAL, - "Namespace node should not be a child of any other node. Exiting"); - GF_FREE (_private); - return -1; - } - - _private->namespace = ns_xl; - - /* update _private structure */ - { - count = 0; - trav = this->children; - /* Get the number of child count */ - while (trav) { - count++; - trav = trav->next; - } - - gf_log (this->name, GF_LOG_DEBUG, - "Child node count is %d", count); - - _private->child_count = count; - if (count == 1) { - /* TODO: Should I error out here? */ - gf_log (this->name, GF_LOG_CRITICAL, - "WARNING: You have defined only one " - "\"subvolumes\" for unify volume. It may not " - "be the desired config, review your volume " - "volfile. If this is how you are testing it," - " you may hit some performance penalty"); - } - - _private->xl_array = GF_CALLOC (1, - sizeof (xlator_t) * (count + 1), - gf_unify_mt_xlator_t); - ERR_ABORT (_private->xl_array); - - count = 0; - trav = this->children; - while (trav) { - _private->xl_array[count++] = trav->xlator; - trav = trav->next; - } - _private->xl_array[count] = _private->namespace; - - /* self-heal part, start with generation '1' */ - _private->inode_generation = 1; - /* Because, Foreground part is tested well */ - _private->self_heal = ZR_UNIFY_FG_SELF_HEAL; - data = dict_get (this->options, "self-heal"); - if (data) { - if (strcasecmp (data->data, "off") == 0) - _private->self_heal = ZR_UNIFY_SELF_HEAL_OFF; - - if (strcasecmp (data->data, "foreground") == 0) - _private->self_heal = ZR_UNIFY_FG_SELF_HEAL; - - if (strcasecmp (data->data, "background") == 0) - _private->self_heal = ZR_UNIFY_BG_SELF_HEAL; - } - - /* optimist - ask bulde for more about it */ - data = dict_get (this->options, "optimist"); - if (data) { - if (gf_string2boolean (data->data, - &_private->optimist) == -1) { - gf_log (this->name, GF_LOG_ERROR, - "optimist excepts 
only boolean " - "options"); - } - } - - LOCK_INIT (&_private->lock); - } - - /* Now that everything is fine. */ - this->private = (void *)_private; - { - ret = _private->sched_ops->mem_acct_init (this); - - if (ret == -1) { - return -1; - } - - /* Initialize scheduler, if everything else is successful */ - ret = _private->sched_ops->init (this); - if (ret == -1) { - gf_log (this->name, GF_LOG_CRITICAL, - "Initializing scheduler failed, Exiting"); - GF_FREE (_private); - return -1; - } - - - ret = 0; - - /* This section is required because some fops may look - * for 'xl->parent' variable - */ - xlparent = GF_CALLOC (1, sizeof (*xlparent), - gf_unify_mt_xlator_list_t); - xlparent->xlator = this; - if (!ns_xl->parents) { - ns_xl->parents = xlparent; - } else { - parent = ns_xl->parents; - while (parent->next) - parent = parent->next; - parent->next = xlparent; - } - } - - /* Tell namespace node that init is done */ - xlator_notify (ns_xl, GF_EVENT_PARENT_UP, this); - - return 0; -} - -/** - * fini - Free all the allocated memory - */ -void -fini (xlator_t *this) -{ - unify_private_t *priv = this->private; - priv->sched_ops->fini (this); - this->private = NULL; - LOCK_DESTROY (&priv->lock); - GF_FREE (priv->xl_array); - GF_FREE (priv); - return; -} - - -struct xlator_fops fops = { - .stat = unify_stat, - .readlink = unify_readlink, - .mknod = unify_mknod, - .mkdir = unify_mkdir, - .unlink = unify_unlink, - .rmdir = unify_rmdir, - .symlink = unify_symlink, - .rename = unify_rename, - .link = unify_link, - .truncate = unify_truncate, - .create = unify_create, - .open = unify_open, - .readv = unify_readv, - .writev = unify_writev, - .statfs = unify_statfs, - .flush = unify_flush, - .fsync = unify_fsync, - .setxattr = unify_setxattr, - .getxattr = unify_getxattr, - .removexattr = unify_removexattr, - .opendir = unify_opendir, - .readdir = unify_readdir, - .readdirp = unify_readdirp, - .fsyncdir = unify_fsyncdir, - .access = unify_access, - .ftruncate = unify_ftruncate, - 
.fstat = unify_fstat, - .lk = unify_lk, - .lookup = unify_lookup, - .getdents = unify_getdents, - .checksum = unify_checksum, - .inodelk = unify_inodelk, - .finodelk = unify_finodelk, - .entrylk = unify_entrylk, - .fentrylk = unify_fentrylk, - .xattrop = unify_xattrop, - .fxattrop = unify_fxattrop, - .setattr = unify_setattr, - .fsetattr = unify_fsetattr, -}; - - -struct xlator_cbks cbks = { - .forget = unify_forget, -}; - -struct volume_options options[] = { - { .key = { "namespace" }, - .type = GF_OPTION_TYPE_XLATOR - }, - { .key = { "scheduler" }, - .value = { "alu", "rr", "random", "nufa", "switch" }, - .type = GF_OPTION_TYPE_STR - }, - { .key = {"self-heal"}, - .value = { "foreground", "background", "off" }, - .type = GF_OPTION_TYPE_STR - }, - /* TODO: remove it some time later */ - { .key = {"optimist"}, - .type = GF_OPTION_TYPE_BOOL - }, - - { .key = {NULL} }, -}; diff --git a/xlators/cluster/unify/src/unify.h b/xlators/cluster/unify/src/unify.h deleted file mode 100644 index dbd5e44a2..000000000 --- a/xlators/cluster/unify/src/unify.h +++ /dev/null @@ -1,146 +0,0 @@ -/* - Copyright (c) 2006-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. 
-*/ - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#ifndef _UNIFY_H -#define _UNIFY_H - -#include "scheduler.h" -#include "list.h" -#include "unify-mem-types.h" - -#define MAX_DIR_ENTRY_STRING (32 * 1024) - -#define ZR_UNIFY_SELF_HEAL_OFF 0 -#define ZR_UNIFY_FG_SELF_HEAL 1 -#define ZR_UNIFY_BG_SELF_HEAL 2 - -/* Sometimes one should use completely random numbers.. its good :p */ -#define UNIFY_SELF_HEAL_GETDENTS_COUNT 512 - -#define NS(xl) (((unify_private_t *)xl->private)->namespace) - -/* This is used to allocate memory for local structure */ -#define INIT_LOCAL(fr, loc) \ -do { \ - loc = GF_CALLOC (1, sizeof (unify_local_t), gf_unify_mt_unify_local_t); \ - ERR_ABORT (loc); \ - if (!loc) { \ - STACK_UNWIND (fr, -1, ENOMEM); \ - return 0; \ - } \ - fr->local = loc; \ - loc->op_ret = -1; \ - loc->op_errno = ENOENT; \ -} while (0) - - - -struct unify_private { - /* Update this structure depending on requirement */ - void *scheduler; /* THIS SHOULD BE THE FIRST VARIABLE, - if xlator is using scheduler */ - struct sched_ops *sched_ops; /* Scheduler options */ - xlator_t *namespace; /* ptr to namespace xlator */ - xlator_t **xl_array; - gf_boolean_t optimist; - int16_t child_count; - int16_t num_child_up; - uint8_t self_heal; - uint8_t is_up; - uint64_t inode_generation; - gf_lock_t lock; -}; -typedef struct unify_private unify_private_t; - -struct unify_self_heal_struct { - uint8_t dir_checksum[NAME_MAX]; - uint8_t ns_dir_checksum[NAME_MAX]; - uint8_t file_checksum[NAME_MAX]; - uint8_t ns_file_checksum[NAME_MAX]; - off_t *offset_list; - int *count_list; - dir_entry_t **entry_list; -}; - - -struct _unify_local_t { - int32_t call_count; - int32_t op_ret; - int32_t op_errno; - mode_t mode; - off_t offset; - dev_t dev; - uid_t uid; - gid_t gid; - int32_t flags; - int32_t entry_count; - int32_t count; // dir_entry_t count; - fd_t *fd; - struct iatt stbuf; - struct iatt stpre; - struct iatt stpost; - struct statvfs statvfs_buf; - struct timespec 
tv[2]; - char *name; - int32_t revalidate; - - ino_t ia_ino; - nlink_t ia_nlink; - - dict_t *dict; - - int16_t *list; - int16_t *new_list; /* Used only in case of rename */ - int16_t index; - - int32_t failed; - int32_t return_eio; /* Used in case of different st-mode - present for a given path */ - - uint64_t inode_generation; /* used to store the per directory - * inode_generation. Got from inode's ctx - * of directory inodes - */ - - struct unify_self_heal_struct *sh_struct; - loc_t loc1, loc2; - - struct iatt poststbuf; - /* When not used for rename, old* - * are used as the attrs for the current - * parent directory. - */ - struct iatt oldpreparent; - struct iatt oldpostparent; - struct iatt newpreparent; - struct iatt newpostparent; - int32_t wbflags; -}; -typedef struct _unify_local_t unify_local_t; - -int32_t zr_unify_self_heal (call_frame_t *frame, - xlator_t *this, - unify_local_t *local); - -#endif /* _UNIFY_H */ |
