diff options
Diffstat (limited to 'xlators/cluster')
68 files changed, 21475 insertions, 15809 deletions
diff --git a/xlators/cluster/afr/src/Makefile.am b/xlators/cluster/afr/src/Makefile.am index 16ed25af1..35d18a6c0 100644 --- a/xlators/cluster/afr/src/Makefile.am +++ b/xlators/cluster/afr/src/Makefile.am @@ -1,21 +1,31 @@ xlator_LTLIBRARIES = afr.la pump.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster -afr_common_source = afr-dir-read.c afr-dir-write.c afr-inode-read.c afr-inode-write.c afr-open.c afr-transaction.c afr-self-heal-data.c afr-self-heal-common.c afr-self-heal-metadata.c afr-self-heal-entry.c afr-self-heal-algorithm.c afr-lk-common.c afr-self-heald.c $(top_builddir)/xlators/lib/src/libxlator.c +afr_common_source = afr-dir-read.c afr-dir-write.c afr-inode-read.c \ + afr-inode-write.c afr-open.c afr-transaction.c afr-self-heal-data.c \ + afr-self-heal-common.c afr-self-heal-metadata.c afr-self-heal-entry.c \ + afr-self-heal-algorithm.c afr-lk-common.c afr-self-heald.c \ + $(top_builddir)/xlators/lib/src/libxlator.c -afr_la_LDFLAGS = -module -avoidversion +afr_la_LDFLAGS = -module -avoid-version afr_la_SOURCES = $(afr_common_source) afr.c afr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -pump_la_LDFLAGS = -module -avoidversion +pump_la_LDFLAGS = -module -avoid-version pump_la_SOURCES = $(afr_common_source) pump.c pump_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -noinst_HEADERS = afr.h afr-transaction.h afr-inode-write.h afr-inode-read.h afr-dir-read.h afr-dir-write.h afr-self-heal.h afr-self-heal-common.h afr-self-heal-algorithm.h pump.h afr-mem-types.h afr-common.c afr-self-heald.h $(top_builddir)/xlators/lib/src/libxlator.h $(top_builddir)/glusterfsd/src/glusterfsd.h +noinst_HEADERS = afr.h afr-transaction.h afr-inode-write.h afr-inode-read.h \ + afr-dir-read.h afr-dir-write.h afr-self-heal.h afr-self-heal-common.h \ + afr-self-heal-algorithm.h pump.h afr-mem-types.h afr-common.c \ + afr-self-heald.h $(top_builddir)/xlators/lib/src/libxlator.h \ + $(top_builddir)/glusterfsd/src/glusterfsd.h 
-AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ - -I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/contrib/md5 -shared -nostartfiles $(GF_CFLAGS) \ - -I$(top_srcdir)/xlators/lib/src +AM_CPPFLAGS = $(GF_CPPFLAGS) \ + -I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/xlators/lib/src \ + -I$(top_srcdir)/rpc/rpc-lib/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) CLEANFILES = diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 5c085b611..af01f2ef2 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ #include <libgen.h> @@ -44,6 +35,7 @@ #include "compat.h" #include "byte-order.h" #include "statedump.h" +#include "inode.h" #include "fd.h" @@ -57,10 +49,9 @@ #include "afr-self-heald.h" #include "pump.h" -#define AFR_ICTX_OPENDIR_DONE_MASK 0x0000000200000000ULL -#define AFR_ICTX_SPLIT_BRAIN_MASK 0x0000000100000000ULL +#define AFR_ICTX_OPENDIR_DONE_MASK 0x0000000100000000ULL #define AFR_ICTX_READ_CHILD_MASK 0x00000000FFFFFFFFULL - +#define AFR_STATISTICS_HISTORY_SIZE 50 int afr_lookup_done_success_action (call_frame_t *frame, xlator_t *this, gf_boolean_t fail_conflict); @@ -91,6 +82,11 @@ afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req, const char *path) path, priv->pending_key[i]); /* 3 = data+metadata+entry */ } + ret = dict_set_int32 (xattr_req, GF_GFIDLESS_LOOKUP, 1); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, "%s: failed to set gfidless " + "lookup", path); + } } int @@ -122,6 +118,13 @@ afr_lookup_xattr_req_prepare (afr_local_t *local, xlator_t *this, loc->path, GLUSTERFS_ENTRYLK_COUNT); } + ret = dict_set_uint32 (local->xattr_req, GLUSTERFS_PARENT_ENTRYLK, 0); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "%s: Unable to set dict value for %s", + loc->path, GLUSTERFS_PARENT_ENTRYLK); + } + ret = dict_get_ptr (local->xattr_req, "gfid-req", gfid_req); if (ret) { gf_log (this->name, GF_LOG_DEBUG, @@ -137,14 +140,17 @@ out: } void -afr_lookup_save_gfid (uuid_t dst, void* new, inode_t *inode) +afr_lookup_save_gfid (uuid_t dst, void* new, const loc_t *loc) { - if (inode && !uuid_is_null (inode->gfid)) { + inode_t *inode = NULL; + + inode = loc->inode; + if (inode && !uuid_is_null (inode->gfid)) uuid_copy (dst, inode->gfid); - } else { - GF_ASSERT (new && !uuid_is_null (new)); + else if (!uuid_is_null (loc->gfid)) + uuid_copy (dst, loc->gfid); + else if (new && !uuid_is_null (new)) uuid_copy (dst, new); - } } int @@ -196,60 +202,86 @@ out: return ret; } -afr_inode_ctx_t* -afr_inode_ctx_get_from_addr (uint64_t addr, int32_t child_count) 
+void +afr_inode_ctx_destroy (afr_inode_ctx_t *ctx) { - int ret = -1; - afr_inode_ctx_t *ctx = NULL; - size_t size = 0; + if (!ctx) + return; + GF_FREE (ctx->fresh_children); + GF_FREE (ctx); +} - GF_ASSERT (child_count > 0); +afr_inode_ctx_t* +__afr_inode_ctx_get (inode_t *inode, xlator_t *this) +{ + int ret = 0; + uint64_t ctx_addr = 0; + afr_inode_ctx_t *ctx = NULL; + afr_private_t *priv = NULL; - if (!addr) { - ctx = GF_CALLOC (1, sizeof (*ctx), - gf_afr_mt_inode_ctx_t); - if (!ctx) - goto out; - size = sizeof (*ctx->fresh_children); - ctx->fresh_children = GF_CALLOC (child_count, size, - gf_afr_mt_int32_t); - if (!ctx->fresh_children) - goto out; - } else { - ctx = (afr_inode_ctx_t*) (long) addr; + priv = this->private; + ret = __inode_ctx_get (inode, this, &ctx_addr); + if (ret < 0) + ctx_addr = 0; + if (ctx_addr != 0) { + ctx = (afr_inode_ctx_t*) (long) ctx_addr; + goto out; } - ret = 0; + ctx = GF_CALLOC (1, sizeof (*ctx), + gf_afr_mt_inode_ctx_t); + if (!ctx) + goto fail; + ctx->fresh_children = GF_CALLOC (priv->child_count, + sizeof (*ctx->fresh_children), + gf_afr_mt_int32_t); + if (!ctx->fresh_children) + goto fail; + ret = __inode_ctx_put (inode, this, (uint64_t)ctx); + if (ret) { + gf_log_callingfn (this->name, GF_LOG_ERROR, "failed to " + "set the inode ctx (%s)", + uuid_utoa (inode->gfid)); + goto fail; + } + out: - if (ret && ctx) { - if (ctx->fresh_children) - GF_FREE (ctx->fresh_children); - GF_FREE (ctx); - ctx = NULL; + return ctx; + +fail: + afr_inode_ctx_destroy (ctx); + return NULL; +} + +afr_inode_ctx_t* +afr_inode_ctx_get (inode_t *inode, xlator_t *this) +{ + afr_inode_ctx_t *ctx = NULL; + + LOCK (&inode->lock); + { + ctx = __afr_inode_ctx_get (inode, this); } + UNLOCK (&inode->lock); return ctx; } void -afr_inode_get_ctx (xlator_t *this, inode_t *inode, afr_inode_params_t *params) +afr_inode_get_ctx_params (xlator_t *this, inode_t *inode, + afr_inode_params_t *params) { GF_ASSERT (inode); GF_ASSERT (params); - int ret = 0; afr_inode_ctx_t 
*ctx = NULL; afr_private_t *priv = NULL; int i = 0; - uint64_t ctx_addr = 0; int32_t read_child = -1; int32_t *fresh_children = NULL; priv = this->private; LOCK (&inode->lock); { - ret = __inode_ctx_get (inode, this, &ctx_addr); - if (ret < 0) - goto unlock; - ctx = afr_inode_ctx_get_from_addr (ctx_addr, priv->child_count); + ctx = __afr_inode_ctx_get (inode, this); if (!ctx) goto unlock; switch (params->op) { @@ -268,12 +300,6 @@ afr_inode_get_ctx (xlator_t *this, inode_t *inode, afr_inode_params_t *params) if (ctx->masks & AFR_ICTX_OPENDIR_DONE_MASK) params->u.value = _gf_true; break; - case AFR_INODE_GET_SPLIT_BRAIN: - params->u.value = _gf_false; - if (ctx->masks & AFR_ICTX_SPLIT_BRAIN_MASK) - params->u.value = _gf_true; - ; - break; default: GF_ASSERT (0); break; @@ -286,11 +312,16 @@ unlock: gf_boolean_t afr_is_split_brain (xlator_t *this, inode_t *inode) { - afr_inode_params_t params = {0}; + afr_inode_ctx_t *ctx = NULL; + gf_boolean_t spb = _gf_false; - params.op = AFR_INODE_GET_SPLIT_BRAIN; - afr_inode_get_ctx (this, inode, &params); - return params.u.value; + ctx = afr_inode_ctx_get (inode, this); + if (!ctx) + goto out; + if ((ctx->mdata_spb == SPB) || (ctx->data_spb == SPB)) + spb = _gf_true; +out: + return spb; } gf_boolean_t @@ -299,11 +330,10 @@ afr_is_opendir_done (xlator_t *this, inode_t *inode) afr_inode_params_t params = {0}; params.op = AFR_INODE_GET_OPENDIR_DONE; - afr_inode_get_ctx (this, inode, &params); + afr_inode_get_ctx_params (this, inode, &params); return params.u.value; } - int32_t afr_inode_get_read_ctx (xlator_t *this, inode_t *inode, int32_t *fresh_children) { @@ -311,7 +341,7 @@ afr_inode_get_read_ctx (xlator_t *this, inode_t *inode, int32_t *fresh_children) params.op = AFR_INODE_GET_READ_CTX; params.u.read_ctx.children = fresh_children; - afr_inode_get_ctx (this, inode, &params); + afr_inode_get_ctx_params (this, inode, &params); return params.u.read_ctx.read_child; } @@ -321,7 +351,6 @@ afr_inode_ctx_set_read_child (afr_inode_ctx_t *ctx, int32_t 
read_child) uint64_t remaining_mask = 0; uint64_t mask = 0; - GF_ASSERT (read_child >= 0); remaining_mask = (~AFR_ICTX_READ_CHILD_MASK & ctx->masks); mask = (AFR_ICTX_READ_CHILD_MASK & read_child); ctx->masks = remaining_mask | mask; @@ -343,19 +372,23 @@ afr_inode_ctx_set_read_ctx (afr_inode_ctx_t *ctx, int32_t read_child, } void -afr_inode_ctx_rm_stale_children (afr_inode_ctx_t *ctx, int32_t read_child, - int32_t *stale_children, int32_t child_count) +afr_inode_ctx_rm_stale_children (afr_inode_ctx_t *ctx, int32_t *stale_children, + int32_t child_count) { int i = 0; + int32_t read_child = -1; GF_ASSERT (stale_children); - afr_inode_ctx_set_read_child (ctx, read_child); for (i = 0; i < child_count; i++) { - if ((ctx->fresh_children[i] == -1) || (stale_children[i] == -1)) + if (stale_children[i] == -1) break; afr_children_rm_child (ctx->fresh_children, stale_children[i], child_count); } + read_child = (int32_t)(ctx->masks & AFR_ICTX_READ_CHILD_MASK); + if (!afr_is_child_present (ctx->fresh_children, child_count, + read_child)) + afr_inode_ctx_set_read_child (ctx, ctx->fresh_children[0]); } void @@ -370,31 +403,14 @@ afr_inode_ctx_set_opendir_done (afr_inode_ctx_t *ctx) } void -afr_inode_ctx_set_splitbrain (afr_inode_ctx_t *ctx, gf_boolean_t set) -{ - uint64_t remaining_mask = 0; - uint64_t mask = 0; - - if (set) { - remaining_mask = (~AFR_ICTX_SPLIT_BRAIN_MASK & ctx->masks); - mask = (0xFFFFFFFFFFFFFFFFULL & AFR_ICTX_SPLIT_BRAIN_MASK); - ctx->masks = remaining_mask | mask; - } else { - ctx->masks = (~AFR_ICTX_SPLIT_BRAIN_MASK & ctx->masks); - } -} - -void -afr_inode_set_ctx (xlator_t *this, inode_t *inode, afr_inode_params_t *params) +afr_inode_set_ctx_params (xlator_t *this, inode_t *inode, + afr_inode_params_t *params) { GF_ASSERT (inode); GF_ASSERT (params); - int ret = 0; afr_inode_ctx_t *ctx = NULL; afr_private_t *priv = NULL; - uint64_t ctx_addr = 0; - gf_boolean_t set = _gf_false; int32_t read_child = -1; int32_t *fresh_children = NULL; int32_t 
*stale_children = NULL; @@ -402,10 +418,7 @@ afr_inode_set_ctx (xlator_t *this, inode_t *inode, afr_inode_params_t *params) priv = this->private; LOCK (&inode->lock); { - ret = __inode_ctx_get (inode, this, &ctx_addr); - if (ret < 0) - ctx_addr = 0; - ctx = afr_inode_ctx_get_from_addr (ctx_addr, priv->child_count); + ctx = __afr_inode_ctx_get (inode, this); if (!ctx) goto unlock; switch (params->op) { @@ -417,42 +430,34 @@ afr_inode_set_ctx (xlator_t *this, inode_t *inode, afr_inode_params_t *params) priv->child_count); break; case AFR_INODE_RM_STALE_CHILDREN: - read_child = params->u.read_ctx.read_child; stale_children = params->u.read_ctx.children; - afr_inode_ctx_rm_stale_children (ctx, read_child, + afr_inode_ctx_rm_stale_children (ctx, stale_children, priv->child_count); break; case AFR_INODE_SET_OPENDIR_DONE: afr_inode_ctx_set_opendir_done (ctx); break; - case AFR_INODE_SET_SPLIT_BRAIN: - set = params->u.value; - afr_inode_ctx_set_splitbrain (ctx, set); - break; default: GF_ASSERT (0); break; } - ret = __inode_ctx_put (inode, this, (uint64_t)ctx); - if (ret) { - gf_log_callingfn (this->name, GF_LOG_ERROR, "failed to " - "set the inode ctx (%s)", - uuid_utoa (inode->gfid)); - } } unlock: UNLOCK (&inode->lock); } void -afr_set_split_brain (xlator_t *this, inode_t *inode, gf_boolean_t set) +afr_set_split_brain (xlator_t *this, inode_t *inode, afr_spb_state_t mdata_spb, + afr_spb_state_t data_spb) { - afr_inode_params_t params = {0}; + afr_inode_ctx_t *ctx = NULL; - params.op = AFR_INODE_SET_SPLIT_BRAIN; - params.u.value = set; - afr_inode_set_ctx (this, inode, &params); + ctx = afr_inode_ctx_get (inode, this); + if (mdata_spb != DONT_KNOW) + ctx->mdata_spb = mdata_spb; + if (data_spb != DONT_KNOW) + ctx->data_spb = data_spb; } void @@ -461,7 +466,7 @@ afr_set_opendir_done (xlator_t *this, inode_t *inode) afr_inode_params_t params = {0}; params.op = AFR_INODE_SET_OPENDIR_DONE; - afr_inode_set_ctx (this, inode, &params); + afr_inode_set_ctx_params (this, inode, &params); } 
void @@ -480,22 +485,20 @@ afr_inode_set_read_ctx (xlator_t *this, inode_t *inode, int32_t read_child, params.op = AFR_INODE_SET_READ_CTX; params.u.read_ctx.read_child = read_child; params.u.read_ctx.children = fresh_children; - afr_inode_set_ctx (this, inode, &params); + afr_inode_set_ctx_params (this, inode, &params); } void -afr_inode_rm_stale_children (xlator_t *this, inode_t *inode, int32_t read_child, +afr_inode_rm_stale_children (xlator_t *this, inode_t *inode, int32_t *stale_children) { afr_inode_params_t params = {0}; - GF_ASSERT (read_child >= 0); GF_ASSERT (stale_children); params.op = AFR_INODE_RM_STALE_CHILDREN; - params.u.read_ctx.read_child = read_child; params.u.read_ctx.children = stale_children; - afr_inode_set_ctx (this, inode, &params); + afr_inode_set_ctx_params (this, inode, &params); } gf_boolean_t @@ -539,6 +542,10 @@ afr_is_read_child (int32_t *success_children, int32_t *sources, gf_boolean_t success_child = _gf_false; gf_boolean_t source = _gf_false; + if (child < 0) { + return _gf_false; + } + GF_ASSERT (success_children); GF_ASSERT (child_count > 0); @@ -555,29 +562,69 @@ out: return (success_child && source); } +int32_t +afr_hash_child (int32_t *success_children, int32_t child_count, + unsigned int hmode, uuid_t gfid) +{ + uuid_t gfid_copy = {0,}; + pid_t pid; + + if (!hmode) { + return -1; + } + + if (gfid) { + uuid_copy(gfid_copy,gfid); + } + if (hmode > 1) { + /* + * Why getpid? Because it's one of the cheapest calls + * available - faster than gethostname etc. - and returns a + * constant-length value that's sure to be shorter than a UUID. + * It's still very unlikely to be the same across clients, so + * it still provides good mixing. We're not trying for + * perfection here. All we need is a low probability that + * multiple clients won't converge on the same subvolume. 
+ */ + pid = getpid(); + memcpy (gfid_copy, &pid, sizeof(pid)); + } + + return SuperFastHash((char *)gfid_copy, + sizeof(gfid_copy)) % child_count; +} + /* If sources is NULL the xattrs are assumed to be of source for all * success_children. */ int -afr_select_read_child_from_policy (int32_t *success_children, int32_t child_count, - int32_t prev_read_child, - int32_t config_read_child, int32_t *sources) +afr_select_read_child_from_policy (int32_t *success_children, + int32_t child_count, int32_t prev_read_child, + int32_t config_read_child, int32_t *sources, + unsigned int hmode, uuid_t gfid) { int32_t read_child = -1; int i = 0; GF_ASSERT (success_children); - read_child = prev_read_child; + read_child = config_read_child; if (afr_is_read_child (success_children, sources, child_count, read_child)) goto out; - read_child = config_read_child; + read_child = prev_read_child; if (afr_is_read_child (success_children, sources, child_count, read_child)) goto out; + read_child = afr_hash_child (success_children, child_count, + hmode, gfid); + if (afr_is_read_child (success_children, sources, child_count, + read_child)) { + goto out; + } + for (i = 0; i < child_count; i++) { read_child = success_children[i]; if (read_child < 0) @@ -597,7 +644,7 @@ out: void afr_set_read_ctx_from_policy (xlator_t *this, inode_t *inode, int32_t *fresh_children, int32_t prev_read_child, - int32_t config_read_child) + int32_t config_read_child, uuid_t gfid) { int read_child = -1; afr_private_t *priv = NULL; @@ -607,7 +654,8 @@ afr_set_read_ctx_from_policy (xlator_t *this, inode_t *inode, priv->child_count, prev_read_child, config_read_child, - NULL); + NULL, + priv->hash_mode, gfid); if (read_child >= 0) afr_inode_set_read_ctx (this, inode, read_child, fresh_children); @@ -663,8 +711,11 @@ afr_get_call_child (xlator_t *this, unsigned char *child_up, int32_t read_child, GF_ASSERT (call_child); GF_ASSERT (last_index); GF_ASSERT (fresh_children); - GF_ASSERT (read_child >= 0); + if (read_child < 
0) { + ret = -EIO; + goto out; + } priv = this->private; *call_child = -1; *last_index = -1; @@ -713,81 +764,66 @@ out: } void +afr_xattr_array_destroy (dict_t **xattr, unsigned int child_count) +{ + afr_reset_xattr (xattr, child_count); + GF_FREE (xattr); +} + +void afr_local_sh_cleanup (afr_local_t *local, xlator_t *this) { afr_self_heal_t *sh = NULL; afr_private_t *priv = NULL; - int i = 0; - sh = &local->self_heal; priv = this->private; - if (sh->buf) - GF_FREE (sh->buf); + if (sh->data_sh_info && strcmp (sh->data_sh_info, "")) + GF_FREE (sh->data_sh_info); + + if (sh->metadata_sh_info && strcmp (sh->metadata_sh_info, "")) + GF_FREE (sh->metadata_sh_info); + + GF_FREE (sh->buf); - if (sh->parentbufs) - GF_FREE (sh->parentbufs); + GF_FREE (sh->parentbufs); if (sh->inode) inode_unref (sh->inode); - if (sh->xattr) { - afr_reset_xattr (sh->xattr, priv->child_count); - GF_FREE (sh->xattr); - } - - if (sh->child_errno) - GF_FREE (sh->child_errno); + afr_xattr_array_destroy (sh->xattr, priv->child_count); - if (sh->pending_matrix) { - for (i = 0; i < priv->child_count; i++) { - GF_FREE (sh->pending_matrix[i]); - } - GF_FREE (sh->pending_matrix); - } + GF_FREE (sh->child_errno); - if (sh->delta_matrix) { - for (i = 0; i < priv->child_count; i++) { - GF_FREE (sh->delta_matrix[i]); - } - GF_FREE (sh->delta_matrix); - } + afr_matrix_cleanup (sh->pending_matrix, priv->child_count); + afr_matrix_cleanup (sh->delta_matrix, priv->child_count); - if (sh->sources) - GF_FREE (sh->sources); + GF_FREE (sh->sources); - if (sh->success) - GF_FREE (sh->success); + GF_FREE (sh->success); - if (sh->locked_nodes) - GF_FREE (sh->locked_nodes); + GF_FREE (sh->locked_nodes); if (sh->healing_fd) { fd_unref (sh->healing_fd); sh->healing_fd = NULL; } - if (sh->linkname) - GF_FREE ((char *)sh->linkname); + GF_FREE ((char *)sh->linkname); - if (sh->success_children) - GF_FREE (sh->success_children); + GF_FREE (sh->success_children); - if (sh->fresh_children) - GF_FREE (sh->fresh_children); + 
GF_FREE (sh->fresh_children); - if (sh->fresh_parent_dirs) - GF_FREE (sh->fresh_parent_dirs); + GF_FREE (sh->fresh_parent_dirs); loc_wipe (&sh->parent_loc); loc_wipe (&sh->lookup_loc); - if (sh->checksum) - GF_FREE (sh->checksum); + GF_FREE (sh->checksum); - if (sh->write_needed) - GF_FREE (sh->write_needed); + GF_FREE (sh->write_needed); if (sh->healing_fd) fd_unref (sh->healing_fd); } @@ -796,34 +832,26 @@ afr_local_sh_cleanup (afr_local_t *local, xlator_t *this) void afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this) { - int i = 0; - afr_private_t * priv = NULL; + afr_private_t *priv = NULL; + int i = 0; priv = this->private; - for (i = 0; i < priv->child_count; i++) { - if (local->pending && local->pending[i]) - GF_FREE (local->pending[i]); - } - - GF_FREE (local->pending); - - if (local->internal_lock.locked_nodes) - GF_FREE (local->internal_lock.locked_nodes); + afr_matrix_cleanup (local->pending, priv->child_count); + afr_matrix_cleanup (local->transaction.txn_changelog, + priv->child_count); - if (local->internal_lock.inode_locked_nodes) - GF_FREE (local->internal_lock.inode_locked_nodes); + GF_FREE (local->internal_lock.locked_nodes); - if (local->internal_lock.entry_locked_nodes) - GF_FREE (local->internal_lock.entry_locked_nodes); + for (i = 0; local->internal_lock.inodelk[i].domain; i++) { + GF_FREE (local->internal_lock.inodelk[i].locked_nodes); + } - if (local->internal_lock.lower_locked_nodes) - GF_FREE (local->internal_lock.lower_locked_nodes); + GF_FREE (local->internal_lock.lower_locked_nodes); + afr_entry_lockee_cleanup (&local->internal_lock); GF_FREE (local->transaction.pre_op); - GF_FREE (local->transaction.child_errno); - GF_FREE (local->child_errno); GF_FREE (local->transaction.eager_lock); GF_FREE (local->transaction.basename); @@ -831,6 +859,8 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this) loc_wipe (&local->transaction.parent_loc); loc_wipe (&local->transaction.new_parent_loc); + + GF_FREE 
(local->transaction.postop_piggybacked); } @@ -857,14 +887,16 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this) if (local->xattr_req) dict_unref (local->xattr_req); - if (local->child_up) - GF_FREE (local->child_up); + if (local->dict) + dict_unref (local->dict); + + GF_FREE(local->replies); - if (local->fresh_children) - GF_FREE (local->fresh_children); + GF_FREE (local->child_up); - if (local->fd_open_on) - GF_FREE (local->fd_open_on); + GF_FREE (local->child_errno); + + GF_FREE (local->fresh_children); { /* lookup */ if (local->cont.lookup.xattrs) { @@ -882,27 +914,23 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this) inode_unref (local->cont.lookup.inode); } - if (local->cont.lookup.postparents) - GF_FREE (local->cont.lookup.postparents); + GF_FREE (local->cont.lookup.postparents); - if (local->cont.lookup.bufs) - GF_FREE (local->cont.lookup.bufs); + GF_FREE (local->cont.lookup.bufs); - if (local->cont.lookup.success_children) - GF_FREE (local->cont.lookup.success_children); + GF_FREE (local->cont.lookup.success_children); - if (local->cont.lookup.sources) - GF_FREE (local->cont.lookup.sources); + GF_FREE (local->cont.lookup.sources); + afr_matrix_cleanup (local->cont.lookup.pending_matrix, + priv->child_count); } { /* getxattr */ - if (local->cont.getxattr.name) - GF_FREE (local->cont.getxattr.name); + GF_FREE (local->cont.getxattr.name); } { /* lk */ - if (local->cont.lk.locked_nodes) - GF_FREE (local->cont.lk.locked_nodes); + GF_FREE (local->cont.lk.locked_nodes); } { /* create */ @@ -936,18 +964,40 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this) dict_unref (local->cont.setxattr.dict); } + { /* fsetxattr */ + if (local->cont.fsetxattr.dict) + dict_unref (local->cont.fsetxattr.dict); + } + { /* removexattr */ GF_FREE (local->cont.removexattr.name); } - + { /* xattrop */ + if (local->cont.xattrop.xattr) + dict_unref (local->cont.xattrop.xattr); + } + { /* fxattrop */ + if (local->cont.fxattrop.xattr) + dict_unref 
(local->cont.fxattrop.xattr); + } { /* symlink */ GF_FREE (local->cont.symlink.linkpath); } { /* opendir */ - if (local->cont.opendir.checksum) - GF_FREE (local->cont.opendir.checksum); + GF_FREE (local->cont.opendir.checksum); } + + { /* readdirp */ + if (local->cont.readdir.dict) + dict_unref (local->cont.readdir.dict); + } + + if (local->xdata_req) + dict_unref (local->xdata_req); + + if (local->xdata_rsp) + dict_unref (local->xdata_rsp); } @@ -1030,33 +1080,144 @@ afr_update_loc_gfids (loc_t *loc, struct iatt *buf, struct iatt *postparent) uuid_copy (loc->pargfid, postparent->ia_gfid); } +/* + * Quota size xattrs are not maintained by afr. There is a + * possibility that they differ even when both the directory changelog xattrs + * suggest everything is fine. So if there is at least one 'source' check among + * the sources which has the maximum quota size. Otherwise check among all the + * available ones for maximum quota size. This way if there is a source and + * stale copies it always votes for the 'source'. + * */ + +static void +afr_handle_quota_size (afr_local_t *local, xlator_t *this, + dict_t *rsp_dict) +{ + int32_t *sources = NULL; + dict_t *xattr = NULL; + data_t *max_data = NULL; + int64_t max_quota_size = -1; + data_t *data = NULL; + int64_t *size = NULL; + int64_t quota_size = -1; + afr_private_t *priv = NULL; + int i = 0; + int ret = -1; + gf_boolean_t source_present = _gf_false; + + priv = this->private; + sources = local->cont.lookup.sources; + + if (rsp_dict == NULL) { + gf_log_callingfn (this->name, GF_LOG_ERROR, "%s: Invalid " + "response dictionary", local->loc.path); + return; + } + + for (i = 0; i < priv->child_count; i++) { + if (sources[i]) { + source_present = _gf_true; + break; + } + } + + for (i = 0; i < priv->child_count; i++) { + /* + * If there is at least one source lets check + * for maximum quota sizes among sources, otherwise take the + * maximum of the ones present to be on the safer side. 
+ */ + if (source_present && !sources[i]) + continue; + + xattr = local->cont.lookup.xattrs[i]; + if (!xattr) + continue; + + data = dict_get (xattr, QUOTA_SIZE_KEY); + if (!data) + continue; + + size = (int64_t*)data->data; + quota_size = ntoh64(*size); + gf_log (this->name, GF_LOG_DEBUG, "%s: %d, size: %"PRId64, + local->loc.path, i, quota_size); + if (quota_size > max_quota_size) { + if (max_data) + data_unref (max_data); + + max_quota_size = quota_size; + max_data = data_ref (data); + } + } + + if (max_data) { + ret = dict_set (rsp_dict, QUOTA_SIZE_KEY, max_data); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "%s: Failed to set " + "quota size", local->loc.path); + } + + data_unref (max_data); + } +} + int afr_lookup_build_response_params (afr_local_t *local, xlator_t *this) { - int32_t read_child = -1; struct iatt *buf = NULL; struct iatt *postparent = NULL; dict_t **xattr = NULL; + int32_t *success_children = NULL; + int32_t *sources = NULL; + afr_private_t *priv = NULL; + int32_t read_child = -1; int ret = 0; + int i = 0; GF_ASSERT (local); buf = &local->cont.lookup.buf; postparent = &local->cont.lookup.postparent; xattr = &local->cont.lookup.xattr; + priv = this->private; read_child = afr_inode_get_read_ctx (this, local->cont.lookup.inode, - NULL); + local->fresh_children); if (read_child < 0) { ret = -1; goto out; } + success_children = local->cont.lookup.success_children; + sources = local->cont.lookup.sources; + memset (sources, 0, sizeof (*sources) * priv->child_count); + afr_children_intersection_get (local->fresh_children, success_children, + sources, priv->child_count); + if (!sources[read_child]) { + read_child = -1; + for (i = 0; i < priv->child_count; i++) { + if (sources[i]) { + read_child = i; + break; + } + } + } + if (read_child < 0) { + ret = -1; + goto out; + } + gf_log (this->name, GF_LOG_DEBUG, "Building lookup response from %d", read_child); - *xattr = dict_ref (local->cont.lookup.xattrs[read_child]); + if (!*xattr) + *xattr = dict_ref 
(local->cont.lookup.xattrs[read_child]); + *buf = local->cont.lookup.bufs[read_child]; *postparent = local->cont.lookup.postparents[read_child]; + if (dict_get (local->xattr_req, QUOTA_SIZE_KEY)) + afr_handle_quota_size (local, this, *xattr); + if (IA_INVAL == local->cont.lookup.inode->ia_type) { /* fix for RT #602 */ local->cont.lookup.inode->ia_type = buf->ia_type; @@ -1072,6 +1233,7 @@ afr_lookup_update_lk_counts (afr_local_t *local, xlator_t *this, uint32_t inodelk_count = 0; uint32_t entrylk_count = 0; int ret = -1; + uint32_t parent_entrylk = 0; GF_ASSERT (local); GF_ASSERT (this); @@ -1087,43 +1249,103 @@ afr_lookup_update_lk_counts (afr_local_t *local, xlator_t *this, &entrylk_count); if (ret == 0) local->entrylk_count += entrylk_count; + ret = dict_get_uint32 (xattr, GLUSTERFS_PARENT_ENTRYLK, + &parent_entrylk); + if (!ret) + local->cont.lookup.parent_entrylk += parent_entrylk; } +/* + * It's important to maintain a commutative property on do_*_self_heal and + * found*; once set, they must not be cleared by a subsequent iteration or + * call, so that they represent a logical OR of all iterations and calls + * regardless of child/key order. That allows the caller to call us multiple + * times without having to use a separate variable as a "reduce" accumulator. 
+ */ static void afr_lookup_set_self_heal_params_by_xattr (afr_local_t *local, xlator_t *this, dict_t *xattr) { + afr_private_t *priv = NULL; + int i = 0; + int ret = -1; + void *pending_raw = NULL; + int32_t *pending = NULL; + GF_ASSERT (local); GF_ASSERT (this); GF_ASSERT (xattr); - if (afr_sh_has_metadata_pending (xattr, this)) { - local->self_heal.do_metadata_self_heal = _gf_true; - gf_log(this->name, GF_LOG_DEBUG, - "metadata self-heal is pending for %s.", - local->loc.path); - } + priv = this->private; - if (afr_sh_has_entry_pending (xattr, this)) { - local->self_heal.do_entry_self_heal = _gf_true; - gf_log(this->name, GF_LOG_DEBUG, - "entry self-heal is pending for %s.", local->loc.path); - } + for (i = 0; i < priv->child_count; i++) { + ret = dict_get_ptr (xattr, priv->pending_key[i], + &pending_raw); + if (ret != 0) { + continue; + } + pending = pending_raw; - if (afr_sh_has_data_pending (xattr, this)) { - local->self_heal.do_data_self_heal = _gf_true; - gf_log(this->name, GF_LOG_DEBUG, - "data self-heal is pending for %s.", local->loc.path); + if (pending[AFR_METADATA_TRANSACTION]) { + gf_log(this->name, GF_LOG_DEBUG, + "metadata self-heal is pending for %s.", + local->loc.path); + local->self_heal.do_metadata_self_heal = _gf_true; + } + + if (pending[AFR_ENTRY_TRANSACTION]) { + gf_log(this->name, GF_LOG_DEBUG, + "entry self-heal is pending for %s.", + local->loc.path); + local->self_heal.do_entry_self_heal = _gf_true; + } + + if (pending[AFR_DATA_TRANSACTION]) { + gf_log(this->name, GF_LOG_DEBUG, + "data self-heal is pending for %s.", + local->loc.path); + local->self_heal.do_data_self_heal = _gf_true; + } } } +void +afr_lookup_check_set_metadata_split_brain (afr_local_t *local, xlator_t *this) +{ + int32_t *sources = NULL; + afr_private_t *priv = NULL; + int32_t subvol_status = 0; + int32_t *success_children = NULL; + dict_t **xattrs = NULL; + struct iatt *bufs = NULL; + int32_t **pending_matrix = NULL; + + priv = this->private; + + sources = GF_CALLOC 
(priv->child_count, sizeof (*sources), + gf_afr_mt_int32_t); + if (NULL == sources) + goto out; + success_children = local->cont.lookup.success_children; + xattrs = local->cont.lookup.xattrs; + bufs = local->cont.lookup.bufs; + pending_matrix = local->cont.lookup.pending_matrix; + afr_build_sources (this, xattrs, bufs, pending_matrix, + sources, success_children, AFR_METADATA_TRANSACTION, + &subvol_status, _gf_false); + if (subvol_status & SPLIT_BRAIN) + local->cont.lookup.possible_spb = _gf_true; +out: + GF_FREE (sources); +} + static void afr_detect_self_heal_by_iatt (afr_local_t *local, xlator_t *this, struct iatt *buf, struct iatt *lookup_buf) { if (PERMISSION_DIFFERS (buf, lookup_buf)) { /* mismatching permissions */ - gf_log (this->name, GF_LOG_INFO, + gf_log (this->name, GF_LOG_DEBUG, "permissions differ for %s ", local->loc.path); local->self_heal.do_metadata_self_heal = _gf_true; } @@ -1131,27 +1353,45 @@ afr_detect_self_heal_by_iatt (afr_local_t *local, xlator_t *this, if (OWNERSHIP_DIFFERS (buf, lookup_buf)) { /* mismatching permissions */ local->self_heal.do_metadata_self_heal = _gf_true; - gf_log (this->name, GF_LOG_INFO, + gf_log (this->name, GF_LOG_DEBUG, "ownership differs for %s ", local->loc.path); } if (SIZE_DIFFERS (buf, lookup_buf) && IA_ISREG (buf->ia_type)) { - gf_log (this->name, GF_LOG_INFO, + gf_log (this->name, GF_LOG_DEBUG, "size differs for %s ", local->loc.path); local->self_heal.do_data_self_heal = _gf_true; } if (uuid_compare (buf->ia_gfid, lookup_buf->ia_gfid)) { /* mismatching gfid */ - gf_log (this->name, GF_LOG_WARNING, + gf_log (this->name, GF_LOG_DEBUG, "%s: gfid different on subvolume", local->loc.path); } } static void -afr_detect_self_heal_by_lookup_status (afr_local_t *local, xlator_t *this, - gf_boolean_t split_brain) +afr_detect_self_heal_by_split_brain_status (afr_local_t *local, xlator_t *this) +{ + gf_boolean_t split_brain = _gf_false; + afr_self_heal_t *sh = NULL; + + sh = &local->self_heal; + + split_brain = 
afr_is_split_brain (this, local->cont.lookup.inode); + split_brain = split_brain || local->cont.lookup.possible_spb; + if ((local->success_count > 0) && split_brain && + IA_ISREG (local->cont.lookup.inode->ia_type)) { + sh->force_confirm_spb = _gf_true; + gf_log (this->name, GF_LOG_DEBUG, + "split brain detected during lookup of %s.", + local->loc.path); + } +} + +static void +afr_detect_self_heal_by_lookup_status (afr_local_t *local, xlator_t *this) { GF_ASSERT (local); GF_ASSERT (this); @@ -1162,24 +1402,11 @@ afr_detect_self_heal_by_lookup_status (afr_local_t *local, xlator_t *this, local->self_heal.do_entry_self_heal = _gf_true; local->self_heal.do_gfid_self_heal = _gf_true; local->self_heal.do_missing_entry_self_heal = _gf_true; - gf_log(this->name, GF_LOG_INFO, + gf_log(this->name, GF_LOG_DEBUG, "entries are missing in lookup of %s.", local->loc.path); - //If all self-heals are needed no need to check for other rules - goto out; - } - - if ((local->success_count > 0) && split_brain && - IA_ISREG (local->cont.lookup.inode->ia_type)) { - local->self_heal.do_data_self_heal = _gf_true; - local->self_heal.do_gfid_self_heal = _gf_true; - local->self_heal.do_missing_entry_self_heal = _gf_true; - gf_log (this->name, GF_LOG_WARNING, - "split brain detected during lookup of %s.", - local->loc.path); } -out: return; } @@ -1189,6 +1416,8 @@ afr_can_self_heal_proceed (afr_self_heal_t *sh, afr_private_t *priv) GF_ASSERT (sh); GF_ASSERT (priv); + if (sh->force_confirm_spb) + return _gf_true; return (sh->do_gfid_self_heal || sh->do_missing_entry_self_heal || (afr_data_self_heal_enabled (priv->data_self_heal) && @@ -1222,6 +1451,7 @@ afr_lookup_select_read_child (afr_local_t *local, xlator_t *this, dict_t **xattrs = NULL; int32_t *success_children = NULL; afr_transaction_type type = AFR_METADATA_TRANSACTION; + uuid_t *gfid = NULL; GF_ASSERT (local); GF_ASSERT (this); @@ -1235,8 +1465,9 @@ afr_lookup_select_read_child (afr_local_t *local, xlator_t *this, ia_type = 
local->cont.lookup.bufs[success_children[0]].ia_type; type = afr_transaction_type_get (ia_type); xattrs = local->cont.lookup.xattrs; + gfid = &local->cont.lookup.buf.ia_gfid; source = afr_lookup_select_read_child_by_txn_type (this, local, xattrs, - type); + type, *gfid); if (source < 0) { gf_log (this->name, GF_LOG_DEBUG, "failed to select source " "for %s", local->loc.path); @@ -1264,7 +1495,8 @@ afr_launch_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode, void (*gfid_sh_success_cbk) (call_frame_t *sh_frame, xlator_t *this), int (*unwind) (call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno)) + int32_t op_ret, int32_t op_errno, + int32_t sh_failed)) { afr_local_t *local = NULL; char sh_type_str[256] = {0,}; @@ -1287,7 +1519,7 @@ afr_launch_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode, if (background) bg = "background"; - gf_log (this->name, GF_LOG_INFO, + gf_log (this->name, GF_LOG_DEBUG, "%s %s self-heal triggered. path: %s, reason: %s", bg, sh_type_str, local->loc.path, reason); @@ -1358,7 +1590,7 @@ afr_conflicting_iattrs (struct iatt *bufs, int32_t *success_children, child2 = &bufs[success_children[i-1]]; if (FILETYPE_DIFFERS (child1, child2)) { - gf_log (xlator_name, GF_LOG_WARNING, "%s: filetype " + gf_log (xlator_name, GF_LOG_DEBUG, "%s: filetype " "differs on subvolumes (%d, %d)", path, success_children[i-1], success_children[i]); conflicting = _gf_true; @@ -1367,7 +1599,7 @@ afr_conflicting_iattrs (struct iatt *bufs, int32_t *success_children, if (!gfid || uuid_is_null (child1->ia_gfid)) continue; if (uuid_compare (*gfid, child1->ia_gfid)) { - gf_log (xlator_name, GF_LOG_WARNING, "%s: gfid differs" + gf_log (xlator_name, GF_LOG_DEBUG, "%s: gfid differs" " on subvolume %d", path, success_children[i]); conflicting = _gf_true; goto out; @@ -1450,13 +1682,11 @@ afr_lookup_set_self_heal_params (afr_local_t *local, xlator_t *this) int32_t child1 = -1; int32_t child2 = -1; afr_self_heal_t *sh = NULL; - 
gf_boolean_t split_brain = _gf_false; priv = this->private; sh = &local->self_heal; - split_brain = afr_is_split_brain (this, local->cont.lookup.inode); - afr_detect_self_heal_by_lookup_status (local, this, split_brain); + afr_detect_self_heal_by_lookup_status (local, this); if (afr_lookup_gfid_missing_count (local, this)) local->self_heal.do_gfid_self_heal = _gf_true; @@ -1483,23 +1713,28 @@ afr_lookup_set_self_heal_params (afr_local_t *local, xlator_t *this) afr_lookup_set_self_heal_params_by_xattr (local, this, xattr[child1]); } - if (afr_open_only_data_self_heal (priv->data_self_heal) - && !split_brain) + if (afr_open_only_data_self_heal (priv->data_self_heal)) sh->do_data_self_heal = _gf_false; + if (sh->do_metadata_self_heal) + afr_lookup_check_set_metadata_split_brain (local, this); + afr_detect_self_heal_by_split_brain_status (local, this); } int afr_self_heal_lookup_unwind (call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, + int32_t sh_failed) { afr_local_t *local = NULL; + int ret = -1; + dict_t *xattr = NULL; local = frame->local; if (op_ret == -1) { local->op_ret = -1; - if (afr_error_more_important (local->op_errno, op_errno)) - local->op_errno = op_errno; + local->op_errno = afr_most_important_error(local->op_errno, + op_errno, _gf_true); goto out; } else { @@ -1507,6 +1742,23 @@ afr_self_heal_lookup_unwind (call_frame_t *frame, xlator_t *this, } afr_lookup_done_success_action (frame, this, _gf_true); + xattr = local->cont.lookup.xattr; + if (xattr) { + ret = dict_set_int32 (xattr, "sh-failed", sh_failed); + if (ret) + gf_log (this->name, GF_LOG_ERROR, "%s: Failed to set " + "sh-failed to %d", local->loc.path, sh_failed); + + if (local->self_heal.actual_sh_started == _gf_true && + sh_failed == 0) { + ret = dict_set_int32 (xattr, "actual-sh-done", 1); + if (ret) + gf_log(this->name, GF_LOG_ERROR, "%s: Failed to" + " set actual-sh-done to %d", + local->loc.path, + 
local->self_heal.actual_sh_started); + } + } out: AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, local->cont.lookup.inode, &local->cont.lookup.buf, @@ -1580,7 +1832,8 @@ afr_lookup_perform_self_heal (call_frame_t *frame, xlator_t *this, afr_lookup_set_self_heal_params (local, this); if (afr_can_self_heal_proceed (&local->self_heal, priv)) { - if (afr_is_transaction_running (local)) + if (afr_is_transaction_running (local) && + (!local->allow_sh_for_running_transaction)) goto out; reason = "lookup detected pending operations"; @@ -1641,26 +1894,23 @@ afr_lookup_done_success_action (call_frame_t *frame, xlator_t *this, int32_t read_child = -1; int32_t ret = -1; afr_local_t *local = NULL; - afr_private_t *priv = NULL; + gf_boolean_t fresh_lookup = _gf_false; local = frame->local; - priv = this->private; + fresh_lookup = local->cont.lookup.fresh_lookup; if (local->loc.parent == NULL) fail_conflict = _gf_true; - if (afr_conflicting_iattrs (local->cont.lookup.bufs, - local->cont.lookup.success_children, - priv->child_count, local->loc.path, - this->name)) { + if (afr_lookup_conflicting_entries (local, this)) { if (fail_conflict == _gf_false) ret = 0; goto out; } - if (!afr_is_transaction_running (local)) { - ret = afr_lookup_select_read_child (local, this, &read_child); - if (ret) + ret = afr_lookup_select_read_child (local, this, &read_child); + if (!afr_is_transaction_running (local) || fresh_lookup) { + if (read_child < 0) goto out; ret = afr_lookup_set_read_ctx (local, this, read_child); @@ -1671,11 +1921,9 @@ afr_lookup_done_success_action (call_frame_t *frame, xlator_t *this, ret = afr_lookup_build_response_params (local, this); if (ret) goto out; - if (afr_is_fresh_lookup (&local->loc, this)) { - afr_update_loc_gfids (&local->loc, - &local->cont.lookup.buf, - &local->cont.lookup.postparent); - } + afr_update_loc_gfids (&local->loc, + &local->cont.lookup.buf, + &local->cont.lookup.postparent); ret = 0; out: @@ -1686,6 +1934,135 @@ out: return ret; 
} +int +afr_lookup_get_latest_subvol (afr_local_t *local, xlator_t *this) +{ + afr_private_t *priv = NULL; + int32_t *success_children = NULL; + struct iatt *bufs = NULL; + int i = 0; + int child = 0; + int lsubvol = -1; + + priv = this->private; + success_children = local->cont.lookup.success_children; + bufs = local->cont.lookup.bufs; + for (i = 0; i < priv->child_count; i++) { + child = success_children[i]; + if (child == -1) + break; + if (uuid_is_null (bufs[child].ia_gfid)) + continue; + if (lsubvol < 0) { + lsubvol = child; + } else if (bufs[lsubvol].ia_ctime < bufs[child].ia_ctime) { + lsubvol = child; + } else if ((bufs[lsubvol].ia_ctime == bufs[child].ia_ctime) && + (bufs[lsubvol].ia_ctime_nsec < bufs[child].ia_ctime_nsec)) { + lsubvol = child; + } + } + return lsubvol; +} + +void +afr_lookup_mark_other_entries_stale (afr_local_t *local, xlator_t *this, + int subvol) +{ + afr_private_t *priv = NULL; + int32_t *success_children = NULL; + struct iatt *bufs = NULL; + int i = 0; + int child = 0; + + priv = this->private; + success_children = local->cont.lookup.success_children; + bufs = local->cont.lookup.bufs; + memcpy (local->fresh_children, success_children, + sizeof (*success_children) * priv->child_count); + for (i = 0; i < priv->child_count; i++) { + child = local->fresh_children[i]; + if (child == -1) + break; + if (child == subvol) + continue; + if (uuid_is_null (bufs[child].ia_gfid) && + (bufs[child].ia_type == bufs[subvol].ia_type)) + continue; + afr_children_rm_child (success_children, child, + priv->child_count); + local->success_count--; + } + afr_reset_children (local->fresh_children, priv->child_count); +} + +void +afr_succeed_lookup_on_latest_iatt (afr_local_t *local, xlator_t *this) +{ + int lsubvol = 0; + + if (!afr_lookup_conflicting_entries (local, this)) + goto out; + + lsubvol = afr_lookup_get_latest_subvol (local, this); + if (lsubvol < 0) + goto out; + afr_lookup_mark_other_entries_stale (local, this, lsubvol); +out: + return; +} + 
+gf_boolean_t +afr_is_entry_possibly_under_creation (afr_local_t *local, xlator_t *this) +{ + /* + * We need to perform this test in lookup done and treat ongoing + * create/DELETE as ENOENT. + * Reason: + Multiple clients A, B and C are attempting 'mkdir -p /mnt/a/b/c' + + 1 Client A is in the middle of mkdir(/a). It has acquired lock. + It has performed mkdir(/a) on one subvol, and second one is still + in progress + 2 Client B performs a lookup, sees directory /a on one, + ENOENT on the other, succeeds lookup. + 3 Client B performs lookup on /a/b on both subvols, both return ENOENT + (one subvol because /a/b does not exist, another because /a + itself does not exist) + 4 Client B proceeds to mkdir /a/b. It obtains entrylk on inode=/a with + basename=b on one subvol, but fails on other subvol as /a is yet to + be created by Client A. + 5 Client A finishes mkdir of /a on other subvol + 6 Client C also attempts to create /a/b, lookup returns ENOENT on + both subvols. + 7 Client C tries to obtain entrylk on inode=/a with basename=b, + obtains on one subvol (where B had failed), and waits for B to unlock + on other subvol. + 8 Client B finishes mkdir() on one subvol with GFID-1 and completes + transaction and unlocks + 9 Client C gets the lock on the second subvol. At this stage second + subvol already has /a/b created from Client B, but Client C does not + check that in the middle of mkdir transaction + 10 Client C attempts mkdir /a/b on both subvols. It succeeds on + ONLY ONE (where Client B could not get lock because of + missing parent /a dir) with GFID-2, and gets EEXIST from ONE subvol. + This way we have /a/b in GFID mismatch. One subvol got GFID-1 because + Client B performed transaction on only one subvol (because entrylk() + could not be obtained on second subvol because of missing parent dir -- + caused by premature/speculative succeeding of lookup() on /a when locks + are detected). 
Other subvol gets GFID-2 from Client C because while + it was waiting for entrylk() on both subvols, Client B was in the + middle of creating mkdir() on only one subvol, and Client C does not + "expect" this when it is between lock() and pre-op()/op() phase of the + transaction. + */ + if (local->cont.lookup.parent_entrylk && local->enoent_count) + return _gf_true; + + return _gf_false; +} + + static void afr_lookup_done (call_frame_t *frame, xlator_t *this) { @@ -1694,6 +2071,7 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this) afr_local_t *local = NULL; int ret = -1; gf_boolean_t sh_launched = _gf_false; + gf_boolean_t fail_conflict = _gf_false; int gfid_miss_count = 0; int enotconn_count = 0; int up_children_count = 0; @@ -1701,8 +2079,18 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this) priv = this->private; local = frame->local; + if (afr_is_entry_possibly_under_creation (local, this)) { + local->op_ret = -1; + local->op_errno = ENOENT; + goto unwind; + } + if (local->op_ret < 0) goto unwind; + + if (local->cont.lookup.parent_entrylk && local->success_count > 1) + afr_succeed_lookup_on_latest_iatt (local, this); + gfid_miss_count = afr_lookup_gfid_missing_count (local, this); up_children_count = afr_up_children_count (local->child_up, priv->child_count); @@ -1717,7 +2105,18 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this) goto unwind; } - ret = afr_lookup_done_success_action (frame, this, _gf_false); + if ((gfid_miss_count == local->success_count) && + uuid_is_null (local->cont.lookup.gfid_req)) { + local->op_ret = -1; + local->op_errno = ENODATA; + gf_log (this->name, GF_LOG_ERROR, "%s: No gfid present", + local->loc.path); + goto unwind; + } + + if (gfid_miss_count && uuid_is_null (local->cont.lookup.gfid_req)) + fail_conflict = _gf_true; + ret = afr_lookup_done_success_action (frame, this, fail_conflict); if (ret) goto unwind; uuid_copy (local->self_heal.sh_gfid_req, local->cont.lookup.gfid_req); @@ -1743,24 +2142,20 @@ afr_lookup_done 
(call_frame_t *frame, xlator_t *this) * others in that they must be given higher priority while * returning to the user. * - * The hierarchy is ESTALE > ENOENT > others - * + * The hierarchy is ESTALE > EIO > ENOENT > others */ - -gf_boolean_t -afr_error_more_important (int32_t old_errno, int32_t new_errno) +int32_t +afr_most_important_error(int32_t old_errno, int32_t new_errno, + gf_boolean_t eio) { - gf_boolean_t ret = _gf_true; - - /* Nothing should ever overwrite ESTALE */ - if (old_errno == ESTALE) - ret = _gf_false; - - /* Nothing should overwrite ENOENT, except ESTALE */ - else if ((old_errno == ENOENT) && (new_errno != ESTALE)) - ret = _gf_false; - - return ret; + if (old_errno == ESTALE || new_errno == ESTALE) + return ESTALE; + if (eio && (old_errno == EIO || new_errno == EIO)) + return EIO; + if (old_errno == ENOENT || new_errno == ENOENT) + return ENOENT; + + return new_errno; } int32_t @@ -1779,8 +2174,9 @@ afr_resultant_errno_get (int32_t *children, } else { child = i; } - if (afr_error_more_important (op_errno, child_errno[child])) - op_errno = child_errno[child]; + op_errno = afr_most_important_error(op_errno, + child_errno[child], + _gf_false); } return op_errno; } @@ -1792,8 +2188,8 @@ afr_lookup_handle_error (afr_local_t *local, int32_t op_ret, int32_t op_errno) if (op_errno == ENOENT) local->enoent_count++; - if (afr_error_more_important (local->op_errno, op_errno)) - local->op_errno = op_errno; + local->op_errno = afr_most_important_error(local->op_errno, op_errno, + _gf_false); if (local->op_errno == ESTALE) { local->op_ret = -1; @@ -1807,7 +2203,7 @@ afr_set_root_inode_on_first_lookup (afr_local_t *local, xlator_t *this, afr_private_t *priv = NULL; GF_ASSERT (inode); - if (inode->ino != 1) + if (!__is_root_gfid (inode->gfid)) goto out; if (!afr_is_fresh_lookup (&local->loc, this)) goto out; @@ -1840,12 +2236,79 @@ afr_lookup_handle_first_success (afr_local_t *local, xlator_t *this, afr_set_root_inode_on_first_lookup (local, this, inode); } 
+static int32_t +afr_discovery_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) +{ + int ret = 0; + char *pathinfo = NULL; + gf_boolean_t is_local = _gf_false; + afr_private_t *priv = NULL; + int32_t child_index = -1; + + if (op_ret != 0) { + goto out; + } + + ret = dict_get_str (dict, GF_XATTR_PATHINFO_KEY, &pathinfo); + if (ret != 0) { + goto out; + } + + ret = afr_local_pathinfo (pathinfo, &is_local); + if (ret) { + goto out; + } + + priv = this->private; + /* + * Note that one local subvolume will override another here. The only + * way to avoid that would be to retain extra information about whether + * the previous read_child is local, and it's just not worth it. Even + * the slowest local subvolume is far preferable to a remote one. + */ + if (is_local) { + child_index = (int32_t)(long)cookie; + gf_log (this->name, GF_LOG_INFO, + "selecting local read_child %s", + priv->children[child_index]->name); + priv->read_child = child_index; + } + +out: + STACK_DESTROY(frame->root); + return 0; +} + +static void +afr_attempt_local_discovery (xlator_t *this, int32_t child_index) +{ + call_frame_t *newframe = NULL; + loc_t tmploc = {0,}; + afr_private_t *priv = this->private; + + newframe = create_frame(this,this->ctx->pool); + if (!newframe) { + return; + } + + tmploc.gfid[sizeof(tmploc.gfid)-1] = 1; + STACK_WIND_COOKIE (newframe, afr_discovery_cbk, + (void *)(long)child_index, + priv->children[child_index], + priv->children[child_index]->fops->getxattr, + &tmploc, GF_XATTR_PATHINFO_KEY, NULL); +} + static void afr_lookup_handle_success (afr_local_t *local, xlator_t *this, int32_t child_index, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *buf, dict_t *xattr, struct iatt *postparent) { + afr_private_t *priv = this->private; + if (local->success_count == 0) { if (local->op_errno != ESTALE) { local->op_ret = op_ret; @@ -1858,6 +2321,11 @@ afr_lookup_handle_success (afr_local_t 
*local, xlator_t *this, int32_t child_ind afr_lookup_cache_args (local, child_index, xattr, buf, postparent); + + if (local->do_discovery && (priv->read_child == (-1))) { + afr_attempt_local_discovery(this,child_index); + } + local->cont.lookup.success_children[local->success_count] = child_index; local->success_count++; } @@ -1904,6 +2372,8 @@ afr_lookup_cont_init (afr_local_t *local, unsigned int child_count) int ret = -ENOMEM; struct iatt *iatts = NULL; int32_t *success_children = NULL; + int32_t *sources = NULL; + int32_t **pending_matrix = NULL; GF_ASSERT (local); local->cont.lookup.xattrs = GF_CALLOC (child_count, @@ -1931,6 +2401,16 @@ afr_lookup_cont_init (afr_local_t *local, unsigned int child_count) if (NULL == local->fresh_children) goto out; + sources = GF_CALLOC (sizeof (*sources), child_count, gf_afr_mt_int32_t); + if (NULL == sources) + goto out; + local->cont.lookup.sources = sources; + + pending_matrix = afr_matrix_create (child_count, child_count); + if (NULL == pending_matrix) + goto out; + local->cont.lookup.pending_matrix = pending_matrix; + ret = 0; out: return ret; @@ -1948,37 +2428,51 @@ afr_lookup (call_frame_t *frame, xlator_t *this, int call_count = 0; uint64_t ctx = 0; int32_t op_errno = 0; - + int allow_sh = 0; priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (local, out); local->op_ret = -1; frame->local = local; local->fop = GF_FOP_LOOKUP; - if (!strcmp (loc->path, "/" GF_REPLICATE_TRASH_DIR)) { - op_errno = ENOENT; + loc_copy (&local->loc, loc); + ret = loc_path (&local->loc, NULL); + if (ret < 0) { + op_errno = EINVAL; goto out; } - loc_copy (&local->loc, loc); + if (local->loc.path && + (strcmp (local->loc.path, "/" GF_REPLICATE_TRASH_DIR) == 0)) { + op_errno = EPERM; + ret = -1; + goto out; + } - ret = inode_ctx_get (loc->inode, this, &ctx); + ret = inode_ctx_get (local->loc.inode, this, &ctx); if (ret == 0) { /* lookup is a revalidate */ local->read_child_index = afr_inode_get_read_ctx 
(this, - loc->inode, - NULL); + local->loc.inode, + NULL); } else { LOCK (&priv->read_child_lock); { - local->read_child_index = (++priv->read_child_rr) - % (priv->child_count); + if (priv->hash_mode) { + local->read_child_index = -1; + } + else { + local->read_child_index = + (++priv->read_child_rr) % + (priv->child_count); + } } UNLOCK (&priv->read_child_lock); + local->cont.lookup.fresh_lookup = _gf_true; } local->child_up = memdup (priv->child_up, @@ -2006,24 +2500,33 @@ afr_lookup (call_frame_t *frame, xlator_t *this, /* By default assume ENOTCONN. On success it will be set to 0. */ local->op_errno = ENOTCONN; - local->call_count = afr_up_children_count (local->child_up, - priv->child_count); - ret = afr_lookup_xattr_req_prepare (local, this, xattr_req, loc, + ret = dict_get_int32 (xattr_req, "allow-sh-for-running-transaction", + &allow_sh); + dict_del (xattr_req, "allow-sh-for-running-transaction"); + local->allow_sh_for_running_transaction = allow_sh; + + ret = afr_lookup_xattr_req_prepare (local, this, xattr_req, &local->loc, &gfid_req); if (ret) { local->op_errno = -ret; goto out; } afr_lookup_save_gfid (local->cont.lookup.gfid_req, gfid_req, - loc->inode); + &local->loc); local->fop = GF_FOP_LOOKUP; + if (priv->choose_local && !priv->did_discovery) { + if (gfid_req && __is_root_gfid(gfid_req)) { + local->do_discovery = _gf_true; + priv->did_discovery = _gf_true; + } + } for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND_COOKIE (frame, afr_lookup_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->lookup, - loc, local->xattr_req); + &local->loc, local->xattr_req); if (!--call_count) break; } @@ -2042,7 +2545,7 @@ out: /* {{{ open */ int -afr_fd_ctx_set (xlator_t *this, fd_t *fd) +__afr_fd_ctx_set (xlator_t *this, fd_t *fd) { afr_private_t * priv = NULL; int ret = -1; @@ -2054,219 +2557,167 @@ afr_fd_ctx_set (xlator_t *this, fd_t *fd) priv = this->private; - LOCK (&fd->lock); - { - ret = __fd_ctx_get (fd, this, 
&ctx); + ret = __fd_ctx_get (fd, this, &ctx); - if (ret == 0) - goto unlock; + if (ret == 0) + goto out; - fd_ctx = GF_CALLOC (1, sizeof (afr_fd_ctx_t), - gf_afr_mt_afr_fd_ctx_t); - if (!fd_ctx) { - ret = -ENOMEM; - goto unlock; - } + fd_ctx = GF_CALLOC (1, sizeof (afr_fd_ctx_t), + gf_afr_mt_afr_fd_ctx_t); + if (!fd_ctx) { + ret = -ENOMEM; + goto out; + } - fd_ctx->pre_op_done = GF_CALLOC (sizeof (*fd_ctx->pre_op_done), - priv->child_count, - gf_afr_mt_char); - if (!fd_ctx->pre_op_done) { - ret = -ENOMEM; - goto unlock; - } + fd_ctx->pre_op_done = GF_CALLOC (sizeof (*fd_ctx->pre_op_done), + priv->child_count, + gf_afr_mt_char); + if (!fd_ctx->pre_op_done) { + ret = -ENOMEM; + goto out; + } - fd_ctx->pre_op_piggyback = GF_CALLOC (sizeof (*fd_ctx->pre_op_piggyback), - priv->child_count, - gf_afr_mt_char); - if (!fd_ctx->pre_op_piggyback) { - ret = -ENOMEM; - goto unlock; - } + fd_ctx->pre_op_piggyback = GF_CALLOC (sizeof (*fd_ctx->pre_op_piggyback), + priv->child_count, + gf_afr_mt_char); + if (!fd_ctx->pre_op_piggyback) { + ret = -ENOMEM; + goto out; + } - fd_ctx->opened_on = GF_CALLOC (sizeof (*fd_ctx->opened_on), - priv->child_count, - gf_afr_mt_int32_t); - if (!fd_ctx->opened_on) { - ret = -ENOMEM; - goto unlock; - } + fd_ctx->opened_on = GF_CALLOC (sizeof (*fd_ctx->opened_on), + priv->child_count, + gf_afr_mt_int32_t); + if (!fd_ctx->opened_on) { + ret = -ENOMEM; + goto out; + } - fd_ctx->lock_piggyback = GF_CALLOC (sizeof (*fd_ctx->lock_piggyback), - priv->child_count, - gf_afr_mt_char); - if (!fd_ctx->lock_piggyback) { - ret = -ENOMEM; - goto unlock; - } + fd_ctx->lock_piggyback = GF_CALLOC (sizeof (*fd_ctx->lock_piggyback), + priv->child_count, + gf_afr_mt_char); + if (!fd_ctx->lock_piggyback) { + ret = -ENOMEM; + goto out; + } - fd_ctx->lock_acquired = GF_CALLOC (sizeof (*fd_ctx->lock_acquired), - priv->child_count, - gf_afr_mt_char); - if (!fd_ctx->lock_acquired) { - ret = -ENOMEM; - goto unlock; - } + fd_ctx->lock_acquired = GF_CALLOC (sizeof 
(*fd_ctx->lock_acquired), + priv->child_count, + gf_afr_mt_char); + if (!fd_ctx->lock_acquired) { + ret = -ENOMEM; + goto out; + } - fd_ctx->up_count = priv->up_count; - fd_ctx->down_count = priv->down_count; + fd_ctx->up_count = priv->up_count; + fd_ctx->down_count = priv->down_count; - fd_ctx->locked_on = GF_CALLOC (sizeof (*fd_ctx->locked_on), - priv->child_count, - gf_afr_mt_char); - if (!fd_ctx->locked_on) { - ret = -ENOMEM; - goto unlock; - } + fd_ctx->locked_on = GF_CALLOC (sizeof (*fd_ctx->locked_on), + priv->child_count, + gf_afr_mt_char); + if (!fd_ctx->locked_on) { + ret = -ENOMEM; + goto out; + } - INIT_LIST_HEAD (&fd_ctx->paused_calls); - INIT_LIST_HEAD (&fd_ctx->entries); + pthread_mutex_init (&fd_ctx->delay_lock, NULL); + INIT_LIST_HEAD (&fd_ctx->entries); + fd_ctx->call_child = -1; - ret = __fd_ctx_set (fd, this, (uint64_t)(long) fd_ctx); - if (ret) - gf_log (this->name, GF_LOG_DEBUG, - "failed to set fd ctx (%p)", fd); - } -unlock: - UNLOCK (&fd->lock); + INIT_LIST_HEAD (&fd_ctx->eager_locked); + + ret = __fd_ctx_set (fd, this, (uint64_t)(long) fd_ctx); + if (ret) + gf_log (this->name, GF_LOG_DEBUG, + "failed to set fd ctx (%p)", fd); out: return ret; } -/* {{{ flush */ int -afr_flush_unwind (call_frame_t *frame, xlator_t *this) +afr_fd_ctx_set (xlator_t *this, fd_t *fd) { - afr_local_t * local = NULL; - call_frame_t *main_frame = NULL; - - local = frame->local; + int ret = -1; - LOCK (&frame->lock); + LOCK (&fd->lock); { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (flush, main_frame, - local->op_ret, local->op_errno); + ret = __afr_fd_ctx_set (this, fd); } + UNLOCK (&fd->lock); - return 0; + return ret; } +/* {{{ flush */ int -afr_flush_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) +afr_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + 
int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t * local = NULL; - afr_private_t * priv = NULL; int call_count = -1; - int child_index = (long) cookie; - int need_unwind = 0; local = frame->local; - priv = this->private; LOCK (&frame->lock); { - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - if (op_ret != -1) { if (local->success_count == 0) { local->op_ret = op_ret; } local->success_count++; - - if (local->success_count == priv->wait_count) { - need_unwind = 1; - } } local->op_errno = op_errno; } UNLOCK (&frame->lock); - if (need_unwind) - afr_flush_unwind (frame, this); + call_count = afr_frame_return (frame); - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.resume (frame, this); - } + if (call_count == 0) + AFR_STACK_UNWIND(flush, frame, local->op_ret, + local->op_errno, NULL); return 0; } - -int -afr_flush_wind (call_frame_t *frame, xlator_t *this) +static int +afr_flush_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int i = 0; - int call_count = -1; + int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; - local = frame->local; priv = this->private; - - call_count = afr_up_children_count (local->child_up, priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; + local = frame->local; + call_count = local->call_count; for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_flush_wind_cbk, + STACK_WIND_COOKIE (frame, afr_flush_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->flush, - local->fd); - + local->fd, NULL); if (!--call_count) break; + } } return 0; } - -int -afr_flush_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - - local = frame->local; - - 
local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - - return 0; -} - - int -afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) +afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - int ret = -1; - int op_ret = -1; - int op_errno = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + call_stub_t *stub = NULL; + int ret = -1; + int op_errno = 0; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -2274,51 +2725,27 @@ afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } + ret = afr_local_init(local, priv, &op_errno); + if (ret < 0) + goto out; - transaction_frame = copy_frame (frame); - if (!transaction_frame) { + local->fd = fd_ref(fd); + stub = fop_flush_stub (frame, afr_flush_wrapper, fd, xdata); + if (!stub) { + ret = -1; op_errno = ENOMEM; goto out; } - transaction_frame->local = local; - - local->op = GF_FOP_FLUSH; - - local->transaction.fop = afr_flush_wind; - local->transaction.done = afr_flush_done; - local->transaction.unwind = afr_flush_unwind; - - local->fd = fd_ref (fd); - - local->transaction.main_frame = frame; - local->transaction.start = 0; - local->transaction.len = 0; - - ret = afr_open_fd_fix (transaction_frame, this, _gf_false); - if (ret) { - op_ret = -1; - op_errno = -ret; - goto out; - } - afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); - + afr_delayed_changelog_wake_resume (this, fd, stub); + ret = 0; - op_ret = 0; out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - - AFR_STACK_UNWIND (flush, frame, op_ret, op_errno); - } + if (ret < 0) + AFR_STACK_UNWIND(flush, frame, -1, 
op_errno, NULL); return 0; } @@ -2332,8 +2759,6 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd) uint64_t ctx = 0; afr_fd_ctx_t *fd_ctx = NULL; int ret = 0; - afr_fd_paused_call_t *paused_call = NULL; - afr_fd_paused_call_t *tmp = NULL; ret = fd_ctx_get (fd, this, &ctx); if (ret < 0) @@ -2342,28 +2767,18 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd) fd_ctx = (afr_fd_ctx_t *)(long) ctx; if (fd_ctx) { - if (fd_ctx->pre_op_done) - GF_FREE (fd_ctx->pre_op_done); + GF_FREE (fd_ctx->pre_op_done); - if (fd_ctx->opened_on) - GF_FREE (fd_ctx->opened_on); + GF_FREE (fd_ctx->opened_on); - if (fd_ctx->locked_on) - GF_FREE (fd_ctx->locked_on); + GF_FREE (fd_ctx->locked_on); - if (fd_ctx->pre_op_piggyback) - GF_FREE (fd_ctx->pre_op_piggyback); - list_for_each_entry_safe (paused_call, tmp, &fd_ctx->paused_calls, - call_list) { - list_del_init (&paused_call->call_list); - GF_FREE (paused_call); - } + GF_FREE (fd_ctx->pre_op_piggyback); + GF_FREE (fd_ctx->lock_piggyback); - if (fd_ctx->lock_piggyback) - GF_FREE (fd_ctx->lock_piggyback); + GF_FREE (fd_ctx->lock_acquired); - if (fd_ctx->lock_acquired) - GF_FREE (fd_ctx->lock_acquired); + pthread_mutex_destroy (&fd_ctx->delay_lock); GF_FREE (fd_ctx); } @@ -2401,14 +2816,25 @@ afr_release (xlator_t *this, fd_t *fd) /* {{{ fsync */ int +afr_fsync_unwind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + AFR_STACK_UNWIND (fsync, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; +} + +int afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) + struct iatt *postbuf, dict_t *xdata) { afr_local_t *local = NULL; int call_count = -1; int child_index = (long) cookie; int read_child = 0; + call_stub_t *stub = NULL; local = frame->local; @@ -2424,13 +2850,13 @@ afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, 
local->op_ret = 0; if (local->success_count == 0) { - local->cont.fsync.prebuf = *prebuf; - local->cont.fsync.postbuf = *postbuf; + local->cont.inode_wfop.prebuf = *prebuf; + local->cont.inode_wfop.postbuf = *postbuf; } if (child_index == read_child) { - local->cont.fsync.prebuf = *prebuf; - local->cont.fsync.postbuf = *postbuf; + local->cont.inode_wfop.prebuf = *prebuf; + local->cont.inode_wfop.postbuf = *postbuf; } local->success_count++; @@ -2443,9 +2869,32 @@ afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, call_count = afr_frame_return (frame); if (call_count == 0) { - AFR_STACK_UNWIND (fsync, frame, local->op_ret, local->op_errno, - &local->cont.fsync.prebuf, - &local->cont.fsync.postbuf); + /* Make a stub out of the frame, and register it + with the waking up post-op. When the call-stub resumes, + we are guaranteed that there was no post-op pending + (i.e changelogs were unset in the server). This is an + essential "guarantee", that fsync() returns only after + completely finishing EVERYTHING, including the delayed + post-op. This guarantee is expected by FUSE graph switching + for example. 
+ */ + stub = fop_fsync_cbk_stub (frame, afr_fsync_unwind_cbk, + local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, + xdata); + if (!stub) { + AFR_STACK_UNWIND (fsync, frame, -1, ENOMEM, 0, 0, 0); + return 0; + } + + /* If no new unstable writes happened between the + time we cleared the unstable write witness flag in afr_fsync + and now, calling afr_delayed_changelog_wake_up() should + wake up and skip over the fsync phase and go straight to + afr_changelog_post_op_now() + */ + afr_delayed_changelog_wake_resume (this, local->fd, stub); } return 0; @@ -2454,14 +2903,13 @@ afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t datasync) + int32_t datasync, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_ret = -1; int32_t op_errno = 0; VALIDATE_OR_GOTO (frame, out); @@ -2470,36 +2918,37 @@ afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } call_count = local->call_count; - frame->local = local; local->fd = fd_ref (fd); + if (afr_fd_has_witnessed_unstable_write (this, fd)) { + /* don't care. 
we only wanted to CLEAR the bit */ + } + for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND_COOKIE (frame, afr_fsync_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->fsync, - fd, datasync); + fd, datasync, xdata); if (!--call_count) break; } } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (fsync, frame, op_ret, op_errno, NULL, NULL); - } + if (ret < 0) + AFR_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL, NULL); return 0; } @@ -2509,7 +2958,8 @@ out: int32_t afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *xdata) { afr_local_t *local = NULL; int call_count = -1; @@ -2529,7 +2979,7 @@ afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, if (call_count == 0) AFR_STACK_UNWIND (fsyncdir, frame, local->op_ret, - local->op_errno); + local->op_errno, xdata); return 0; } @@ -2537,14 +2987,13 @@ afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, int32_t afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t datasync) + int32_t datasync, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_ret = -1; int32_t op_errno = 0; VALIDATE_OR_GOTO (frame, out); @@ -2553,33 +3002,30 @@ afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } call_count = local->call_count; - frame->local = local; for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND (frame, afr_fsyncdir_cbk, priv->children[i], priv->children[i]->fops->fsyncdir, - fd, datasync); + fd, datasync, xdata); if 
(!--call_count) break; } } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (fsyncdir, frame, op_ret, op_errno); - } + if (ret < 0) + AFR_STACK_UNWIND (fsyncdir, frame, -1, op_errno, NULL); return 0; } @@ -2590,7 +3036,7 @@ out: int32_t afr_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *xattr) + dict_t *xattr, dict_t *xdata) { afr_local_t *local = NULL; int call_count = -1; @@ -2599,8 +3045,11 @@ afr_xattrop_cbk (call_frame_t *frame, void *cookie, LOCK (&frame->lock); { - if (op_ret == 0) + if (op_ret == 0) { + if (!local->cont.xattrop.xattr) + local->cont.xattrop.xattr = dict_ref (xattr); local->op_ret = 0; + } local->op_errno = op_errno; } @@ -2610,7 +3059,7 @@ afr_xattrop_cbk (call_frame_t *frame, void *cookie, if (call_count == 0) AFR_STACK_UNWIND (xattrop, frame, local->op_ret, local->op_errno, - xattr); + local->cont.xattrop.xattr, xdata); return 0; } @@ -2618,14 +3067,13 @@ afr_xattrop_cbk (call_frame_t *frame, void *cookie, int32_t afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, - gf_xattrop_flags_t optype, dict_t *xattr) + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_ret = -1; int32_t op_errno = 0; VALIDATE_OR_GOTO (frame, out); @@ -2634,33 +3082,30 @@ afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } call_count = local->call_count; - frame->local = local; for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND (frame, afr_xattrop_cbk, priv->children[i], priv->children[i]->fops->xattrop, - loc, optype, xattr); + 
loc, optype, xattr, xdata); if (!--call_count) break; } } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (xattrop, frame, op_ret, op_errno, NULL); - } + if (ret < 0) + AFR_STACK_UNWIND (xattrop, frame, -1, op_errno, NULL, NULL); return 0; } @@ -2671,7 +3116,7 @@ out: int32_t afr_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *xattr) + dict_t *xattr, dict_t *xdata) { afr_local_t *local = NULL; @@ -2681,8 +3126,12 @@ afr_fxattrop_cbk (call_frame_t *frame, void *cookie, LOCK (&frame->lock); { - if (op_ret == 0) + if (op_ret == 0) { + if (!local->cont.fxattrop.xattr) + local->cont.fxattrop.xattr = dict_ref (xattr); + local->op_ret = 0; + } local->op_errno = op_errno; } @@ -2692,7 +3141,7 @@ afr_fxattrop_cbk (call_frame_t *frame, void *cookie, if (call_count == 0) AFR_STACK_UNWIND (fxattrop, frame, local->op_ret, local->op_errno, - xattr); + local->cont.fxattrop.xattr, xdata); return 0; } @@ -2700,14 +3149,13 @@ afr_fxattrop_cbk (call_frame_t *frame, void *cookie, int32_t afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, - gf_xattrop_flags_t optype, dict_t *xattr) + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_ret = -1; int32_t op_errno = 0; VALIDATE_OR_GOTO (frame, out); @@ -2716,33 +3164,30 @@ afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } call_count = local->call_count; - frame->local = local; for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND (frame, afr_fxattrop_cbk, priv->children[i], 
priv->children[i]->fops->fxattrop, - fd, optype, xattr); + fd, optype, xattr, xdata); if (!--call_count) break; } } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (fxattrop, frame, op_ret, op_errno, NULL); - } + if (ret < 0) + AFR_STACK_UNWIND (fxattrop, frame, -1, op_errno, NULL, NULL); return 0; } @@ -2751,7 +3196,7 @@ out: int32_t afr_inodelk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) + xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t *local = NULL; @@ -2772,7 +3217,7 @@ afr_inodelk_cbk (call_frame_t *frame, void *cookie, if (call_count == 0) AFR_STACK_UNWIND (inodelk, frame, local->op_ret, - local->op_errno); + local->op_errno, xdata); return 0; } @@ -2780,14 +3225,14 @@ afr_inodelk_cbk (call_frame_t *frame, void *cookie, int32_t afr_inodelk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, int32_t cmd, struct gf_flock *flock) + const char *volume, loc_t *loc, int32_t cmd, + struct gf_flock *flock, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_ret = -1; int32_t op_errno = 0; VALIDATE_OR_GOTO (frame, out); @@ -2796,41 +3241,39 @@ afr_inodelk (call_frame_t *frame, xlator_t *this, priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } call_count = local->call_count; - frame->local = local; for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND (frame, afr_inodelk_cbk, priv->children[i], priv->children[i]->fops->inodelk, - volume, loc, cmd, flock); + volume, loc, cmd, flock, xdata); if (!--call_count) break; } } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (inodelk, frame, 
op_ret, op_errno); - } + if (ret < 0) + AFR_STACK_UNWIND (inodelk, frame, -1, op_errno, NULL); return 0; } int32_t afr_finodelk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *xdata) { afr_local_t *local = NULL; @@ -2851,7 +3294,7 @@ afr_finodelk_cbk (call_frame_t *frame, void *cookie, if (call_count == 0) AFR_STACK_UNWIND (finodelk, frame, local->op_ret, - local->op_errno); + local->op_errno, xdata); return 0; } @@ -2859,14 +3302,14 @@ afr_finodelk_cbk (call_frame_t *frame, void *cookie, int32_t afr_finodelk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *flock) + const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *flock, + dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_ret = -1; int32_t op_errno = 0; VALIDATE_OR_GOTO (frame, out); @@ -2875,42 +3318,38 @@ afr_finodelk (call_frame_t *frame, xlator_t *this, priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } call_count = local->call_count; - frame->local = local; for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND (frame, afr_finodelk_cbk, priv->children[i], priv->children[i]->fops->finodelk, - volume, fd, cmd, flock); + volume, fd, cmd, flock, xdata); if (!--call_count) break; } } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (finodelk, frame, op_ret, op_errno); - } + if (ret < 0) + AFR_STACK_UNWIND (finodelk, frame, -1, op_errno, NULL); return 0; } int32_t -afr_entrylk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) - +afr_entrylk_cbk 
(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t *local = NULL; int call_count = -1; @@ -2930,7 +3369,7 @@ afr_entrylk_cbk (call_frame_t *frame, void *cookie, if (call_count == 0) AFR_STACK_UNWIND (entrylk, frame, local->op_ret, - local->op_errno); + local->op_errno, xdata); return 0; } @@ -2939,14 +3378,14 @@ afr_entrylk_cbk (call_frame_t *frame, void *cookie, int32_t afr_entrylk (call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, - const char *basename, entrylk_cmd cmd, entrylk_type type) + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_ret = -1; int32_t op_errno = 0; VALIDATE_OR_GOTO (frame, out); @@ -2955,34 +3394,31 @@ afr_entrylk (call_frame_t *frame, xlator_t *this, priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } call_count = local->call_count; - frame->local = local; for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND (frame, afr_entrylk_cbk, priv->children[i], priv->children[i]->fops->entrylk, - volume, loc, basename, cmd, type); + volume, loc, basename, cmd, type, xdata); if (!--call_count) break; } } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (entrylk, frame, op_ret, op_errno); - } + if (ret < 0) + AFR_STACK_UNWIND (entrylk, frame, -1, op_errno, NULL); return 0; } @@ -2990,7 +3426,7 @@ out: int32_t afr_fentrylk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) + xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t *local = NULL; @@ -3011,7 +3447,7 @@ afr_fentrylk_cbk 
(call_frame_t *frame, void *cookie, if (call_count == 0) AFR_STACK_UNWIND (fentrylk, frame, local->op_ret, - local->op_errno); + local->op_errno, xdata); return 0; } @@ -3020,14 +3456,14 @@ afr_fentrylk_cbk (call_frame_t *frame, void *cookie, int32_t afr_fentrylk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, - const char *basename, entrylk_cmd cmd, entrylk_type type) + const char *basename, entrylk_cmd cmd, + entrylk_type type, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_ret = -1; int32_t op_errno = 0; VALIDATE_OR_GOTO (frame, out); @@ -3036,41 +3472,38 @@ afr_fentrylk (call_frame_t *frame, xlator_t *this, priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } call_count = local->call_count; - frame->local = local; for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND (frame, afr_fentrylk_cbk, priv->children[i], priv->children[i]->fops->fentrylk, - volume, fd, basename, cmd, type); + volume, fd, basename, cmd, type, xdata); if (!--call_count) break; } } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (fentrylk, frame, op_ret, op_errno); - } + if (ret < 0) + AFR_STACK_UNWIND (fentrylk, frame, -1, op_errno, NULL); return 0; } int32_t afr_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - struct statvfs *statvfs) + struct statvfs *statvfs, dict_t *xdata) { afr_local_t *local = NULL; int call_count = 0; @@ -3101,7 +3534,7 @@ afr_statfs_cbk (call_frame_t *frame, void *cookie, if (call_count == 0) AFR_STACK_UNWIND (statfs, frame, local->op_ret, local->op_errno, - &local->cont.statfs.buf); + &local->cont.statfs.buf, xdata); return 0; } @@ 
-3109,7 +3542,7 @@ afr_statfs_cbk (call_frame_t *frame, void *cookie, int32_t afr_statfs (call_frame_t *frame, xlator_t *this, - loc_t *loc) + loc_t *loc, dict_t *xdata) { afr_private_t * priv = NULL; int child_count = 0; @@ -3117,7 +3550,6 @@ afr_statfs (call_frame_t *frame, xlator_t *this, int i = 0; int ret = -1; int call_count = 0; - int32_t op_ret = -1; int32_t op_errno = 0; VALIDATE_OR_GOTO (this, out); @@ -3127,15 +3559,13 @@ afr_statfs (call_frame_t *frame, xlator_t *this, priv = this->private; child_count = priv->child_count; - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } - frame->local = local; call_count = local->call_count; for (i = 0; i < child_count; i++) { @@ -3143,24 +3573,24 @@ afr_statfs (call_frame_t *frame, xlator_t *this, STACK_WIND (frame, afr_statfs_cbk, priv->children[i], priv->children[i]->fops->statfs, - loc); + loc, xdata); if (!--call_count) break; } } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (statfs, frame, op_ret, op_errno, NULL); - } + if (ret < 0) + AFR_STACK_UNWIND (statfs, frame, -1, op_errno, NULL, NULL); return 0; } int32_t afr_lk_unlock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct gf_flock *lock) + int32_t op_ret, int32_t op_errno, struct gf_flock *lock, + dict_t *xdata) { afr_local_t * local = NULL; int call_count = -1; @@ -3170,7 +3600,7 @@ afr_lk_unlock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (call_count == 0) AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno, - lock); + lock, xdata); return 0; } @@ -3192,7 +3622,7 @@ afr_lk_unlock (call_frame_t *frame, xlator_t *this) if (call_count == 0) { AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno, - &local->cont.lk.ret_flock); + 
&local->cont.lk.ret_flock, NULL); return 0; } @@ -3206,7 +3636,7 @@ afr_lk_unlock (call_frame_t *frame, xlator_t *this) priv->children[i], priv->children[i]->fops->lk, local->fd, F_SETLK, - &local->cont.lk.user_flock); + &local->cont.lk.user_flock, NULL); if (!--call_count) break; @@ -3219,7 +3649,7 @@ afr_lk_unlock (call_frame_t *frame, xlator_t *this) int32_t afr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct gf_flock *lock) + int32_t op_ret, int32_t op_errno, struct gf_flock *lock, dict_t *xdata) { afr_local_t *local = NULL; afr_private_t *priv = NULL; @@ -3254,12 +3684,12 @@ afr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, priv->children[child_index], priv->children[child_index]->fops->lk, local->fd, local->cont.lk.cmd, - &local->cont.lk.user_flock); + &local->cont.lk.user_flock, xdata); } else if (local->op_ret == -1) { /* all nodes have gone down */ AFR_STACK_UNWIND (lk, frame, -1, ENOTCONN, - &local->cont.lk.ret_flock); + &local->cont.lk.ret_flock, NULL); } else { /* locking has succeeded on all nodes that are up */ @@ -3277,7 +3707,7 @@ afr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, */ AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno, - &local->cont.lk.ret_flock); + &local->cont.lk.ret_flock, NULL); } return 0; @@ -3286,13 +3716,13 @@ afr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int afr_lk (call_frame_t *frame, xlator_t *this, - fd_t *fd, int32_t cmd, struct gf_flock *flock) + fd_t *fd, int32_t cmd, struct gf_flock *flock, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; int i = 0; - int32_t op_ret = -1; int32_t op_errno = 0; + int ret = -1; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -3300,10 +3730,12 @@ afr_lk (call_frame_t *frame, xlator_t *this, priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); - AFR_LOCAL_INIT (local, priv); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = 
frame->local; - frame->local = local; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) + goto out; local->cont.lk.locked_nodes = GF_CALLOC (priv->child_count, sizeof (*local->cont.lk.locked_nodes), @@ -3322,13 +3754,12 @@ afr_lk (call_frame_t *frame, xlator_t *this, STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) 0, priv->children[i], priv->children[i]->fops->lk, - fd, cmd, flock); + fd, cmd, flock, xdata); - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (lk, frame, op_ret, op_errno, NULL); - } + if (ret < 0) + AFR_STACK_UNWIND (lk, frame, -1, op_errno, NULL, NULL); return 0; } @@ -3344,8 +3775,7 @@ afr_forget (xlator_t *this, inode_t *inode) goto out; ctx = (afr_inode_ctx_t *)(long)ctx_addr; - if (ctx->fresh_children) - GF_FREE (ctx->fresh_children); + GF_FREE (ctx->fresh_children); GF_FREE (ctx); out: return 0; @@ -3382,11 +3812,6 @@ afr_priv_dump (xlator_t *this) gf_proc_dump_write("entry-change_log", "%d", priv->entry_change_log); gf_proc_dump_write("read_child", "%d", priv->read_child); gf_proc_dump_write("favorite_child", "%d", priv->favorite_child); - gf_proc_dump_write("data_lock_server_count", "%u", priv->data_lock_server_count); - gf_proc_dump_write("metadata_lock_server_count", "%u", - priv->metadata_lock_server_count); - gf_proc_dump_write("entry_lock_server_count", "%u", - priv->entry_lock_server_count); gf_proc_dump_write("wait_count", "%u", priv->wait_count); return 0; @@ -3417,7 +3842,7 @@ find_child_index (xlator_t *this, xlator_t *child) int32_t afr_notify (xlator_t *this, int32_t event, - void *data, ...) 
+ void *data, void *data2) { afr_private_t *priv = NULL; int i = -1; @@ -3430,12 +3855,22 @@ afr_notify (xlator_t *this, int32_t event, int ret = -1; int call_psh = 0; int up_child = AFR_ALL_CHILDREN; + dict_t *input = NULL; + dict_t *output = NULL; priv = this->private; if (!priv) return 0; + /* + * We need to reset this in case children come up in "staggered" + * fashion, so that we discover a late-arriving local subvolume. Note + * that we could end up issuing N lookups to the first subvolume, and + * O(N^2) overall, but N is small for AFR so it shouldn't be an issue. + */ + priv->did_discovery = _gf_false; + had_heard_from_all = 1; for (i = 0; i < priv->child_count; i++) { if (!priv->last_event[i]) { @@ -3446,7 +3881,7 @@ afr_notify (xlator_t *this, int32_t event, /* parent xlators dont need to know about every child_up, child_down * because of afr ha. If all subvolumes go down, child_down has * to be triggered. In that state when 1 subvolume comes up child_up - * needs to be triggered. dht optimises revalidate lookup by sending + * needs to be triggered. dht optimizes revalidate lookup by sending * it only to one of its subvolumes. When child up/down happens * for afr's subvolumes dht should be notified by child_modified. The * subsequent revalidate lookup happens on all the dht's subvolumes @@ -3463,9 +3898,19 @@ afr_notify (xlator_t *this, int32_t event, case GF_EVENT_CHILD_UP: LOCK (&priv->lock); { + /* + * This only really counts if the child was never up + * (value = -1) or had been down (value = 0). See + * comment at GF_EVENT_CHILD_DOWN for a more detailed + * explanation. 
+ */ + if (priv->child_up[idx] != 1) { + priv->up_count++; + } priv->child_up[idx] = 1; - priv->up_count++; + call_psh = 1; + up_child = idx; for (i = 0; i < priv->child_count; i++) if (priv->child_up[i] == 1) up_children++; @@ -3475,12 +3920,6 @@ afr_notify (xlator_t *this, int32_t event, "going online.", ((xlator_t *)data)->name); } else { event = GF_EVENT_CHILD_MODIFIED; - gf_log (this->name, GF_LOG_INFO, "subvol %d came up, " - "start crawl", idx); - if (had_heard_from_all) { - call_psh = 1; - up_child = idx; - } } priv->last_event[idx] = event; @@ -3492,8 +3931,22 @@ afr_notify (xlator_t *this, int32_t event, case GF_EVENT_CHILD_DOWN: LOCK (&priv->lock); { + /* + * If a brick is down when we start, we'll get a + * CHILD_DOWN to indicate its initial state. There + * was never a CHILD_UP in this case, so if we + * increment "down_count" the difference between than + * and "up_count" will no longer be the number of + * children that are currently up. This has serious + * implications e.g. for quorum enforcement, so we + * don't increment these values unless the event + * represents an actual state transition between "up" + * (value = 1) and anything else. + */ + if (priv->child_up[idx] == 1) { + priv->down_count++; + } priv->child_up[idx] = 0; - priv->down_count++; for (i = 0; i < priv->child_count; i++) if (priv->child_up[i] == 0) @@ -3521,10 +3974,11 @@ afr_notify (xlator_t *this, int32_t event, break; - case GF_EVENT_TRIGGER_HEAL: - gf_log (this->name, GF_LOG_INFO, "Self-heal was triggered" - " manually. 
Start crawling"); - call_psh = 1; + case GF_EVENT_TRANSLATOR_OP: + input = data; + output = data2; + ret = afr_xl_op (this, input, output); + goto out; break; default: @@ -3569,18 +4023,13 @@ afr_notify (xlator_t *this, int32_t event, } } UNLOCK (&priv->lock); - if (up_children > 1) { - gf_log (this->name, GF_LOG_INFO, "All subvolumes came " - "up, start crawl"); - call_psh = 1; - } } ret = 0; if (propagate) ret = default_notify (this, event, data); - if (call_psh) - afr_proactive_self_heal (this, up_child); + if (call_psh && priv->shd.iamshd) + afr_proactive_self_heal ((void*) (long) up_child); out: return ret; @@ -3605,29 +4054,56 @@ afr_first_up_child (unsigned char *child_up, size_t child_count) } int -AFR_LOCAL_INIT (afr_local_t *local, afr_private_t *priv) +afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno) { + int ret = -1; + local->op_ret = -1; local->op_errno = EUCLEAN; - local->call_count = afr_up_children_count (priv->child_up, + + local->child_up = GF_CALLOC (priv->child_count, + sizeof (*local->child_up), + gf_afr_mt_char); + if (!local->child_up) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } + + memcpy (local->child_up, priv->child_up, + sizeof (*local->child_up) * priv->child_count); + local->call_count = afr_up_children_count (local->child_up, priv->child_count); if (local->call_count == 0) { gf_log (THIS->name, GF_LOG_INFO, "no subvolumes up"); - return -ENOTCONN; + if (op_errno) + *op_errno = ENOTCONN; + goto out; } + local->child_errno = GF_CALLOC (priv->child_count, + sizeof (*local->child_errno), + gf_afr_mt_int32_t); + if (!local->child_errno) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } - local->child_up = GF_CALLOC (sizeof (*local->child_up), - priv->child_count, - gf_afr_mt_char); - if (!local->child_up) { - return -ENOMEM; + local->transaction.postop_piggybacked = GF_CALLOC (priv->child_count, + sizeof (int), + gf_afr_mt_int32_t); + if (!local->transaction.postop_piggybacked) { + if (op_errno) 
+ *op_errno = ENOMEM; + goto out; } - memcpy (local->child_up, priv->child_up, - sizeof (*local->child_up) * priv->child_count); + local->append_write = _gf_false; - return 0; + ret = 0; +out: + return ret; } int @@ -3636,16 +4112,6 @@ afr_internal_lock_init (afr_internal_lock_t *lk, size_t child_count, { int ret = -ENOMEM; - lk->inode_locked_nodes = GF_CALLOC (sizeof (*lk->inode_locked_nodes), - child_count, gf_afr_mt_char); - if (NULL == lk->inode_locked_nodes) - goto out; - - lk->entry_locked_nodes = GF_CALLOC (sizeof (*lk->entry_locked_nodes), - child_count, gf_afr_mt_char); - if (NULL == lk->entry_locked_nodes) - goto out; - lk->locked_nodes = GF_CALLOC (sizeof (*lk->locked_nodes), child_count, gf_afr_mt_char); if (NULL == lk->locked_nodes) @@ -3665,10 +4131,62 @@ out: return ret; } +void +afr_matrix_cleanup (int32_t **matrix, unsigned int m) +{ + int i = 0; + + if (!matrix) + goto out; + for (i = 0; i < m; i++) { + GF_FREE (matrix[i]); + } + + GF_FREE (matrix); +out: + return; +} + +int32_t** +afr_matrix_create (unsigned int m, unsigned int n) +{ + int32_t **matrix = NULL; + int i = 0; + + matrix = GF_CALLOC (sizeof (*matrix), m, gf_afr_mt_int32_t); + if (!matrix) + goto out; + + for (i = 0; i < m; i++) { + matrix[i] = GF_CALLOC (sizeof (*matrix[i]), n, + gf_afr_mt_int32_t); + if (!matrix[i]) + goto out; + } + return matrix; +out: + afr_matrix_cleanup (matrix, m); + return NULL; +} + +int +afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count) +{ + int ret = -ENOMEM; + + lk->domain = dom; + lk->locked_nodes = GF_CALLOC (sizeof (*lk->locked_nodes), + child_count, gf_afr_mt_char); + if (NULL == lk->locked_nodes) + goto out; + ret = 0; +out: + return ret; +} + int afr_transaction_local_init (afr_local_t *local, xlator_t *this) { - int i = 0; int child_up_count = 0; int ret = -ENOMEM; afr_private_t *priv = NULL; @@ -3679,6 +4197,14 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this) if (ret < 0) goto out; + if ((local->transaction.type 
== AFR_DATA_TRANSACTION) || + (local->transaction.type == AFR_METADATA_TRANSACTION)) { + ret = afr_inodelk_init (&local->internal_lock.inodelk[0], + this->name, priv->child_count); + if (ret < 0) + goto out; + } + ret = -ENOMEM; child_up_count = afr_up_children_count (local->child_up, priv->child_count); @@ -3688,12 +4214,6 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this) local->first_up_child = afr_first_up_child (local->child_up, priv->child_count); - local->child_errno = GF_CALLOC (sizeof (*local->child_errno), - priv->child_count, - gf_afr_mt_int32_t); - if (!local->child_errno) - goto out; - local->transaction.eager_lock = GF_CALLOC (sizeof (*local->transaction.eager_lock), priv->child_count, @@ -3702,44 +4222,27 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this) if (!local->transaction.eager_lock) goto out; - local->pending = GF_CALLOC (sizeof (*local->pending), - priv->child_count, - gf_afr_mt_int32_t); - - if (!local->pending) - goto out; - local->fresh_children = afr_children_create (priv->child_count); if (!local->fresh_children) goto out; - if (local->fd) { - local->fd_open_on = GF_CALLOC (sizeof (*local->fd_open_on), - priv->child_count, - gf_afr_mt_char); - if (!local->fd_open_on) - goto out; - } - local->transaction.pre_op = GF_CALLOC (sizeof (*local->transaction.pre_op), priv->child_count, gf_afr_mt_char); if (!local->transaction.pre_op) goto out; - for (i = 0; i < priv->child_count; i++) { - local->pending[i] = GF_CALLOC (sizeof (*local->pending[i]), - 3, /* data + metadata + entry */ - gf_afr_mt_int32_t); - if (!local->pending[i]) - goto out; - } + local->pending = afr_matrix_create (priv->child_count, + AFR_NUM_CHANGE_LOGS); + if (!local->pending) + goto out; - local->transaction.child_errno = - GF_CALLOC (sizeof (*local->transaction.child_errno), - priv->child_count, - gf_afr_mt_int32_t); - local->transaction.erase_pending = 1; + local->transaction.txn_changelog = afr_matrix_create (priv->child_count, + 
AFR_NUM_CHANGE_LOGS); + if (!local->transaction.txn_changelog) + goto out; + + INIT_LIST_HEAD (&local->transaction.eager_locked); ret = 0; out: @@ -3835,7 +4338,7 @@ afr_set_low_priority (call_frame_t *frame) int afr_child_fd_ctx_set (xlator_t *this, fd_t *fd, int32_t child, - int flags, int32_t wbflags) + int flags) { int ret = 0; uint64_t ctx = 0; @@ -3860,9 +4363,229 @@ afr_child_fd_ctx_set (xlator_t *this, fd_t *fd, int32_t child, fd_ctx->opened_on[child] = AFR_FD_OPENED; if (!IA_ISDIR (fd->inode->ia_type)) { fd_ctx->flags = flags; - fd_ctx->wbflags = wbflags; } ret = 0; out: return ret; } + +gf_boolean_t +afr_have_quorum (char *logname, afr_private_t *priv) +{ + unsigned int quorum = 0; + + GF_VALIDATE_OR_GOTO(logname,priv,out); + + quorum = priv->quorum_count; + if (quorum != AFR_QUORUM_AUTO) { + return (priv->up_count >= (priv->down_count + quorum)); + } + + quorum = priv->child_count / 2 + 1; + if (priv->up_count >= (priv->down_count + quorum)) { + return _gf_true; + } + + /* + * Special case for even numbers of nodes: if we have exactly half + * and that includes the first ("senior-most") node, then that counts + * as quorum even if it wouldn't otherwise. This supports e.g. N=2 + * while preserving the critical property that there can only be one + * such group. 
+ */ + if ((priv->child_count % 2) == 0) { + quorum = priv->child_count / 2; + if (priv->up_count >= (priv->down_count + quorum)) { + if (priv->child_up[0]) { + return _gf_true; + } + } + } + +out: + return _gf_false; +} + +void +afr_priv_destroy (afr_private_t *priv) +{ + int i = 0; + + if (!priv) + goto out; + inode_unref (priv->root_inode); + GF_FREE (priv->shd.pos); + GF_FREE (priv->shd.pending); + GF_FREE (priv->shd.inprogress); +// for (i = 0; i < priv->child_count; i++) +// if (priv->shd.timer && priv->shd.timer[i]) +// gf_timer_call_cancel (this->ctx, priv->shd.timer[i]); + GF_FREE (priv->shd.timer); + + if (priv->shd.healed) + eh_destroy (priv->shd.healed); + + if (priv->shd.heal_failed) + eh_destroy (priv->shd.heal_failed); + + if (priv->shd.split_brain) + eh_destroy (priv->shd.split_brain); + + for (i = 0; i < priv->child_count; i++) + { + if (priv->shd.statistics[i]) + eh_destroy (priv->shd.statistics[i]); + } + + GF_FREE (priv->shd.statistics); + + GF_FREE (priv->shd.crawl_events); + + GF_FREE (priv->last_event); + if (priv->pending_key) { + for (i = 0; i < priv->child_count; i++) + GF_FREE (priv->pending_key[i]); + } + GF_FREE (priv->pending_key); + GF_FREE (priv->children); + GF_FREE (priv->child_up); + LOCK_DESTROY (&priv->lock); + LOCK_DESTROY (&priv->read_child_lock); + pthread_mutex_destroy (&priv->mutex); + GF_FREE (priv); +out: + return; +} + +int +xlator_subvolume_count (xlator_t *this) +{ + int i = 0; + xlator_list_t *list = NULL; + + for (list = this->children; list; list = list->next) + i++; + return i; +} + +inline gf_boolean_t +afr_is_errno_set (int *child_errno, int child) +{ + return child_errno[child]; +} + +inline gf_boolean_t +afr_is_errno_unset (int *child_errno, int child) +{ + return !afr_is_errno_set (child_errno, child); +} + +void +afr_prepare_new_entry_pending_matrix (int32_t **pending, + gf_boolean_t (*is_pending) (int *, int), + int *ctx, struct iatt *buf, + unsigned int child_count) +{ + int midx = 0; + int idx = 0; + int i 
= 0; + + midx = afr_index_for_transaction_type (AFR_METADATA_TRANSACTION); + if (IA_ISDIR (buf->ia_type)) + idx = afr_index_for_transaction_type (AFR_ENTRY_TRANSACTION); + else if (IA_ISREG (buf->ia_type)) + idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION); + else + idx = -1; + for (i = 0; i < child_count; i++) { + if (is_pending (ctx, i)) { + pending[i][midx] = hton32 (1); + if (idx == -1) + continue; + pending[i][idx] = hton32 (1); + } + } +} + +gf_boolean_t +afr_is_fd_fixable (fd_t *fd) +{ + if (!fd || !fd->inode) + return _gf_false; + else if (fd_is_anonymous (fd)) + return _gf_false; + else if (uuid_is_null (fd->inode->gfid)) + return _gf_false; + + return _gf_true; +} + +void +afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + inode_t *inode = NULL; + afr_inode_ctx_t *ctx = NULL; + + local = frame->local; + + if (local->fd) + inode = local->fd->inode; + else + inode = local->loc.inode; + + if (!inode) + return; + + LOCK (&inode->lock); + { + ctx = __afr_inode_ctx_get (inode, this); + ctx->open_fd_count = local->open_fd_count; + } + UNLOCK (&inode->lock); +} + +int +afr_initialise_statistics (xlator_t *this) +{ + afr_private_t *priv = NULL; + int ret = -1; + int i = 0; + int child_count = 0; + eh_t *stats_per_brick = NULL; + shd_crawl_event_t ***shd_crawl_events = NULL; + priv = this->private; + + priv->shd.statistics = GF_CALLOC (sizeof(eh_t *), priv->child_count, + gf_common_mt_eh_t); + if (!priv->shd.statistics) { + ret = -1; + goto out; + } + child_count = priv->child_count; + for (i=0; i < child_count ; i++) { + stats_per_brick = eh_new (AFR_STATISTICS_HISTORY_SIZE, + _gf_false, + _destroy_crawl_event_data); + if (!stats_per_brick) { + ret = -1; + goto out; + } + priv->shd.statistics[i] = stats_per_brick; + + } + + shd_crawl_events = (shd_crawl_event_t***)(&priv->shd.crawl_events); + *shd_crawl_events = GF_CALLOC (sizeof(shd_crawl_event_t*), + priv->child_count, + gf_afr_mt_shd_crawl_event_t); + + 
if (!priv->shd.crawl_events) { + ret = -1; + goto out; + } + ret = 0; +out: + return ret; + +} diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c index f2e6760cf..689dd84e6 100644 --- a/xlators/cluster/afr/src/afr-dir-read.c +++ b/xlators/cluster/afr/src/afr-dir-read.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ @@ -51,7 +42,7 @@ int afr_examine_dir_sh_unwind (call_frame_t *frame, xlator_t *this, int32_t op_ret, - int32_t op_errno) + int32_t op_errno, int32_t sh_failed) { afr_local_t *local = NULL; @@ -60,7 +51,7 @@ afr_examine_dir_sh_unwind (call_frame_t *frame, xlator_t *this, int32_t op_ret, afr_set_opendir_done (this, local->fd->inode); AFR_STACK_UNWIND (opendir, frame, local->op_ret, - local->op_errno, local->fd); + local->op_errno, local->fd, NULL); return 0; } @@ -99,7 +90,7 @@ __checksums_differ (uint32_t *checksum, int child_count, int32_t afr_examine_dir_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - gf_dirent_t *entries) + gf_dirent_t *entries, dict_t *xdata) { afr_private_t * priv = NULL; afr_local_t * local = NULL; @@ -137,7 +128,7 @@ afr_examine_dir_readdir_cbk (call_frame_t *frame, void *cookie, } list_for_each_entry_safe (entry, tmp, &entries->list, list) { - entry_cksum = gf_rsync_weak_checksum (entry->d_name, + entry_cksum = gf_rsync_weak_checksum ((unsigned char *)entry->d_name, strlen (entry->d_name)); local->cont.opendir.checksum[child_index] ^= entry_cksum; } @@ -152,7 +143,7 @@ afr_examine_dir_readdir_cbk (call_frame_t *frame, void *cookie, (void *) (long) child_index, priv->children[child_index], priv->children[child_index]->fops->readdir, - local->fd, 131072, last_offset); + local->fd, 131072, last_offset, NULL); return 0; @@ -175,7 +166,7 @@ out: afr_set_opendir_done (this, inode); AFR_STACK_UNWIND (opendir, frame, local->op_ret, - local->op_errno, local->fd); + local->op_errno, local->fd, NULL); } } @@ -208,7 +199,7 @@ afr_examine_dir (call_frame_t *frame, xlator_t *this) (void *) (long) i, priv->children[i], priv->children[i]->fops->readdir, - local->fd, 131072, 0); + local->fd, 131072, 0, NULL); if (!--call_count) break; @@ -222,7 +213,7 @@ afr_examine_dir (call_frame_t *frame, xlator_t *this) int32_t afr_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, 
int32_t op_errno, - fd_t *fd) + fd_t *fd, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; @@ -242,8 +233,7 @@ afr_opendir_cbk (call_frame_t *frame, void *cookie, { if (op_ret >= 0) { local->op_ret = op_ret; - ret = afr_child_fd_ctx_set (this, fd, child_index, - 0, 0); + ret = afr_child_fd_ctx_set (this, fd, child_index, 0); if (ret) { local->op_ret = -1; local->op_errno = -ret; @@ -263,7 +253,7 @@ unlock: goto out; if (!afr_is_opendir_done (this, local->fd->inode) && - up_children_count > 1) { + up_children_count > 1 && priv->entry_self_heal) { /* * This is the first opendir on this inode. We need @@ -272,7 +262,7 @@ unlock: * to regular entry self-heal because the readdir * call is sent only to the first subvolume, and * thus files that exist only there will never be healed - * otherwise (assuming changelog shows no anamolies). + * otherwise (assuming changelog shows no anomalies). */ gf_log (this->name, GF_LOG_TRACE, @@ -291,7 +281,7 @@ unlock: out: AFR_STACK_UNWIND (opendir, frame, local->op_ret, - local->op_errno, local->fd); + local->op_errno, local->fd, NULL); return 0; } @@ -307,7 +297,6 @@ afr_opendir (call_frame_t *frame, xlator_t *this, int i = 0; int ret = -1; int call_count = -1; - int32_t op_ret = -1; int32_t op_errno = 0; VALIDATE_OR_GOTO (frame, out); @@ -318,16 +307,15 @@ afr_opendir (call_frame_t *frame, xlator_t *this, child_count = priv->child_count; - ALLOC_OR_GOTO (local, afr_local_t, out); - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; + + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } loc_copy (&local->loc, loc); - frame->local = local; local->fd = fd_ref (fd); call_count = local->call_count; @@ -338,18 +326,17 @@ afr_opendir (call_frame_t *frame, xlator_t *this, (void*) (long) i, priv->children[i], priv->children[i]->fops->opendir, - loc, fd); + loc, fd, NULL); if (!--call_count) break; } } - 
op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (opendir, frame, op_ret, op_errno, fd); - } + if (ret < 0) + AFR_STACK_UNWIND (opendir, frame, -1, op_errno, fd, NULL); return 0; } @@ -371,85 +358,6 @@ struct entry_name { struct list_head list; }; - -static gf_boolean_t -remembered_name (const char *name, struct list_head *entries) -{ - struct entry_name *e = NULL; - gf_boolean_t ret = _gf_false; - - list_for_each_entry (e, entries, list) { - if (!strcmp (name, e->name)) { - ret = _gf_true; - goto out; - } - } - -out: - return ret; -} - - -static void -afr_remember_entries (gf_dirent_t *entries, fd_t *fd) -{ - struct entry_name *n = NULL; - gf_dirent_t *entry = NULL; - int ret = 0; - uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; - - ret = fd_ctx_get (fd, THIS, &ctx); - if (ret < 0) { - gf_log (THIS->name, GF_LOG_INFO, - "could not get fd ctx for fd=%p", fd); - return; - } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - list_for_each_entry (entry, &entries->list, list) { - n = GF_CALLOC (1, sizeof (*n), gf_afr_mt_entry_name); - n->name = gf_strdup (entry->d_name); - INIT_LIST_HEAD (&n->list); - - list_add (&n->list, &fd_ctx->entries); - } -} - - -static off_t -afr_filter_entries (gf_dirent_t *entries, fd_t *fd) -{ - gf_dirent_t *entry = NULL; - gf_dirent_t *tmp = NULL; - int ret = 0; - uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; - off_t offset = 0; - - ret = fd_ctx_get (fd, THIS, &ctx); - if (ret < 0) { - gf_log (THIS->name, GF_LOG_INFO, - "could not get fd ctx for fd=%p", fd); - return -1; - } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - list_for_each_entry_safe (entry, tmp, &entries->list, list) { - offset = entry->d_off; - - if (remembered_name (entry->d_name, &fd_ctx->entries)) { - list_del (&entry->list); - GF_FREE (entry); - } - } - - return offset; -} - - static void afr_forget_entries (fd_t *fd) { @@ -475,174 +383,70 @@ afr_forget_entries (fd_t *fd) } } - -int32_t -afr_readdir_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, 
int32_t op_ret, int32_t op_errno, - gf_dirent_t *entries) +static void +afr_readdir_filter_trash_dir (gf_dirent_t *entries, fd_t *fd) { - afr_local_t * local = NULL; gf_dirent_t * entry = NULL; gf_dirent_t * tmp = NULL; - local = frame->local; - - if (op_ret == -1) - goto out; - list_for_each_entry_safe (entry, tmp, &entries->list, list) { - if ((local->fd->inode == local->fd->inode->table->root) - && !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) { + if (__is_root_gfid (fd->inode->gfid) && + !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) { list_del_init (&entry->list); GF_FREE (entry); } } - -out: - AFR_STACK_UNWIND (readdir, frame, op_ret, op_errno, entries); - - return 0; } - int32_t -afr_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, gf_dirent_t *entries) +afr_readdir_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + gf_dirent_t *entries, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; - int32_t next_call_child = -1; - int ret = 0; - gf_dirent_t * entry = NULL; - gf_dirent_t * tmp = NULL; - int32_t *last_index = NULL; - int32_t read_child = -1; - int32_t *fresh_children = NULL; - uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; - off_t offset = 0; - int32_t call_child = -1; + afr_local_t *local = NULL; - priv = this->private; - children = priv->children; + if (op_ret == -1) + goto out; local = frame->local; + afr_readdir_filter_trash_dir (entries, local->fd); - read_child = (long) cookie; - last_index = &local->cont.readdir.last_index; - fresh_children = local->fresh_children; - - /* the value of the last_index changes if afr_next_call_child is - * called. So to find the call_child of this callback use last_index - * before the next_call_child call. 
- */ - if (*last_index == -1) - call_child = read_child; - else - call_child = fresh_children[*last_index]; - - if (priv->strict_readdir) { - ret = fd_ctx_get (local->fd, this, &ctx); - if (ret < 0) { - gf_log (this->name, GF_LOG_INFO, - "could not get fd ctx for fd=%p", local->fd); - op_ret = -1; - op_errno = -ret; - goto out; - } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - if (op_ret == -1) { - next_call_child = afr_next_call_child (fresh_children, - local->child_up, - priv->child_count, - last_index, - read_child); - if (next_call_child < 0) - goto out; - gf_log (this->name, GF_LOG_TRACE, - "starting readdir afresh on child %d, offset %"PRId64, - next_call_child, (uint64_t) 0); - - fd_ctx->failed_over = _gf_true; - - STACK_WIND_COOKIE (frame, afr_readdirp_cbk, - (void *) (long) read_child, - children[next_call_child], - children[next_call_child]->fops->readdirp, - local->fd, - local->cont.readdir.size, 0); - return 0; - } - } - - if (op_ret != -1) { - list_for_each_entry_safe (entry, tmp, &entries->list, list) { - if ((local->fd->inode == local->fd->inode->table->root) - && !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) { - list_del_init (&entry->list); - GF_FREE (entry); - } - } - } - - if (priv->strict_readdir) { - if (fd_ctx->failed_over) { - if (list_empty (&entries->list)) { - gf_log (this->name, GF_LOG_DEBUG, - "no entries found"); - goto out; - } - - offset = afr_filter_entries (entries, local->fd); +out: + AFR_STACK_UNWIND (readdir, frame, op_ret, op_errno, entries, NULL); + return 0; +} - afr_remember_entries (entries, local->fd); - if (list_empty (&entries->list)) { - /* All the entries we got were duplicate. We - shouldn't send an empty list now, because - that'll make the application stop reading. 
So - try to get more entries */ +int32_t +afr_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, + dict_t *xdata) +{ + afr_local_t *local = NULL; - gf_log (this->name, GF_LOG_TRACE, - "trying to fetch non-duplicate entries " - "from offset %"PRId64", child %s", - offset, children[call_child]->name); + if (op_ret == -1) + goto out; - STACK_WIND_COOKIE (frame, afr_readdirp_cbk, - (void *) (long) read_child, - children[call_child], - children[call_child]->fops->readdirp, - local->fd, local->cont.readdir.size, offset); - return 0; - } - } else { - afr_remember_entries (entries, local->fd); - } - } + local = frame->local; + afr_readdir_filter_trash_dir (entries, local->fd); out: - AFR_STACK_UNWIND (readdirp, frame, op_ret, op_errno, entries); - + AFR_STACK_UNWIND (readdirp, frame, op_ret, op_errno, entries, NULL); return 0; } int32_t afr_do_readdir (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset, int whichop) + fd_t *fd, size_t size, off_t offset, int whichop, dict_t *dict) { - afr_private_t * priv = NULL; - xlator_t ** children = NULL; - int call_child = 0; - afr_local_t *local = NULL; - uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; - int ret = -1; - int32_t op_ret = -1; - int32_t op_errno = 0; - uint64_t read_child = 0; + afr_private_t *priv = NULL; + xlator_t **children = NULL; + int call_child = 0; + afr_local_t *local = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + int ret = -1; + int32_t op_errno = 0; + uint64_t read_child = 0; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -651,14 +455,12 @@ afr_do_readdir (call_frame_t *frame, xlator_t *this, priv = this->private; children = priv->children; - ALLOC_OR_GOTO (local, afr_local_t, out); - frame->local = local; + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if 
(ret < 0) goto out; - } local->fresh_children = afr_children_create (priv->child_count); if (!local->fresh_children) { @@ -668,79 +470,67 @@ afr_do_readdir (call_frame_t *frame, xlator_t *this, read_child = afr_inode_get_read_ctx (this, fd->inode, local->fresh_children); - op_ret = afr_get_call_child (this, local->child_up, read_child, - local->fresh_children, - &call_child, - &local->cont.readdir.last_index); - if (op_ret < 0) { - op_errno = -op_ret; - op_ret = -1; + ret = afr_get_call_child (this, local->child_up, read_child, + local->fresh_children, + &call_child, + &local->cont.readdir.last_index); + if (ret < 0) { + op_errno = -ret; goto out; } - local->fd = fd_ref (fd); - local->cont.readdir.size = size; - - if (priv->strict_readdir) { - ret = fd_ctx_get (fd, this, &ctx); - if (ret < 0) { - gf_log (this->name, GF_LOG_INFO, - "could not get fd ctx for fd=%p", fd); - op_errno = -ret; - goto out; - } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - if (fd_ctx->last_tried != call_child) { - gf_log (this->name, GF_LOG_TRACE, - "first up child has changed from %d to %d, " - "restarting readdir from offset 0", - fd_ctx->last_tried, call_child); - - fd_ctx->failed_over = _gf_true; - offset = 0; - } + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) { + op_errno = EBADF; + goto out; + } - fd_ctx->last_tried = call_child; + if ((offset == 0) || (fd_ctx->call_child == -1)) { + fd_ctx->call_child = call_child; + } else if ((priv->readdir_failover == _gf_false) && + (call_child != fd_ctx->call_child)) { + op_errno = EBADF; + goto out; } + local->fd = fd_ref (fd); + local->cont.readdir.size = size; + local->cont.readdir.dict = (dict)? 
dict_ref (dict) : NULL; + if (whichop == GF_FOP_READDIR) STACK_WIND_COOKIE (frame, afr_readdir_cbk, (void *) (long) call_child, children[call_child], children[call_child]->fops->readdir, fd, - size, offset); + size, offset, dict); else STACK_WIND_COOKIE (frame, afr_readdirp_cbk, (void *) (long) call_child, children[call_child], children[call_child]->fops->readdirp, fd, - size, offset); + size, offset, dict); - op_ret = 0; + return 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (readdir, frame, op_ret, op_errno, NULL); - } + AFR_STACK_UNWIND (readdir, frame, -1, op_errno, NULL, NULL); return 0; } int32_t afr_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset) + off_t offset, dict_t *xdata) { - afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIR); + afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIR, xdata); return 0; } int32_t afr_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset) + off_t offset, dict_t *dict) { - afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIRP); + afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIRP, dict); return 0; } diff --git a/xlators/cluster/afr/src/afr-dir-read.h b/xlators/cluster/afr/src/afr-dir-read.h index 6a6bc6354..09456d159 100644 --- a/xlators/cluster/afr/src/afr-dir-read.h +++ b/xlators/cluster/afr/src/afr-dir-read.h @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. 
- - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef __DIR_READ_H__ @@ -23,23 +14,23 @@ int32_t afr_opendir (call_frame_t *frame, xlator_t *this, - loc_t *loc, fd_t *fd); + loc_t *loc, fd_t *fd, dict_t *xdata); int32_t afr_releasedir (xlator_t *this, fd_t *fd); int32_t afr_readdir (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset); + fd_t *fd, size_t size, off_t offset, dict_t *xdata); int32_t afr_readdirp (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset); + fd_t *fd, size_t size, off_t offset, dict_t *dict); int32_t afr_checksum (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t flags); + loc_t *loc, int32_t flags, dict_t *xdata); #endif /* __DIR_READ_H__ */ diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c index 2d7f98a2a..1943b719b 100644 --- a/xlators/cluster/afr/src/afr-dir-write.c +++ b/xlators/cluster/afr/src/afr-dir-write.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. 
- - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ @@ -47,33 +38,222 @@ #include "afr.h" #include "afr-transaction.h" +int +afr_build_parent_loc (loc_t *parent, loc_t *child, int32_t *op_errno) +{ + int ret = -1; + char *child_path = NULL; + + if (!child->parent) { + if (op_errno) + *op_errno = EINVAL; + goto out; + } + + child_path = gf_strdup (child->path); + if (!child_path) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } + parent->path = gf_strdup( dirname (child_path) ); + if (!parent->path) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } + parent->inode = inode_ref (child->parent); + uuid_copy (parent->gfid, child->pargfid); + + ret = 0; +out: + GF_FREE(child_path); + + return ret; +} void -afr_build_parent_loc (loc_t *parent, loc_t *child) +__dir_entry_fop_common_cbk (call_frame_t *frame, int child_index, + xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, struct iatt *prenewparent, + struct iatt *postnewparent) { - char *tmp = NULL; + afr_local_t *local = NULL; - if (!child->parent) { - //this should never be called with root as the child - GF_ASSERT (0); - loc_copy (parent, child); - return; + local = frame->local; + + if (afr_fop_failed (op_ret, op_errno)) + afr_transaction_fop_failed (frame, this, child_index); + + if (op_ret > -1) { + local->op_ret = op_ret; + + if 
((local->success_count == 0) || + (child_index == local->read_child_index)) { + local->cont.dir_fop.preparent = *preparent; + local->cont.dir_fop.postparent = *postparent; + if (buf) + local->cont.dir_fop.buf = *buf; + if (prenewparent) + local->cont.dir_fop.prenewparent = *prenewparent; + if (postnewparent) + local->cont.dir_fop.postnewparent = *postnewparent; + } + + local->cont.dir_fop.inode = inode; + + local->fresh_children[local->success_count] = child_index; + local->success_count++; + local->child_errno[child_index] = 0; + } else { + local->child_errno[child_index] = op_errno; } - tmp = gf_strdup (child->path); - parent->path = gf_strdup (dirname (tmp)); - GF_FREE (tmp); + local->op_errno = op_errno; +} - parent->name = strrchr (parent->path, '/'); - if (parent->name) - parent->name++; +int +afr_mark_new_entry_changelog_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + dict_t *xattr, dict_t *xdata) +{ + int call_count = 0; - parent->inode = inode_ref (child->parent); - parent->parent = inode_parent (parent->inode, 0, NULL); - parent->ino = parent->inode->ino; + call_count = afr_frame_return (frame); + if (call_count == 0) { + AFR_STACK_DESTROY (frame); + } + return 0; +} + +void +afr_mark_new_entry_changelog (call_frame_t *frame, xlator_t *this) +{ + call_frame_t *new_frame = NULL; + afr_local_t *local = NULL; + afr_local_t *new_local = NULL; + afr_private_t *priv = NULL; + dict_t **xattr = NULL; + int32_t **changelog = NULL; + int i = 0; + GF_UNUSED int op_errno = 0; + + local = frame->local; + priv = this->private; + + new_frame = copy_frame (frame); + if (!new_frame) { + goto out; + } + + AFR_LOCAL_ALLOC_OR_GOTO (new_frame->local, out); + new_local = new_frame->local; + changelog = afr_matrix_create (priv->child_count, AFR_NUM_CHANGE_LOGS); + if (!changelog) + goto out; + + xattr = GF_CALLOC (priv->child_count, sizeof (*xattr), + gf_afr_mt_dict_t); + if (!xattr) + goto out; + for (i = 0; i < 
priv->child_count; i++) { + if (local->child_errno[i]) + continue; + xattr[i] = dict_new (); + if (!xattr[i]) + goto out; + } + + afr_prepare_new_entry_pending_matrix (changelog, + afr_is_errno_set, + local->child_errno, + &local->cont.dir_fop.buf, + priv->child_count); + + new_local->pending = changelog; + uuid_copy (new_local->loc.gfid, local->cont.dir_fop.buf.ia_gfid); + new_local->loc.inode = inode_ref (local->cont.dir_fop.inode); + new_local->call_count = local->success_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_errno[i]) + continue; + + afr_set_pending_dict (priv, xattr[i], changelog, i, LOCAL_LAST); + STACK_WIND_COOKIE (new_frame, afr_mark_new_entry_changelog_cbk, + (void *) (long) i, priv->children[i], + priv->children[i]->fops->xattrop, + &new_local->loc, GF_XATTROP_ADD_ARRAY, + xattr[i], NULL); + } + new_frame = NULL; +out: + if (new_frame) + AFR_STACK_DESTROY (new_frame); + afr_xattr_array_destroy (xattr, priv->child_count); + return; +} + +gf_boolean_t +afr_is_new_entry_changelog_needed (glusterfs_fop_t fop) +{ + glusterfs_fop_t fops[] = {GF_FOP_CREATE, GF_FOP_MKNOD, GF_FOP_NULL}; + int i = 0; + + for (i = 0; fops[i] != GF_FOP_NULL; i++) { + if (fop == fops[i]) + return _gf_true; + } + return _gf_false; +} + +void +afr_dir_fop_mark_entry_pending_changelog (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + if (local->op_ret < 0) + goto out; + + if (local->success_count == priv->child_count) + goto out; + + if (!afr_is_new_entry_changelog_needed (local->op)) + goto out; + + afr_mark_new_entry_changelog (frame, this); + +out: + return; +} + +void +afr_dir_fop_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; - if (!uuid_is_null (child->pargfid)) - uuid_copy (parent->gfid, child->pargfid); + if (local->cont.dir_fop.inode == 
NULL) + goto done; + afr_set_read_ctx_from_policy (this, local->cont.dir_fop.inode, + local->fresh_children, + local->read_child_index, + priv->read_child, + local->cont.dir_fop.buf.ia_gfid); +done: + local->transaction.unwind (frame, this); + afr_dir_fop_mark_entry_pending_changelog (frame, this); + local->transaction.resume (frame, this); } /* {{{ create */ @@ -83,7 +263,6 @@ afr_create_unwind (call_frame_t *frame, xlator_t *this) { call_frame_t *main_frame = NULL; afr_local_t *local = NULL; - struct iatt *unwind_buf = NULL; local = frame->local; @@ -97,18 +276,14 @@ afr_create_unwind (call_frame_t *frame, xlator_t *this) UNLOCK (&frame->lock); if (main_frame) { - if (local->cont.create.read_child_buf.ia_ino) { - unwind_buf = &local->cont.create.read_child_buf; - } else { - unwind_buf = &local->cont.create.buf; - } - AFR_STACK_UNWIND (create, main_frame, local->op_ret, local->op_errno, local->cont.create.fd, - local->cont.create.inode, - unwind_buf, &local->cont.create.preparent, - &local->cont.create.postparent); + local->cont.dir_fop.inode, + &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, + local->xdata_rsp); } return 0; @@ -119,32 +294,24 @@ int afr_create_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode, struct iatt *buf, - struct iatt *preparent, struct iatt *postparent) + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { afr_local_t *local = NULL; - afr_private_t *priv = NULL; uint64_t ctx = 0; afr_fd_ctx_t *fd_ctx = NULL; int ret = 0; int call_count = -1; int child_index = -1; - int32_t *fresh_children = NULL; local = frame->local; - priv = this->private; child_index = (long) cookie; LOCK (&frame->lock); { - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - local->op_ret = op_ret; - + if (op_ret > -1) { ret = afr_fd_ctx_set (this, fd); - if (ret < 0) 
{ gf_log (this->name, GF_LOG_ERROR, "could not set ctx on fd=%p", fd); @@ -155,7 +322,6 @@ afr_create_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } ret = fd_ctx_get (fd, this, &ctx); - if (ret < 0) { gf_log (this->name, GF_LOG_ERROR, "could not get fd ctx for fd=%p", fd); @@ -169,23 +335,14 @@ afr_create_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, fd_ctx->opened_on[child_index] = AFR_FD_OPENED; fd_ctx->flags = local->cont.create.flags; - if (local->success_count == 0) - local->cont.create.buf = *buf; - - if (child_index == local->read_child_index) { - local->cont.create.read_child_buf = *buf; - local->cont.create.preparent = *preparent; - local->cont.create.postparent = *postparent; - } - - local->cont.create.inode = inode; - - fresh_children = local->fresh_children; - fresh_children[local->success_count] = child_index; - local->success_count++; + if (local->success_count == 0) { + if (xdata) + local->xdata_rsp = dict_ref(xdata); + } } - - local->op_errno = op_errno; + __dir_entry_fop_common_cbk (frame, child_index, this, + op_ret, op_errno, inode, buf, + preparent, postparent, NULL, NULL); } unlock: @@ -193,15 +350,8 @@ unlock: call_count = afr_frame_return (frame); - if (call_count == 0) { - afr_set_read_ctx_from_policy (this, inode, - local->fresh_children, - local->read_child_index, - priv->read_child); - local->transaction.unwind (frame, this); - - local->transaction.resume (frame, this); - } + if (call_count == 0) + afr_dir_fop_done (frame, this); return 0; } @@ -237,8 +387,9 @@ afr_create_wind (call_frame_t *frame, xlator_t *this) &local->loc, local->cont.create.flags, local->cont.create.mode, + local->umask, local->cont.create.fd, - local->cont.create.params); + local->xdata_req); if (!--call_count) break; } @@ -266,14 +417,14 @@ afr_create_done (call_frame_t *frame, xlator_t *this) int afr_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, mode_t mode, - fd_t *fd, dict_t *params) + mode_t umask, fd_t 
*fd, dict_t *params) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - int ret = -1; - int op_ret = -1; - int op_errno = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -281,20 +432,20 @@ afr_create (call_frame_t *frame, xlator_t *this, priv = this->private; + QUORUM_CHECK(create,out); + transaction_frame = copy_frame (frame); if (!transaction_frame) { + op_errno = ENOMEM; goto out; } - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); + local = transaction_frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } - - transaction_frame->local = local; loc_copy (&local->loc, loc); @@ -305,30 +456,49 @@ afr_create (call_frame_t *frame, xlator_t *this, } UNLOCK (&priv->read_child_lock); + local->op = GF_FOP_CREATE; local->cont.create.flags = flags; local->cont.create.mode = mode; local->cont.create.fd = fd_ref (fd); + local->umask = umask; if (params) - local->cont.create.params = dict_ref (params); + local->xdata_req = dict_ref (params); local->transaction.fop = afr_create_wind; local->transaction.done = afr_create_done; local->transaction.unwind = afr_create_unwind; - afr_build_parent_loc (&local->transaction.parent_loc, loc); + ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, + &op_errno); + if (ret) + goto out; local->transaction.main_frame = frame; local->transaction.basename = AFR_BASENAME (loc->path); + int_lock = &local->internal_lock; + + int_lock->lockee_count = 0; + ret = afr_init_entry_lockee (&int_lock->lockee[0], local, + &local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) + goto out; - 
afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + int_lock->lockee_count++; + ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { + if (ret < 0) { if (transaction_frame) AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (create, frame, op_ret, op_errno, - NULL, NULL, NULL, NULL, NULL); + AFR_STACK_UNWIND (create, frame, -1, op_errno, + NULL, NULL, NULL, NULL, NULL, NULL); } return 0; @@ -343,7 +513,6 @@ afr_mknod_unwind (call_frame_t *frame, xlator_t *this) { call_frame_t *main_frame = NULL; afr_local_t *local = NULL; - struct iatt *unwind_buf = NULL; local = frame->local; @@ -357,17 +526,13 @@ afr_mknod_unwind (call_frame_t *frame, xlator_t *this) UNLOCK (&frame->lock); if (main_frame) { - if (local->cont.mknod.read_child_buf.ia_ino) { - unwind_buf = &local->cont.mknod.read_child_buf; - } else { - unwind_buf = &local->cont.mknod.buf; - } - AFR_STACK_UNWIND (mknod, main_frame, local->op_ret, local->op_errno, - local->cont.mknod.inode, - unwind_buf, &local->cont.mknod.preparent, - &local->cont.mknod.postparent); + local->cont.dir_fop.inode, + &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, + NULL); } return 0; @@ -378,58 +543,25 @@ int afr_mknod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; int call_count = -1; int child_index = -1; - int32_t *fresh_children = NULL; - - local = frame->local; - priv = this->private; child_index = (long) cookie; LOCK (&frame->lock); { - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - local->op_ret = op_ret; - - if (local->success_count 
== 0) - local->cont.mknod.buf = *buf; - - if (child_index == local->read_child_index) { - local->cont.mknod.read_child_buf = *buf; - local->cont.mknod.preparent = *preparent; - local->cont.mknod.postparent = *postparent; - } - - local->cont.mknod.inode = inode; - - fresh_children = local->fresh_children; - fresh_children[local->success_count] = child_index; - local->success_count++; - } - - local->op_errno = op_errno; + __dir_entry_fop_common_cbk (frame, child_index, this, + op_ret, op_errno, inode, buf, + preparent, postparent, NULL, NULL); } UNLOCK (&frame->lock); call_count = afr_frame_return (frame); - if (call_count == 0) { - afr_set_read_ctx_from_policy (this, inode, - local->fresh_children, - local->read_child_index, - priv->read_child); - local->transaction.unwind (frame, this); - - local->transaction.resume (frame, this); - } + if (call_count == 0) + afr_dir_fop_done (frame, this); return 0; } @@ -463,7 +595,8 @@ afr_mknod_wind (call_frame_t *frame, xlator_t *this) priv->children[i]->fops->mknod, &local->loc, local->cont.mknod.mode, local->cont.mknod.dev, - local->cont.mknod.params); + local->umask, + local->xdata_req); if (!--call_count) break; } @@ -488,15 +621,15 @@ afr_mknod_done (call_frame_t *frame, xlator_t *this) int -afr_mknod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dev_t dev, dict_t *params) +afr_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t dev, mode_t umask, dict_t *params) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - int ret = -1; - int op_ret = -1; - int op_errno = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -504,20 +637,20 @@ afr_mknod (call_frame_t *frame, xlator_t *this, priv = this->private; + QUORUM_CHECK(mknod,out); + 
transaction_frame = copy_frame (frame); if (!transaction_frame) { + op_errno = ENOMEM; goto out; } - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); + local = transaction_frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } - - transaction_frame->local = local; loc_copy (&local->loc, loc); @@ -528,29 +661,48 @@ afr_mknod (call_frame_t *frame, xlator_t *this, } UNLOCK (&priv->read_child_lock); + local->op = GF_FOP_MKNOD; local->cont.mknod.mode = mode; local->cont.mknod.dev = dev; + local->umask = umask; if (params) - local->cont.mknod.params = dict_ref (params); + local->xdata_req = dict_ref (params); local->transaction.fop = afr_mknod_wind; local->transaction.done = afr_mknod_done; local->transaction.unwind = afr_mknod_unwind; - afr_build_parent_loc (&local->transaction.parent_loc, loc); + ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, + &op_errno); + if (ret) + goto out; local->transaction.main_frame = frame; local->transaction.basename = AFR_BASENAME (loc->path); + int_lock = &local->internal_lock; + + int_lock->lockee_count = 0; + ret = afr_init_entry_lockee (&int_lock->lockee[0], local, + &local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) + goto out; - afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + int_lock->lockee_count++; + ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { + if (ret < 0) { if (transaction_frame) AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (mknod, frame, op_ret, op_errno, - NULL, NULL, NULL, NULL); + AFR_STACK_UNWIND (mknod, frame, -1, op_errno, + NULL, NULL, NULL, NULL, NULL); } return 0; @@ -566,7 +718,6 @@ afr_mkdir_unwind (call_frame_t *frame, xlator_t 
*this) { call_frame_t *main_frame = NULL; afr_local_t *local = NULL; - struct iatt *unwind_buf = NULL; local = frame->local; @@ -580,17 +731,13 @@ afr_mkdir_unwind (call_frame_t *frame, xlator_t *this) UNLOCK (&frame->lock); if (main_frame) { - if (local->cont.mkdir.read_child_buf.ia_ino) { - unwind_buf = &local->cont.mkdir.read_child_buf; - } else { - unwind_buf = &local->cont.mkdir.buf; - } - AFR_STACK_UNWIND (mkdir, main_frame, local->op_ret, local->op_errno, - local->cont.mkdir.inode, - unwind_buf, &local->cont.mkdir.preparent, - &local->cont.mkdir.postparent); + local->cont.dir_fop.inode, + &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, + NULL); } return 0; @@ -601,58 +748,25 @@ int afr_mkdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; int call_count = -1; int child_index = -1; - int32_t *fresh_children = NULL; - - local = frame->local; - priv = this->private; child_index = (long) cookie; LOCK (&frame->lock); { - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - local->op_ret = op_ret; - - if (local->success_count == 0) - local->cont.mkdir.buf = *buf; - - if (child_index == local->read_child_index) { - local->cont.mkdir.read_child_buf = *buf; - local->cont.mkdir.preparent = *preparent; - local->cont.mkdir.postparent = *postparent; - } - - local->cont.mkdir.inode = inode; - - fresh_children = local->fresh_children; - fresh_children[local->success_count] = child_index; - local->success_count++; - } - - local->op_errno = op_errno; + __dir_entry_fop_common_cbk (frame, child_index, this, + op_ret, op_errno, inode, buf, + preparent, postparent, NULL, NULL); } UNLOCK (&frame->lock); call_count = afr_frame_return 
(frame); - if (call_count == 0) { - afr_set_read_ctx_from_policy (this, inode, - local->fresh_children, - local->read_child_index, - priv->read_child); - local->transaction.unwind (frame, this); - - local->transaction.resume (frame, this); - } + if (call_count == 0) + afr_dir_fop_done (frame, this); return 0; } @@ -686,7 +800,8 @@ afr_mkdir_wind (call_frame_t *frame, xlator_t *this) priv->children[i], priv->children[i]->fops->mkdir, &local->loc, local->cont.mkdir.mode, - local->cont.mkdir.params); + local->umask, + local->xdata_req); if (!--call_count) break; } @@ -710,17 +825,16 @@ afr_mkdir_done (call_frame_t *frame, xlator_t *this) return 0; } - int afr_mkdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dict_t *params) + loc_t *loc, mode_t mode, mode_t umask, dict_t *params) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - int ret = -1; - int op_ret = -1; - int op_errno = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -728,20 +842,20 @@ afr_mkdir (call_frame_t *frame, xlator_t *this, priv = this->private; + QUORUM_CHECK(mkdir,out); + transaction_frame = copy_frame (frame); if (!transaction_frame) { + op_errno = ENOMEM; goto out; } - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); + local = transaction_frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } - - transaction_frame->local = local; loc_copy (&local->loc, loc); @@ -753,28 +867,47 @@ afr_mkdir (call_frame_t *frame, xlator_t *this, UNLOCK (&priv->read_child_lock); local->cont.mkdir.mode = mode; + local->umask = umask; if (params) - local->cont.mkdir.params = dict_ref 
(params); + local->xdata_req = dict_ref (params); + local->op = GF_FOP_MKDIR; local->transaction.fop = afr_mkdir_wind; local->transaction.done = afr_mkdir_done; local->transaction.unwind = afr_mkdir_unwind; - afr_build_parent_loc (&local->transaction.parent_loc, loc); + ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, + &op_errno); + if (ret) + goto out; local->transaction.main_frame = frame; local->transaction.basename = AFR_BASENAME (loc->path); + int_lock = &local->internal_lock; + + int_lock->lockee_count = 0; + ret = afr_init_entry_lockee (&int_lock->lockee[0], local, + &local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) + goto out; - afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + int_lock->lockee_count++; + ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { + if (ret < 0) { if (transaction_frame) AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (mkdir, frame, op_ret, op_errno, - NULL, NULL, NULL, NULL); + AFR_STACK_UNWIND (mkdir, frame, -1, op_errno, + NULL, NULL, NULL, NULL, NULL); } return 0; @@ -790,7 +923,6 @@ afr_link_unwind (call_frame_t *frame, xlator_t *this) { call_frame_t *main_frame = NULL; afr_local_t *local = NULL; - struct iatt *unwind_buf = NULL; local = frame->local; @@ -804,17 +936,13 @@ afr_link_unwind (call_frame_t *frame, xlator_t *this) UNLOCK (&frame->lock); if (main_frame) { - if (local->cont.link.read_child_buf.ia_ino) { - unwind_buf = &local->cont.link.read_child_buf; - } else { - unwind_buf = &local->cont.link.buf; - } - AFR_STACK_UNWIND (link, main_frame, local->op_ret, local->op_errno, - local->cont.link.inode, - unwind_buf, &local->cont.link.preparent, - &local->cont.link.postparent); + local->cont.dir_fop.inode, + &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, + NULL); } 
return 0; @@ -825,59 +953,25 @@ int afr_link_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; int call_count = -1; int child_index = -1; - int32_t *fresh_children = NULL; - - local = frame->local; - priv = this->private; child_index = (long) cookie; LOCK (&frame->lock); { - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - local->op_ret = op_ret; - - if (local->success_count == 0) { - local->cont.link.buf = *buf; - } - - if (child_index == local->read_child_index) { - local->cont.link.read_child_buf = *buf; - local->cont.link.preparent = *preparent; - local->cont.link.postparent = *postparent; - } - - local->cont.link.inode = inode; - - fresh_children = local->fresh_children; - fresh_children[local->success_count] = child_index; - local->success_count++; - } - - local->op_errno = op_errno; + __dir_entry_fop_common_cbk (frame, child_index, this, + op_ret, op_errno, inode, buf, + preparent, postparent, NULL, NULL); } UNLOCK (&frame->lock); call_count = afr_frame_return (frame); - if (call_count == 0) { - afr_set_read_ctx_from_policy (this, inode, - local->fresh_children, - local->read_child_index, - priv->read_child); - local->transaction.unwind (frame, this); - - local->transaction.resume (frame, this); - } + if (call_count == 0) + afr_dir_fop_done (frame, this); return 0; } @@ -906,11 +1000,12 @@ afr_link_wind (call_frame_t *frame, xlator_t *this) for (i = 0; i < priv->child_count; i++) { if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_link_wind_cbk, (void *) (long) i, + STACK_WIND_COOKIE (frame, afr_link_wind_cbk, + (void *) (long) i, priv->children[i], priv->children[i]->fops->link, &local->loc, - &local->newloc); + &local->newloc, 
local->xdata_req); if (!--call_count) break; @@ -936,14 +1031,14 @@ afr_link_done (call_frame_t *frame, xlator_t *this) int afr_link (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc) + loc_t *oldloc, loc_t *newloc, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - int ret = -1; - int op_ret = -1; - int op_errno = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -951,23 +1046,25 @@ afr_link (call_frame_t *frame, xlator_t *this, priv = this->private; + QUORUM_CHECK(link,out); + transaction_frame = copy_frame (frame); if (!transaction_frame) { + op_errno = ENOMEM; goto out; } - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); + local = transaction_frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } - - transaction_frame->local = local; loc_copy (&local->loc, oldloc); loc_copy (&local->newloc, newloc); + if (xdata) + local->xdata_req = dict_ref (xdata); LOCK (&priv->read_child_lock); { @@ -976,25 +1073,41 @@ afr_link (call_frame_t *frame, xlator_t *this, } UNLOCK (&priv->read_child_lock); + local->op = GF_FOP_LINK; local->transaction.fop = afr_link_wind; local->transaction.done = afr_link_done; local->transaction.unwind = afr_link_unwind; - afr_build_parent_loc (&local->transaction.parent_loc, oldloc); + ret = afr_build_parent_loc (&local->transaction.parent_loc, newloc, + &op_errno); + if (ret) + goto out; local->transaction.main_frame = frame; - local->transaction.basename = AFR_BASENAME (oldloc->path); - local->transaction.new_basename = AFR_BASENAME (newloc->path); - - afr_transaction (transaction_frame, this, 
AFR_ENTRY_TRANSACTION); + local->transaction.basename = AFR_BASENAME (newloc->path); + int_lock = &local->internal_lock; + + int_lock->lockee_count = 0; + ret = afr_init_entry_lockee (&int_lock->lockee[0], local, + &local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) + goto out; - op_ret = 0; + int_lock->lockee_count++; + ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } + ret = 0; out: - if (op_ret == -1) { + if (ret < 0) { if (transaction_frame) AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (link, frame, op_ret, op_errno, - NULL, NULL, NULL, NULL); + AFR_STACK_UNWIND (link, frame, -1, op_errno, + NULL, NULL, NULL, NULL, NULL); } return 0; @@ -1010,7 +1123,6 @@ afr_symlink_unwind (call_frame_t *frame, xlator_t *this) { call_frame_t *main_frame = NULL; afr_local_t *local = NULL; - struct iatt *unwind_buf = NULL; local = frame->local; @@ -1024,17 +1136,13 @@ afr_symlink_unwind (call_frame_t *frame, xlator_t *this) UNLOCK (&frame->lock); if (main_frame) { - if (local->cont.symlink.read_child_buf.ia_ino) { - unwind_buf = &local->cont.symlink.read_child_buf; - } else { - unwind_buf = &local->cont.symlink.buf; - } - AFR_STACK_UNWIND (symlink, main_frame, local->op_ret, local->op_errno, - local->cont.symlink.inode, - unwind_buf, &local->cont.symlink.preparent, - &local->cont.symlink.postparent); + local->cont.dir_fop.inode, + &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, + NULL); } return 0; @@ -1045,58 +1153,25 @@ int afr_symlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; int call_count = -1; int child_index = -1; - int32_t *fresh_children = NULL; - - local = 
frame->local; - priv = this->private; child_index = (long) cookie; LOCK (&frame->lock); { - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - local->op_ret = op_ret; - - if (local->success_count == 0) - local->cont.symlink.buf = *buf; - - if (child_index == local->read_child_index) { - local->cont.symlink.read_child_buf = *buf; - local->cont.symlink.preparent = *preparent; - local->cont.symlink.postparent = *postparent; - } - - local->cont.symlink.inode = inode; - - fresh_children = local->fresh_children; - fresh_children[local->success_count] = child_index; - local->success_count++; - } - - local->op_errno = op_errno; + __dir_entry_fop_common_cbk (frame, child_index, this, + op_ret, op_errno, inode, buf, + preparent, postparent, NULL, NULL); } UNLOCK (&frame->lock); call_count = afr_frame_return (frame); - if (call_count == 0) { - afr_set_read_ctx_from_policy (this, inode, - local->fresh_children, - local->read_child_index, - priv->read_child); - local->transaction.unwind (frame, this); - - local->transaction.resume (frame, this); - } + if (call_count == 0) + afr_dir_fop_done (frame, this); return 0; } @@ -1131,7 +1206,8 @@ afr_symlink_wind (call_frame_t *frame, xlator_t *this) priv->children[i]->fops->symlink, local->cont.symlink.linkpath, &local->loc, - local->cont.symlink.params); + local->umask, + local->xdata_req); if (!--call_count) break; @@ -1158,14 +1234,14 @@ afr_symlink_done (call_frame_t *frame, xlator_t *this) int afr_symlink (call_frame_t *frame, xlator_t *this, - const char *linkpath, loc_t *loc, dict_t *params) + const char *linkpath, loc_t *loc, mode_t umask, dict_t *params) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - int ret = -1; - int op_ret = -1; - int op_errno = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + call_frame_t *transaction_frame = NULL; + int 
ret = -1; + int op_errno = 0; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -1173,20 +1249,20 @@ afr_symlink (call_frame_t *frame, xlator_t *this, priv = this->private; + QUORUM_CHECK(symlink,out); + transaction_frame = copy_frame (frame); if (!transaction_frame) { + op_errno = ENOMEM; goto out; } - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); + local = transaction_frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } - - transaction_frame->local = local; loc_copy (&local->loc, loc); @@ -1198,27 +1274,46 @@ afr_symlink (call_frame_t *frame, xlator_t *this, UNLOCK (&priv->read_child_lock); local->cont.symlink.linkpath = gf_strdup (linkpath); + local->umask = umask; if (params) - local->cont.symlink.params = dict_ref (params); + local->xdata_req = dict_ref (params); + local->op = GF_FOP_SYMLINK; local->transaction.fop = afr_symlink_wind; local->transaction.done = afr_symlink_done; local->transaction.unwind = afr_symlink_unwind; - afr_build_parent_loc (&local->transaction.parent_loc, loc); + ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, + &op_errno); + if (ret) + goto out; local->transaction.main_frame = frame; local->transaction.basename = AFR_BASENAME (loc->path); + int_lock = &local->internal_lock; + + int_lock->lockee_count = 0; + ret = afr_init_entry_lockee (&int_lock->lockee[0], local, + &local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) + goto out; - afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + int_lock->lockee_count++; + ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { + if (ret < 0) { if (transaction_frame) AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND 
(symlink, frame, op_ret, op_errno, - NULL, NULL, NULL, NULL); + AFR_STACK_UNWIND (symlink, frame, -1, op_errno, + NULL, NULL, NULL, NULL, NULL); } return 0; @@ -1233,7 +1328,6 @@ afr_rename_unwind (call_frame_t *frame, xlator_t *this) { call_frame_t *main_frame = NULL; afr_local_t *local = NULL; - struct iatt *unwind_buf = NULL; local = frame->local; @@ -1247,19 +1341,14 @@ afr_rename_unwind (call_frame_t *frame, xlator_t *this) UNLOCK (&frame->lock); if (main_frame) { - if (local->cont.rename.read_child_buf.ia_ino) { - unwind_buf = &local->cont.rename.read_child_buf; - } else { - unwind_buf = &local->cont.rename.buf; - } - AFR_STACK_UNWIND (rename, main_frame, local->op_ret, local->op_errno, - unwind_buf, - &local->cont.rename.preoldparent, - &local->cont.rename.postoldparent, - &local->cont.rename.prenewparent, - &local->cont.rename.postnewparent); + &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, + &local->cont.dir_fop.prenewparent, + &local->cont.dir_fop.postnewparent, + NULL); } return 0; @@ -1270,7 +1359,8 @@ int afr_rename_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *buf, struct iatt *preoldparent, struct iatt *postoldparent, - struct iatt *prenewparent, struct iatt *postnewparent) + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) { afr_local_t * local = NULL; int call_count = -1; @@ -1284,38 +1374,22 @@ afr_rename_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, { if (afr_fop_failed (op_ret, op_errno) && op_errno != ENOTEMPTY) afr_transaction_fop_failed (frame, this, child_index); + local->op_errno = op_errno; + local->child_errno[child_index] = op_errno; - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - - if (buf) { - local->cont.rename.buf = *buf; - } - - local->success_count++; - } - - if (child_index == local->read_child_index) { - local->cont.rename.read_child_buf = *buf; + 
if (op_ret > -1) + __dir_entry_fop_common_cbk (frame, child_index, this, + op_ret, op_errno, NULL, buf, + preoldparent, postoldparent, + prenewparent, postnewparent); - local->cont.rename.preoldparent = *preoldparent; - local->cont.rename.postoldparent = *postoldparent; - local->cont.rename.prenewparent = *prenewparent; - local->cont.rename.postnewparent = *postnewparent; - } - } - - local->op_errno = op_errno; } UNLOCK (&frame->lock); call_count = afr_frame_return (frame); - if (call_count == 0) { - local->transaction.unwind (frame, this); - local->transaction.resume (frame, this); - } + if (call_count == 0) + afr_dir_fop_done (frame, this); return 0; } @@ -1349,7 +1423,7 @@ afr_rename_wind (call_frame_t *frame, xlator_t *this) priv->children[i], priv->children[i]->fops->rename, &local->loc, - &local->newloc); + &local->newloc, NULL); if (!--call_count) break; } @@ -1374,14 +1448,15 @@ afr_rename_done (call_frame_t *frame, xlator_t *this) int afr_rename (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc) + loc_t *oldloc, loc_t *newloc, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - int ret = -1; - int op_ret = -1; - int op_errno = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; + int nlockee = 0; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -1389,47 +1464,90 @@ afr_rename (call_frame_t *frame, xlator_t *this, priv = this->private; + QUORUM_CHECK(rename,out); + transaction_frame = copy_frame (frame); if (!transaction_frame) { + op_errno = ENOMEM; goto out; } - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); + local = transaction_frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret 
< 0) goto out; - } - - transaction_frame->local = local; loc_copy (&local->loc, oldloc); loc_copy (&local->newloc, newloc); local->read_child_index = afr_inode_get_read_ctx (this, oldloc->inode, NULL); + local->op = GF_FOP_RENAME; local->transaction.fop = afr_rename_wind; local->transaction.done = afr_rename_done; local->transaction.unwind = afr_rename_unwind; - afr_build_parent_loc (&local->transaction.parent_loc, oldloc); - afr_build_parent_loc (&local->transaction.new_parent_loc, newloc); + ret = afr_build_parent_loc (&local->transaction.parent_loc, oldloc, + &op_errno); + if (ret) + goto out; + ret = afr_build_parent_loc (&local->transaction.new_parent_loc, newloc, + &op_errno); + if (ret) + goto out; local->transaction.main_frame = frame; local->transaction.basename = AFR_BASENAME (oldloc->path); local->transaction.new_basename = AFR_BASENAME (newloc->path); + int_lock = &local->internal_lock; + + int_lock->lockee_count = nlockee = 0; + ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local, + &local->transaction.new_parent_loc, + local->transaction.new_basename, + priv->child_count); + if (ret) + goto out; - afr_transaction (transaction_frame, this, AFR_ENTRY_RENAME_TRANSACTION); + nlockee++; + ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local, + &local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) + goto out; + + nlockee++; + if (local->newloc.inode && IA_ISDIR (local->newloc.inode->ia_type)) { + ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local, + &local->newloc, + NULL, + priv->child_count); + if (ret) + goto out; + + nlockee++; + } + qsort (int_lock->lockee, nlockee, sizeof (*int_lock->lockee), + afr_entry_lockee_cmp); + int_lock->lockee_count = nlockee; + + ret = afr_transaction (transaction_frame, this, AFR_ENTRY_RENAME_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { + if (ret < 0) { if (transaction_frame) 
AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (rename, frame, op_ret, op_errno, - NULL, NULL, NULL, NULL, NULL); + AFR_STACK_UNWIND (rename, frame, -1, op_errno, + NULL, NULL, NULL, NULL, NULL, NULL); } return 0; @@ -1459,8 +1577,9 @@ afr_unlink_unwind (call_frame_t *frame, xlator_t *this) if (main_frame) { AFR_STACK_UNWIND (unlink, main_frame, local->op_ret, local->op_errno, - &local->cont.unlink.preparent, - &local->cont.unlink.postparent); + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, + NULL); } return 0; @@ -1470,7 +1589,7 @@ afr_unlink_unwind (call_frame_t *frame, xlator_t *this) int afr_unlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { afr_local_t * local = NULL; int call_count = -1; @@ -1483,36 +1602,15 @@ afr_unlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (child_index == local->read_child_index) { local->read_child_returned = _gf_true; } - - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.unlink.preparent = *preparent; - local->cont.unlink.postparent = *postparent; - } - - if (child_index == local->read_child_index) { - local->cont.unlink.preparent = *preparent; - local->cont.unlink.postparent = *postparent; - } - - local->success_count++; - } - - local->op_errno = op_errno; + __dir_entry_fop_common_cbk (frame, child_index, this, + op_ret, op_errno, NULL, NULL, + preparent, postparent, NULL, NULL); } UNLOCK (&frame->lock); call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.unwind (frame, this); - - local->transaction.resume (frame, this); - } + if (call_count == 0) + afr_dir_fop_done (frame, this); return 0; } @@ -1545,7 +1643,8 @@ afr_unlink_wind (call_frame_t *frame, 
xlator_t *this) (void *) (long) i, priv->children[i], priv->children[i]->fops->unlink, - &local->loc); + &local->loc, local->xflag, + local->xdata_req); if (!--call_count) break; @@ -1571,14 +1670,14 @@ afr_unlink_done (call_frame_t *frame, xlator_t *this) int32_t afr_unlink (call_frame_t *frame, xlator_t *this, - loc_t *loc) + loc_t *loc, int xflag, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - int ret = -1; - int op_ret = -1; - int op_errno = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -1586,41 +1685,62 @@ afr_unlink (call_frame_t *frame, xlator_t *this, priv = this->private; + QUORUM_CHECK(unlink,out); + transaction_frame = copy_frame (frame); if (!transaction_frame) { + op_errno = ENOMEM; goto out; } - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); + local = transaction_frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } - - transaction_frame->local = local; loc_copy (&local->loc, loc); + local->xflag = xflag; + if (xdata) + local->xdata_req = dict_ref (xdata); + local->op = GF_FOP_UNLINK; local->transaction.fop = afr_unlink_wind; local->transaction.done = afr_unlink_done; local->transaction.unwind = afr_unlink_unwind; - afr_build_parent_loc (&local->transaction.parent_loc, loc); + ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, + &op_errno); + if (ret) + goto out; local->transaction.main_frame = frame; local->transaction.basename = AFR_BASENAME (loc->path); + int_lock = &local->internal_lock; + + int_lock->lockee_count = 0; + ret = afr_init_entry_lockee (&int_lock->lockee[0], local, + 
&local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) + goto out; - afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + int_lock->lockee_count++; + ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { + if (ret < 0) { if (transaction_frame) AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (unlink, frame, op_ret, op_errno, - NULL, NULL); + AFR_STACK_UNWIND (unlink, frame, -1, op_errno, + NULL, NULL, NULL); } return 0; @@ -1652,8 +1772,9 @@ afr_rmdir_unwind (call_frame_t *frame, xlator_t *this) if (main_frame) { AFR_STACK_UNWIND (rmdir, main_frame, local->op_ret, local->op_errno, - &local->cont.rmdir.preparent, - &local->cont.rmdir.postparent); + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, + NULL); } return 0; @@ -1663,7 +1784,7 @@ afr_rmdir_unwind (call_frame_t *frame, xlator_t *this) int afr_rmdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { afr_local_t * local = NULL; int call_count = -1; @@ -1677,36 +1798,22 @@ afr_rmdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (child_index == read_child) { local->read_child_returned = _gf_true; } - if (afr_fop_failed (op_ret, op_errno) && (op_errno != ENOTEMPTY)) afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.rmdir.preparent = *preparent; - local->cont.rmdir.postparent = *postparent; - - } - - if (child_index == read_child) { - local->cont.rmdir.preparent = *preparent; - local->cont.rmdir.postparent = *postparent; - } - - local->success_count++; - } - local->op_errno = op_errno; + local->child_errno[child_index] = op_errno; + if (op_ret > -1) + 
__dir_entry_fop_common_cbk (frame, child_index, this, + op_ret, op_errno, NULL, NULL, + preparent, postparent, NULL, + NULL); + } UNLOCK (&frame->lock); call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.unwind (frame, this); - local->transaction.resume (frame, this); - } + if (call_count == 0) + afr_dir_fop_done (frame, this); return 0; } @@ -1739,7 +1846,8 @@ afr_rmdir_wind (call_frame_t *frame, xlator_t *this) (void *) (long) i, priv->children[i], priv->children[i]->fops->rmdir, - &local->loc, local->cont.rmdir.flags); + &local->loc, local->cont.rmdir.flags, + NULL); if (!--call_count) break; @@ -1765,14 +1873,15 @@ afr_rmdir_done (call_frame_t *frame, xlator_t *this) int afr_rmdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, int flags) + loc_t *loc, int flags, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - int ret = -1; - int op_ret = -1; - int op_errno = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; + int nlockee = 0; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -1780,42 +1889,71 @@ afr_rmdir (call_frame_t *frame, xlator_t *this, priv = this->private; + QUORUM_CHECK(rmdir,out); + transaction_frame = copy_frame (frame); if (!transaction_frame) { + op_errno = ENOMEM; goto out; } - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); + local = transaction_frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } - - transaction_frame->local = local; local->cont.rmdir.flags = flags; loc_copy (&local->loc, loc); + local->op = GF_FOP_RMDIR; local->transaction.fop = afr_rmdir_wind; local->transaction.done = afr_rmdir_done; local->transaction.unwind 
= afr_rmdir_unwind; - afr_build_parent_loc (&local->transaction.parent_loc, loc); + ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, + &op_errno); + if (ret) + goto out; local->transaction.main_frame = frame; local->transaction.basename = AFR_BASENAME (loc->path); + int_lock = &local->internal_lock; + + int_lock->lockee_count = nlockee = 0; + ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local, + &local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) + goto out; + + nlockee++; + ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local, + &local->loc, + NULL, + priv->child_count); + if (ret) + goto out; - afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + nlockee++; + qsort (int_lock->lockee, nlockee, sizeof (*int_lock->lockee), + afr_entry_lockee_cmp); + int_lock->lockee_count = nlockee; - op_ret = 0; + ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + ret = 0; out: - if (op_ret == -1) { + if (ret < 0) { if (transaction_frame) AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (rmdir, frame, op_ret, op_errno, - NULL, NULL); + AFR_STACK_UNWIND (rmdir, frame, -1, op_errno, NULL, NULL, NULL); } return 0; diff --git a/xlators/cluster/afr/src/afr-dir-write.h b/xlators/cluster/afr/src/afr-dir-write.h index 0290c6350..02f0a3682 100644 --- a/xlators/cluster/afr/src/afr-dir-write.h +++ b/xlators/cluster/afr/src/afr-dir-write.h @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. 
- - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef __DIR_WRITE_H__ @@ -23,38 +14,34 @@ int32_t afr_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, mode_t mode, - fd_t *fd, dict_t *params); + mode_t umask, fd_t *fd, dict_t *xdata); int32_t afr_mknod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dev_t dev, dict_t *params); + loc_t *loc, mode_t mode, dev_t dev, mode_t umask, dict_t *xdata); int32_t afr_mkdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dict_t *params); + loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata); int32_t afr_unlink (call_frame_t *frame, xlator_t *this, - loc_t *loc); + loc_t *loc, int xflag, dict_t *xdata); int32_t afr_rmdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, int flags); + loc_t *loc, int flags, dict_t *xdata); int32_t afr_link (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc); + loc_t *oldloc, loc_t *newloc, dict_t *xdata); int32_t afr_rename (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc); + loc_t *oldloc, loc_t *newloc, dict_t *xdata); int afr_symlink (call_frame_t *frame, xlator_t *this, - const char *linkpath, loc_t *oldloc, dict_t *params); - -int32_t -afr_setdents (call_frame_t *frame, xlator_t *this, - fd_t *fd, int32_t 
flags, dir_entry_t *entries, int32_t count); + const char *linkpath, loc_t *oldloc, mode_t umask, dict_t *params); #endif /* __DIR_WRITE_H__ */ diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c index 82a9d27c0..e06e3b2f2 100644 --- a/xlators/cluster/afr/src/afr-inode-read.c +++ b/xlators/cluster/afr/src/afr-inode-read.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ @@ -58,7 +49,7 @@ int32_t afr_access_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) + xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_private_t * priv = NULL; afr_local_t * local = NULL; @@ -92,12 +83,13 @@ afr_access_cbk (call_frame_t *frame, void *cookie, (void *) (long) read_child, children[next_call_child], children[next_call_child]->fops->access, - &local->loc, local->cont.access.mask); + &local->loc, local->cont.access.mask, + NULL); } out: if (unwind) { - AFR_STACK_UNWIND (access, frame, op_ret, op_errno); + AFR_STACK_UNWIND (access, frame, op_ret, op_errno, xdata); } return 0; @@ -105,15 +97,16 @@ out: int32_t -afr_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask) +afr_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask, + dict_t *xdata) { afr_private_t *priv = NULL; xlator_t **children = NULL; int call_child = 0; afr_local_t *local = NULL; - int32_t op_ret = -1; int32_t op_errno = 0; int32_t read_child = -1; + int ret = -1; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -124,14 +117,14 @@ afr_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask) children = priv->children; - ALLOC_OR_GOTO (local, afr_local_t, out); - frame->local = local; + AFR_SBRAIN_CHECK_LOC (loc, out); - op_ret = AFR_LOCAL_INIT (local, priv); - if (op_ret < 0) { - op_errno = -op_ret; + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; + + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } local->fresh_children = afr_children_create (priv->child_count); if (!local->fresh_children) { @@ -142,13 +135,12 @@ afr_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask) read_child = afr_inode_get_read_ctx (this, loc->inode, local->fresh_children); - op_ret = afr_get_call_child (this, local->child_up, read_child, + ret = afr_get_call_child (this, local->child_up, read_child, 
local->fresh_children, &call_child, &local->cont.access.last_index); - if (op_ret < 0) { - op_errno = -op_ret; - op_ret = -1; + if (ret < 0) { + op_errno = -ret; goto out; } @@ -159,13 +151,12 @@ afr_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask) (void *) (long) call_child, children[call_child], children[call_child]->fops->access, - loc, mask); + loc, mask, xdata); - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (access, frame, op_ret, op_errno); - } + if (ret < 0) + AFR_STACK_UNWIND (access, frame, -1, op_errno, NULL); return 0; } @@ -177,7 +168,7 @@ out: int32_t afr_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iatt *buf) + struct iatt *buf, dict_t *xdata) { afr_private_t * priv = NULL; afr_local_t * local = NULL; @@ -211,12 +202,12 @@ afr_stat_cbk (call_frame_t *frame, void *cookie, (void *) (long) read_child, children[next_call_child], children[next_call_child]->fops->stat, - &local->loc); + &local->loc, NULL); } out: if (unwind) { - AFR_STACK_UNWIND (stat, frame, op_ret, op_errno, buf); + AFR_STACK_UNWIND (stat, frame, op_ret, op_errno, buf, xdata); } return 0; @@ -224,15 +215,15 @@ out: int32_t -afr_stat (call_frame_t *frame, xlator_t *this, loc_t *loc) +afr_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; xlator_t **children = NULL; int call_child = 0; - int32_t op_ret = -1; int32_t op_errno = 0; int32_t read_child = -1; + int ret = -1; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -243,13 +234,14 @@ afr_stat (call_frame_t *frame, xlator_t *this, loc_t *loc) children = priv->children; - ALLOC_OR_GOTO (local, afr_local_t, out); - frame->local = local; - op_ret = AFR_LOCAL_INIT (local, priv); - if (op_ret < 0) { - op_errno = -op_ret; + AFR_SBRAIN_CHECK_LOC (loc, out); + + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; + + ret = 
afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } local->fresh_children = afr_children_create (priv->child_count); if (!local->fresh_children) { @@ -259,13 +251,12 @@ afr_stat (call_frame_t *frame, xlator_t *this, loc_t *loc) read_child = afr_inode_get_read_ctx (this, loc->inode, local->fresh_children); - op_ret = afr_get_call_child (this, local->child_up, read_child, + ret = afr_get_call_child (this, local->child_up, read_child, local->fresh_children, &call_child, &local->cont.stat.last_index); - if (op_ret < 0) { - op_errno = -op_ret; - op_ret = -1; + if (ret < 0) { + op_errno = -ret; goto out; } loc_copy (&local->loc, loc); @@ -273,13 +264,12 @@ afr_stat (call_frame_t *frame, xlator_t *this, loc_t *loc) STACK_WIND_COOKIE (frame, afr_stat_cbk, (void *) (long) call_child, children[call_child], children[call_child]->fops->stat, - loc); + loc, xdata); - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (stat, frame, op_ret, op_errno, NULL); - } + if (ret < 0) + AFR_STACK_UNWIND (stat, frame, -1, op_errno, NULL, NULL); return 0; } @@ -291,7 +281,8 @@ out: int32_t afr_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf) + int32_t op_ret, int32_t op_errno, struct iatt *buf, + dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; @@ -325,12 +316,12 @@ afr_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, (void *) (long) read_child, children[next_call_child], children[next_call_child]->fops->fstat, - local->fd); + local->fd, NULL); } out: if (unwind) { - AFR_STACK_UNWIND (fstat, frame, op_ret, op_errno, buf); + AFR_STACK_UNWIND (fstat, frame, op_ret, op_errno, buf, xdata); } return 0; @@ -339,15 +330,15 @@ out: int32_t afr_fstat (call_frame_t *frame, xlator_t *this, - fd_t *fd) + fd_t *fd, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; xlator_t **children = NULL; int call_child = 0; - int32_t op_ret = -1; int32_t 
op_errno = 0; int32_t read_child = 0; + int ret = -1; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -361,14 +352,14 @@ afr_fstat (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (fd->inode, out); - ALLOC_OR_GOTO (local, afr_local_t, out); - frame->local = local; + AFR_SBRAIN_CHECK_FD (fd, out); - op_ret = AFR_LOCAL_INIT (local, priv); - if (op_ret < 0) { - op_errno = -op_ret; + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; + + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } local->fresh_children = afr_children_create (priv->child_count); if (!local->fresh_children) { @@ -381,34 +372,28 @@ afr_fstat (call_frame_t *frame, xlator_t *this, - op_ret = afr_get_call_child (this, local->child_up, read_child, + ret = afr_get_call_child (this, local->child_up, read_child, local->fresh_children, &call_child, &local->cont.fstat.last_index); - if (op_ret < 0) { - op_errno = -op_ret; - op_ret = -1; + if (ret < 0) { + op_errno = -ret; goto out; } local->fd = fd_ref (fd); - op_ret = afr_open_fd_fix (frame, this, _gf_false); - if (op_ret) { - op_errno = -op_ret; - op_ret = -1; - goto out; - } + afr_open_fd_fix (fd, this); + STACK_WIND_COOKIE (frame, afr_fstat_cbk, (void *) (long) call_child, children[call_child], children[call_child]->fops->fstat, - fd); + fd, xdata); - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (fstat, frame, op_ret, op_errno, NULL); - } + if (ret < 0) + AFR_STACK_UNWIND (fstat, frame, -1, op_errno, NULL, NULL); return 0; } @@ -420,7 +405,7 @@ out: int32_t afr_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - const char *buf, struct iatt *sbuf) + const char *buf, struct iatt *sbuf, dict_t *xdata) { afr_private_t * priv = NULL; afr_local_t * local = NULL; @@ -454,12 +439,13 @@ afr_readlink_cbk (call_frame_t *frame, void *cookie, children[next_call_child], children[next_call_child]->fops->readlink, &local->loc, - 
local->cont.readlink.size); + local->cont.readlink.size, NULL); } out: if (unwind) { - AFR_STACK_UNWIND (readlink, frame, op_ret, op_errno, buf, sbuf); + AFR_STACK_UNWIND (readlink, frame, op_ret, op_errno, buf, sbuf, + xdata); } return 0; @@ -468,15 +454,15 @@ out: int32_t afr_readlink (call_frame_t *frame, xlator_t *this, - loc_t *loc, size_t size) + loc_t *loc, size_t size, dict_t *xdata) { afr_private_t *priv = NULL; xlator_t **children = NULL; int call_child = 0; afr_local_t *local = NULL; - int32_t op_ret = -1; int32_t op_errno = 0; int32_t read_child = -1; + int ret = -1; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -487,13 +473,14 @@ afr_readlink (call_frame_t *frame, xlator_t *this, children = priv->children; - ALLOC_OR_GOTO (local, afr_local_t, out); - frame->local = local; - op_ret = AFR_LOCAL_INIT (local, priv); - if (op_ret < 0) { - op_errno = -op_ret; + AFR_SBRAIN_CHECK_LOC (loc, out); + + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; + + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } local->fresh_children = afr_children_create (priv->child_count); if (!local->fresh_children) { @@ -502,13 +489,12 @@ afr_readlink (call_frame_t *frame, xlator_t *this, } read_child = afr_inode_get_read_ctx (this, loc->inode, local->fresh_children); - op_ret = afr_get_call_child (this, local->child_up, read_child, + ret = afr_get_call_child (this, local->child_up, read_child, local->fresh_children, &call_child, &local->cont.readlink.last_index); - if (op_ret < 0) { - op_errno = -op_ret; - op_ret = -1; + if (ret < 0) { + op_errno = -ret; goto out; } @@ -520,13 +506,12 @@ afr_readlink (call_frame_t *frame, xlator_t *this, (void *) (long) call_child, children[call_child], children[call_child]->fops->readlink, - loc, size); + loc, size, xdata); - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (readlink, frame, op_ret, op_errno, NULL, NULL); - } + if (ret < 0) + AFR_STACK_UNWIND 
(readlink, frame, -1, op_errno, NULL, NULL, NULL); return 0; } @@ -541,7 +526,7 @@ struct _xattr_key { }; -void +int __gather_xattr_keys (dict_t *dict, char *key, data_t *value, void *data) { @@ -553,13 +538,14 @@ __gather_xattr_keys (dict_t *dict, char *key, data_t *value, xkey = GF_CALLOC (1, sizeof (*xkey), gf_afr_mt_xattr_key); if (!xkey) - return; + return -1; xkey->key = key; INIT_LIST_HEAD (&xkey->list); list_add_tail (&xkey->list, list); } + return 0; } @@ -589,7 +575,7 @@ __filter_xattrs (dict_t *dict) int32_t afr_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *dict) + dict_t *dict, dict_t *xdata) { afr_private_t * priv = NULL; afr_local_t * local = NULL; @@ -623,7 +609,8 @@ afr_getxattr_cbk (call_frame_t *frame, void *cookie, children[next_call_child], children[next_call_child]->fops->getxattr, &local->loc, - local->cont.getxattr.name); + local->cont.getxattr.name, + NULL); } out: @@ -631,39 +618,620 @@ out: if (op_ret >= 0 && dict) __filter_xattrs (dict); - AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict); + AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, xdata); + } + + return 0; +} + +int32_t +afr_getxattr_unwind (call_frame_t *frame, int op_ret, int op_errno, + dict_t *dict, dict_t *xdata) + +{ + AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, xdata); + return 0; +} + +int32_t +afr_fgetxattr_clrlk_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + xlator_t **children = NULL; + dict_t *xattr = NULL; + char *tmp_report = NULL; + char lk_summary[1024] = {0,}; + int serz_len = 0; + int32_t callcnt = 0; + long int cky = 0; + int ret = 0; + + priv = this->private; + children = priv->children; + + local = frame->local; + cky = (long) cookie; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + if (op_ret == -1) + 
local->child_errno[cky] = op_errno; + + if (!local->dict) + local->dict = dict_new (); + if (local->dict) { + ret = dict_get_str (dict, local->cont.getxattr.name, + &tmp_report); + if (ret) + goto unlock; + ret = dict_set_dynstr (local->dict, + children[cky]->name, + gf_strdup (tmp_report)); + if (ret) + goto unlock; + } + } +unlock: + UNLOCK (&frame->lock); + + if (!callcnt) { + xattr = dict_new (); + if (!xattr) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + ret = dict_serialize_value_with_delim (local->dict, + lk_summary, + &serz_len, '\n'); + if (ret) { + op_ret = -1; + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "Error serializing dictionary"); + goto unwind; + } + if (serz_len == -1) + snprintf (lk_summary, sizeof (lk_summary), + "No locks cleared."); + ret = dict_set_dynstr (xattr, local->cont.getxattr.name, + gf_strdup (lk_summary)); + if (ret) { + op_ret = -1; + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "Error setting dictionary"); + goto unwind; + } + + unwind: + // Updating child_errno with more recent 'events' + local->child_errno[cky] = op_errno; + op_errno = afr_resultant_errno_get (NULL, local->child_errno, + priv->child_count); + AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, xattr, + xdata); + + if (xattr) + dict_unref (xattr); + } + + return ret; +} + +int32_t +afr_getxattr_clrlk_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + xlator_t **children = NULL; + dict_t *xattr = NULL; + char *tmp_report = NULL; + char lk_summary[1024] = {0,}; + int serz_len = 0; + int32_t callcnt = 0; + long int cky = 0; + int ret = 0; + + priv = this->private; + children = priv->children; + + local = frame->local; + cky = (long) cookie; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + if (op_ret == -1) + local->child_errno[cky] = op_errno; + + if (!local->dict) + 
local->dict = dict_new (); + if (local->dict) { + ret = dict_get_str (dict, local->cont.getxattr.name, + &tmp_report); + if (ret) + goto unlock; + ret = dict_set_dynstr (local->dict, + children[cky]->name, + gf_strdup (tmp_report)); + if (ret) + goto unlock; + } + } +unlock: + UNLOCK (&frame->lock); + + if (!callcnt) { + xattr = dict_new (); + if (!xattr) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + ret = dict_serialize_value_with_delim (local->dict, + lk_summary, + &serz_len, '\n'); + if (ret) { + op_ret = -1; + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "Error serializing dictionary"); + goto unwind; + } + if (serz_len == -1) + snprintf (lk_summary, sizeof (lk_summary), + "No locks cleared."); + ret = dict_set_dynstr (xattr, local->cont.getxattr.name, + gf_strdup (lk_summary)); + if (ret) { + op_ret = -1; + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "Error setting dictionary"); + goto unwind; + } + + unwind: + // Updating child_errno with more recent 'events' + local->child_errno[cky] = op_errno; + op_errno = afr_resultant_errno_get (NULL, local->child_errno, + priv->child_count); + AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, xattr, xdata); + + if (xattr) + dict_unref (xattr); + } + + return ret; +} + +/** + * node-uuid cbk uses next child querying mechanism + */ +int32_t +afr_getxattr_node_uuid_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + xlator_t **children = NULL; + int unwind = 1; + int curr_call_child = 0; + + priv = this->private; + children = priv->children; + + local = frame->local; + + if (op_ret == -1) { /** query the _next_ child */ + + /** + * _current_ becomes _next_ + * If done with all childs and yet no success; give up ! 
+ */ + curr_call_child = (int) ((long)cookie); + if (++curr_call_child == priv->child_count) + goto unwind; + + gf_log (this->name, GF_LOG_WARNING, + "op_ret (-1): Re-querying afr-child (%d/%d)", + curr_call_child, priv->child_count); + + unwind = 0; + STACK_WIND_COOKIE (frame, afr_getxattr_node_uuid_cbk, + (void *) (long) curr_call_child, + children[curr_call_child], + children[curr_call_child]->fops->getxattr, + &local->loc, + local->cont.getxattr.name, + NULL); } + unwind: + if (unwind) + AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, + NULL); + return 0; } int32_t -afr_getxattr_unwind (call_frame_t *frame, - int op_ret, int op_errno, dict_t *dict) +afr_getxattr_lockinfo_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) +{ + int call_cnt = 0, len = 0; + char *lockinfo_buf = NULL; + dict_t *lockinfo = NULL, *newdict = NULL; + afr_local_t *local = NULL; + + LOCK (&frame->lock); + { + local = frame->local; + + call_cnt = --local->call_count; + + if ((op_ret < 0) || (!dict && !xdata)) { + goto unlock; + } + + if (xdata) { + if (!local->xdata_rsp) { + local->xdata_rsp = dict_new (); + if (!local->xdata_rsp) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unlock; + } + } + } + + if (!dict) { + goto unlock; + } + + op_ret = dict_get_ptr_and_len (dict, GF_XATTR_LOCKINFO_KEY, + (void **)&lockinfo_buf, &len); + + if (!lockinfo_buf) { + goto unlock; + } + + if (!local->dict) { + local->dict = dict_new (); + if (!local->dict) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unlock; + } + } + } +unlock: + UNLOCK (&frame->lock); + + if (lockinfo_buf != NULL) { + lockinfo = dict_new (); + if (lockinfo == NULL) { + local->op_ret = -1; + local->op_errno = ENOMEM; + } else { + op_ret = dict_unserialize (lockinfo_buf, len, + &lockinfo); + + if (lockinfo && local->dict) { + dict_copy (lockinfo, local->dict); + } + } + } + + if (xdata && local->xdata_rsp) { + dict_copy (xdata, 
local->xdata_rsp); + } + + if (!call_cnt) { + newdict = dict_new (); + if (!newdict) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unwind; + } + + len = dict_serialized_length (local->dict); + if (len == 0) { + goto unwind; + } + + lockinfo_buf = GF_CALLOC (1, len, gf_common_mt_char); + if (!lockinfo_buf) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unwind; + } + + op_ret = dict_serialize (local->dict, lockinfo_buf); + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = -op_ret; + } + + op_ret = dict_set_dynptr (newdict, GF_XATTR_LOCKINFO_KEY, + (void *)lockinfo_buf, len); + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = -op_ret; + goto unwind; + } + + unwind: + AFR_STACK_UNWIND (getxattr, frame, op_ret, + op_errno, newdict, + local->xdata_rsp); + } + + dict_unref (lockinfo); + return 0; +} + +int32_t +afr_fgetxattr_lockinfo_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) { - AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict); + int call_cnt = 0, len = 0; + char *lockinfo_buf = NULL; + dict_t *lockinfo = NULL, *newdict = NULL; + afr_local_t *local = NULL; + + LOCK (&frame->lock); + { + local = frame->local; + + call_cnt = --local->call_count; + + if ((op_ret < 0) || (!dict && !xdata)) { + goto unlock; + } + + if (xdata) { + if (!local->xdata_rsp) { + local->xdata_rsp = dict_new (); + if (!local->xdata_rsp) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unlock; + } + } + } + + if (!dict) { + goto unlock; + } + + op_ret = dict_get_ptr_and_len (dict, GF_XATTR_LOCKINFO_KEY, + (void **)&lockinfo_buf, &len); + + if (!lockinfo_buf) { + goto unlock; + } + + if (!local->dict) { + local->dict = dict_new (); + if (!local->dict) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unlock; + } + } + } +unlock: + UNLOCK (&frame->lock); + + if (lockinfo_buf != NULL) { + lockinfo = dict_new (); + if (lockinfo == NULL) { + 
local->op_ret = -1; + local->op_errno = ENOMEM; + } else { + op_ret = dict_unserialize (lockinfo_buf, len, + &lockinfo); + + if (lockinfo && local->dict) { + dict_copy (lockinfo, local->dict); + } + } + } + + if (xdata && local->xdata_rsp) { + dict_copy (xdata, local->xdata_rsp); + } + + if (!call_cnt) { + newdict = dict_new (); + if (!newdict) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unwind; + } + + len = dict_serialized_length (local->dict); + if (len <= 0) { + goto unwind; + } + + lockinfo_buf = GF_CALLOC (1, len, gf_common_mt_char); + if (!lockinfo_buf) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unwind; + } + + op_ret = dict_serialize (local->dict, lockinfo_buf); + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = -op_ret; + } + + op_ret = dict_set_dynptr (newdict, GF_XATTR_LOCKINFO_KEY, + (void *)lockinfo_buf, len); + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = -op_ret; + goto unwind; + } + + unwind: + AFR_STACK_UNWIND (fgetxattr, frame, op_ret, + op_errno, newdict, + local->xdata_rsp); + } + + dict_unref (lockinfo); + return 0; } int32_t +afr_fgetxattr_pathinfo_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) +{ + afr_local_t *local = NULL; + int32_t callcnt = 0; + int ret = 0; + char *xattr = NULL; + char *xattr_serz = NULL; + char xattr_cky[1024] = {0,}; + dict_t *nxattr = NULL; + long cky = 0; + int32_t padding = 0; + int32_t tlen = 0; + + if (!frame || !frame->local || !this) { + gf_log ("", GF_LOG_ERROR, "possible NULL deref"); + goto out; + } + + local = frame->local; + cky = (long) cookie; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (!dict || (op_ret < 0)) + goto out; + + if (!local->dict) + local->dict = dict_new (); + + if (local->dict) { + ret = dict_get_str (dict, + local->cont.getxattr.name, + &xattr); + if (ret) + goto out; + + xattr = gf_strdup (xattr); + + (void)snprintf (xattr_cky, 1024, 
"%s-%ld", + local->cont.getxattr.name, cky); + ret = dict_set_dynstr (local->dict, + xattr_cky, xattr); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Cannot set xattr cookie key"); + goto out; + } + + local->cont.getxattr.xattr_len + += strlen (xattr) + 1; + } + } +out: + UNLOCK (&frame->lock); + + if (!callcnt) { + if (!local->cont.getxattr.xattr_len) + goto unwind; + + nxattr = dict_new (); + if (!nxattr) + goto unwind; + + /* extra bytes for decorations (brackets and <>'s) */ + padding += strlen (this->name) + + strlen (AFR_PATHINFO_HEADER) + 4; + local->cont.getxattr.xattr_len += (padding + 2); + + xattr_serz = GF_CALLOC (local->cont.getxattr.xattr_len, + sizeof (char), gf_common_mt_char); + + if (!xattr_serz) + goto unwind; + + /* the xlator info */ + (void) sprintf (xattr_serz, "(<"AFR_PATHINFO_HEADER"%s> ", + this->name); + + /* actual series of pathinfo */ + ret = dict_serialize_value_with_delim (local->dict, + xattr_serz + + strlen (xattr_serz), + &tlen, ' '); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Error serializing" + " dictionary"); + goto unwind; + } + + /* closing part */ + *(xattr_serz + padding + tlen) = ')'; + *(xattr_serz + padding + tlen + 1) = '\0'; + + ret = dict_set_dynstr (nxattr, local->cont.getxattr.name, + xattr_serz); + if (ret) + gf_log (this->name, GF_LOG_ERROR, "Cannot set pathinfo" + " key in dict"); + + unwind: + AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, nxattr, + xdata); + + if (nxattr) + dict_unref (nxattr); + } + + return ret; +} + +int32_t afr_getxattr_pathinfo_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *dict) + dict_t *dict, dict_t *xdata) { - afr_local_t *local = NULL; - int32_t callcnt = 0; - int ret = 0; - char *pathinfo = NULL; - char *pathinfo_serz = NULL; - char pathinfo_cky[1024] = {0,}; - dict_t *xattr = NULL; - long cky = 0; - int32_t padding = 0; - int32_t tlen = 0; + afr_local_t *local = NULL; + int32_t callcnt = 0; + int ret = 0; + 
char *xattr = NULL; + char *xattr_serz = NULL; + char xattr_cky[1024] = {0,}; + dict_t *nxattr = NULL; + long cky = 0; + int32_t padding = 0; + int32_t tlen = 0; if (!frame || !frame->local || !this) { - gf_log (this->name, GF_LOG_ERROR, "possible NULL deref"); + gf_log ("", GF_LOG_ERROR, "possible NULL deref"); goto out; } @@ -681,90 +1249,222 @@ afr_getxattr_pathinfo_cbk (call_frame_t *frame, void *cookie, local->dict = dict_new (); if (local->dict) { - ret = dict_get_str (dict, GF_XATTR_PATHINFO_KEY, &pathinfo); + ret = dict_get_str (dict, + local->cont.getxattr.name, + &xattr); if (ret) goto out; - pathinfo = gf_strdup (pathinfo); + xattr = gf_strdup (xattr); - snprintf (pathinfo_cky, 1024, "%s-%ld", GF_XATTR_PATHINFO_KEY, cky); - ret = dict_set_dynstr (local->dict, pathinfo_cky, pathinfo); + (void)snprintf (xattr_cky, 1024, "%s-%ld", + local->cont.getxattr.name, cky); + ret = dict_set_dynstr (local->dict, + xattr_cky, xattr); if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Cannot set pathinfo cookie key"); + gf_log (this->name, GF_LOG_ERROR, + "Cannot set xattr cookie key"); goto out; } - local->cont.getxattr.pathinfo_len += strlen (pathinfo) + 1; + local->cont.getxattr.xattr_len += strlen (xattr) + 1; } } out: UNLOCK (&frame->lock); if (!callcnt) { - if (!local->cont.getxattr.pathinfo_len) + if (!local->cont.getxattr.xattr_len) goto unwind; - xattr = dict_new (); - if (!xattr) + nxattr = dict_new (); + if (!nxattr) goto unwind; /* extra bytes for decorations (brackets and <>'s) */ - padding = strlen (this->name) + strlen (AFR_PATHINFO_HEADER) + 4; - local->cont.getxattr.pathinfo_len += (padding + 2); + padding += strlen (this->name) + strlen (AFR_PATHINFO_HEADER) + 4; + local->cont.getxattr.xattr_len += (padding + 2); - pathinfo_serz = GF_CALLOC (local->cont.getxattr.pathinfo_len, sizeof (char), - gf_common_mt_char); + xattr_serz = GF_CALLOC (local->cont.getxattr.xattr_len, + sizeof (char), gf_common_mt_char); - if (!pathinfo_serz) + if (!xattr_serz) goto 
unwind; /* the xlator info */ - sprintf (pathinfo_serz, "(<"AFR_PATHINFO_HEADER"%s> ", this->name); + (void) sprintf (xattr_serz, "(<"AFR_PATHINFO_HEADER"%s> ", + this->name); /* actual series of pathinfo */ - ret = dict_serialize_value_with_delim (local->dict, pathinfo_serz + strlen (pathinfo_serz), + ret = dict_serialize_value_with_delim (local->dict, + xattr_serz + strlen (xattr_serz), &tlen, ' '); if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Error serializing dictionary"); + gf_log (this->name, GF_LOG_ERROR, "Error serializing" + " dictionary"); goto unwind; } /* closing part */ - *(pathinfo_serz + padding + tlen) = ')'; - *(pathinfo_serz + padding + tlen + 1) = '\0'; + *(xattr_serz + padding + tlen) = ')'; + *(xattr_serz + padding + tlen + 1) = '\0'; - ret = dict_set_dynstr (xattr, GF_XATTR_PATHINFO_KEY, pathinfo_serz); + ret = dict_set_dynstr (nxattr, local->cont.getxattr.name, + xattr_serz); if (ret) - gf_log (this->name, GF_LOG_ERROR, "Cannot set pathinfo key in dict"); + gf_log (this->name, GF_LOG_ERROR, "Cannot set pathinfo" + " key in dict"); unwind: - AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, xattr); - - if (local->dict) - dict_unref (local->dict); + AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, nxattr, + xdata); - if (xattr) - dict_unref (xattr); + if (nxattr) + dict_unref (nxattr); } return ret; } +static int +afr_aggregate_stime_xattr (dict_t *this, char *key, data_t *value, void *data) +{ + int ret = 0; + + if (fnmatch (GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0) + ret = gf_get_min_stime (THIS, data, key, value); + + return ret; +} + int32_t -afr_getxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name) +afr_common_getxattr_stime_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) +{ + afr_local_t *local = NULL; + int32_t callcnt = 0; + + if (!frame || !frame->local || !this) { + gf_log ("", GF_LOG_ERROR, "possible NULL deref"); + 
goto out; + } + + local = frame->local; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (!dict || (op_ret < 0)) { + local->op_errno = op_errno; + goto cleanup; + } + + if (!local->dict) + local->dict = dict_copy_with_ref (dict, NULL); + else + dict_foreach (dict, afr_aggregate_stime_xattr, + local->dict); + local->op_ret = 0; + } + +cleanup: + UNLOCK (&frame->lock); + + if (!callcnt) { + AFR_STACK_UNWIND (getxattr, frame, local->op_ret, + local->op_errno, local->dict, xdata); + } + +out: + return 0; +} + + +static gf_boolean_t +afr_is_special_xattr (const char *name, fop_getxattr_cbk_t *cbk, + gf_boolean_t is_fgetxattr) +{ + gf_boolean_t is_spl = _gf_true; + + GF_ASSERT (cbk); + if (!cbk) { + is_spl = _gf_false; + goto out; + } + + if (!strcmp (name, GF_XATTR_PATHINFO_KEY)) { + if (is_fgetxattr) { + *cbk = afr_fgetxattr_pathinfo_cbk; + } else { + *cbk = afr_getxattr_pathinfo_cbk; + } + } else if (!strncmp (name, GF_XATTR_CLRLK_CMD, + strlen (GF_XATTR_CLRLK_CMD))) { + if (is_fgetxattr) { + *cbk = afr_fgetxattr_clrlk_cbk; + } else { + *cbk = afr_getxattr_clrlk_cbk; + } + } else if (!strncmp (name, GF_XATTR_LOCKINFO_KEY, + strlen (GF_XATTR_LOCKINFO_KEY))) { + if (is_fgetxattr) { + *cbk = afr_fgetxattr_lockinfo_cbk; + } else { + *cbk = afr_getxattr_lockinfo_cbk; + } + } else if (fnmatch (GF_XATTR_STIME_PATTERN, name, FNM_NOESCAPE) == 0) { + *cbk = afr_common_getxattr_stime_cbk; + } else { + is_spl = _gf_false; + } + +out: + return is_spl; +} + +static void +afr_getxattr_frm_all_children (xlator_t *this, call_frame_t *frame, + const char *name, loc_t *loc, + fop_getxattr_cbk_t cbk) { - afr_private_t *priv = NULL; - xlator_t **children = NULL; - int call_child = 0; - afr_local_t *local = NULL; - xlator_list_t *trav = NULL; - xlator_t **sub_volumes = NULL; - int i = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - int32_t read_child = -1; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + xlator_t **children = NULL; + int i = 0; + priv = 
this->private; + children = priv->children; + + local = frame->local; + local->call_count = priv->child_count; + + for (i = 0; i < priv->child_count; i++) { + STACK_WIND_COOKIE (frame, cbk, + (void *) (long) i, + children[i], children[i]->fops->getxattr, + loc, name, NULL); + } + return; +} + +int32_t +afr_getxattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *name, dict_t *xdata) +{ + afr_private_t *priv = NULL; + xlator_t **children = NULL; + int call_child = 0; + afr_local_t *local = NULL; + xlator_list_t *trav = NULL; + xlator_t **sub_volumes = NULL; + int i = 0; + int32_t op_errno = 0; + int32_t read_child = -1; + int ret = -1; + fop_getxattr_cbk_t cbk = NULL; + int afr_xtime_gauge[MCNT_MAX] = {0,}; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -775,48 +1475,108 @@ afr_getxattr (call_frame_t *frame, xlator_t *this, children = priv->children; - ALLOC_OR_GOTO (local, afr_local_t, out); - frame->local = local; + AFR_SBRAIN_CHECK_LOC (loc, out); - op_ret = AFR_LOCAL_INIT (local, priv); - if (op_ret < 0) { - op_errno = -op_ret; + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; + + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } loc_copy (&local->loc, loc); - if (name) - local->cont.getxattr.name = gf_strdup (name); + if (!name) + goto no_name; + + local->cont.getxattr.name = gf_strdup (name); + + if (!strncmp (name, AFR_XATTR_PREFIX, + strlen (AFR_XATTR_PREFIX))) { + gf_log (this->name, GF_LOG_INFO, + "%s: no data present for key %s", + loc->path, name); + op_errno = ENODATA; + goto out; + } + if ((strcmp (GF_XATTR_MARKER_KEY, name) == 0) + && (GF_CLIENT_PID_GSYNCD == frame->root->pid)) { + local->marker.call_count = priv->child_count; + + sub_volumes = alloca ( priv->child_count * sizeof (xlator_t *)); + for (i = 0, trav = this->children; trav ; + trav = trav->next, i++) { + + *(sub_volumes + i) = trav->xlator; + } + + if (cluster_getmarkerattr (frame, this, loc, name, + local, 
afr_getxattr_unwind, + sub_volumes, + priv->child_count, + MARKER_UUID_TYPE, + marker_uuid_default_gauge, + priv->vol_uuid)) { - if (name) { - if (!strncmp (name, AFR_XATTR_PREFIX, - strlen (AFR_XATTR_PREFIX))) { gf_log (this->name, GF_LOG_INFO, - "%s: no data present for key %s", + "%s: failed to get marker attr (%s)", loc->path, name); - op_errno = ENODATA; + op_errno = EINVAL; goto out; } - if ((strcmp (GF_XATTR_MARKER_KEY, name) == 0) - && (-1 == frame->root->pid)) { + return 0; + } + + /* + * if we are doing getxattr with pathinfo as the key then we + * collect information from all childs + */ + if (afr_is_special_xattr (name, &cbk, 0)) { + afr_getxattr_frm_all_children (this, frame, name, + loc, cbk); + return 0; + } + + if (XATTR_IS_NODE_UUID (name)) { + i = 0; + STACK_WIND_COOKIE (frame, afr_getxattr_node_uuid_cbk, + (void *) (long) i, + children[i], + children[i]->fops->getxattr, + loc, name, xdata); + return 0; + } + + if (*priv->vol_uuid) { + if ((match_uuid_local (name, priv->vol_uuid) == 0) + && (GF_CLIENT_PID_GSYNCD == frame->root->pid)) { local->marker.call_count = priv->child_count; - sub_volumes = alloca ( priv->child_count * sizeof (xlator_t *)); + sub_volumes = alloca ( priv->child_count + * sizeof (xlator_t *)); for (i = 0, trav = this->children; trav ; trav = trav->next, i++) { *(sub_volumes + i) = trav->xlator; + } - if (cluster_getmarkerattr (frame, this, loc, name, - local, afr_getxattr_unwind, + /* don't err out on getting ENOTCONN (brick down) + * from a subset of the bricks + */ + memcpy (afr_xtime_gauge, marker_xtime_default_gauge, + sizeof (afr_xtime_gauge)); + afr_xtime_gauge[MCNT_NOTFOUND] = 0; + afr_xtime_gauge[MCNT_ENOTCONN] = 0; + if (cluster_getmarkerattr (frame, this, loc, + name, local, + afr_getxattr_unwind, sub_volumes, priv->child_count, - MARKER_UUID_TYPE, + MARKER_XTIME_TYPE, + afr_xtime_gauge, priv->vol_uuid)) { - gf_log (this->name, GF_LOG_INFO, "%s: failed to get marker attr (%s)", loc->path, name); @@ -826,65 +1586,187 
@@ afr_getxattr (call_frame_t *frame, xlator_t *this, return 0; } + } - /* - * if we are doing getxattr with pathinfo as the key then we - * collect information from all childs - */ - if (strncmp (name, GF_XATTR_PATHINFO_KEY, - strlen (GF_XATTR_PATHINFO_KEY)) == 0) { - - local->call_count = priv->child_count; - for (i = 0; i < priv->child_count; i++) { - STACK_WIND_COOKIE (frame, afr_getxattr_pathinfo_cbk, - (void *) (long) i, - children[i], children[i]->fops->getxattr, - loc, name); - } +no_name: + local->fresh_children = afr_children_create (priv->child_count); + if (!local->fresh_children) { + op_errno = ENOMEM; + goto out; + } - return 0; - } + read_child = afr_inode_get_read_ctx (this, loc->inode, + local->fresh_children); + ret = afr_get_call_child (this, local->child_up, read_child, + local->fresh_children, + &call_child, + &local->cont.getxattr.last_index); + if (ret < 0) { + op_errno = -ret; + goto out; + } - if (*priv->vol_uuid) { - if ((match_uuid_local (name, priv->vol_uuid) == 0) - && (-1 == frame->root->pid)) { + STACK_WIND_COOKIE (frame, afr_getxattr_cbk, + (void *) (long) call_child, + children[call_child], + children[call_child]->fops->getxattr, + loc, name, xdata); - local->marker.call_count = priv->child_count; + ret = 0; +out: + if (ret < 0) + AFR_STACK_UNWIND (getxattr, frame, -1, op_errno, NULL, NULL); + return 0; +} - sub_volumes = alloca ( priv->child_count * sizeof (xlator_t *)); - for (i = 0, trav = this->children; trav ; - trav = trav->next, i++) { +/* {{{ fgetxattr */ - *(sub_volumes + i) = trav->xlator; - } +int32_t +afr_fgetxattr_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + xlator_t ** children = NULL; + int unwind = 1; + int32_t *last_index = NULL; + int32_t next_call_child = -1; + int32_t read_child = -1; + int32_t *fresh_children = NULL; - if (cluster_getmarkerattr (frame, this, loc, - name, 
local, - afr_getxattr_unwind, - sub_volumes, - priv->child_count, - MARKER_XTIME_TYPE, - priv->vol_uuid)) { - gf_log (this->name, GF_LOG_INFO, - "%s: failed to get marker attr (%s)", - loc->path, name); - op_errno = EINVAL; - goto out; - } + priv = this->private; + children = priv->children; - return 0; - } - } + local = frame->local; + + read_child = (long) cookie; + + if (op_ret == -1) { + last_index = &local->cont.getxattr.last_index; + fresh_children = local->fresh_children; + next_call_child = afr_next_call_child (fresh_children, + local->child_up, + priv->child_count, + last_index, read_child); + if (next_call_child < 0) + goto out; + + unwind = 0; + STACK_WIND_COOKIE (frame, afr_fgetxattr_cbk, + (void *) (long) read_child, + children[next_call_child], + children[next_call_child]->fops->fgetxattr, + local->fd, + local->cont.getxattr.name, + NULL); + } + +out: + if (unwind) { + if (op_ret >= 0 && dict) + __filter_xattrs (dict); + + AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, dict, + xdata); } + return 0; +} + +int32_t +afr_fgetxattr_unwind (call_frame_t *frame, + int op_ret, int op_errno, dict_t *dict, dict_t *xdata) + +{ + AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, dict, xdata); + return 0; +} + +static void +afr_fgetxattr_frm_all_children (xlator_t *this, call_frame_t *frame, + const char *name, fd_t *fd, + fop_fgetxattr_cbk_t cbk) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + xlator_t **children = NULL; + int i = 0; + + priv = this->private; + children = priv->children; + + local = frame->local; + local->call_count = priv->child_count; + + for (i = 0; i < priv->child_count; i++) { + STACK_WIND_COOKIE (frame, cbk, + (void *) (long) i, + children[i], children[i]->fops->fgetxattr, + fd, name, NULL); + } + + return; +} + +int32_t +afr_fgetxattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *name, dict_t *xdata) +{ + afr_private_t *priv = NULL; + xlator_t **children = NULL; + int call_child = 0; + afr_local_t 
*local = NULL; + int32_t op_ret = -1; + int32_t op_errno = 0; + int32_t read_child = -1; + fop_fgetxattr_cbk_t cbk = NULL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + VALIDATE_OR_GOTO (priv->children, out); + + children = priv->children; + + AFR_SBRAIN_CHECK_FD (fd, out); + + AFR_LOCAL_ALLOC_OR_GOTO (local, out); + frame->local = local; + + op_ret = afr_local_init (local, priv, &op_errno); + if (op_ret < 0) { + op_errno = -op_ret; + goto out; + } + + local->fd = fd_ref (fd); + if (name) + local->cont.getxattr.name = gf_strdup (name); + + /* pathinfo gets handled only in getxattr(), but we need to handle + * lockinfo. + * If we are doing fgetxattr with lockinfo as the key then we + * collect information from all children. + */ + if (afr_is_special_xattr (name, &cbk, 1)) { + afr_fgetxattr_frm_all_children (this, frame, name, + fd, cbk); + return 0; + } + + local->fresh_children = afr_children_create (priv->child_count); if (!local->fresh_children) { op_errno = ENOMEM; goto out; } - read_child = afr_inode_get_read_ctx (this, loc->inode, local->fresh_children); + read_child = afr_inode_get_read_ctx (this, fd->inode, + local->fresh_children); op_ret = afr_get_call_child (this, local->child_up, read_child, local->fresh_children, &call_child, @@ -895,16 +1777,17 @@ afr_getxattr (call_frame_t *frame, xlator_t *this, goto out; } - STACK_WIND_COOKIE (frame, afr_getxattr_cbk, + STACK_WIND_COOKIE (frame, afr_fgetxattr_cbk, (void *) (long) call_child, children[call_child], - children[call_child]->fops->getxattr, - loc, name); + children[call_child]->fops->fgetxattr, + fd, name, xdata); op_ret = 0; out: if (op_ret == -1) { - AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, NULL); + AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, NULL, + NULL); } return 0; } @@ -930,7 +1813,7 @@ int32_t afr_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t 
op_errno, struct iovec *vector, int32_t count, struct iatt *buf, - struct iobref *iobref) + struct iobref *iobref, dict_t *xdata) { afr_private_t * priv = NULL; afr_local_t * local = NULL; @@ -971,13 +1854,15 @@ afr_readv_cbk (call_frame_t *frame, void *cookie, children[next_call_child], children[next_call_child]->fops->readv, local->fd, local->cont.readv.size, - local->cont.readv.offset); + local->cont.readv.offset, + local->cont.readv.flags, + NULL); } out: if (unwind) { AFR_STACK_UNWIND (readv, frame, op_ret, op_errno, - vector, count, buf, iobref); + vector, count, buf, iobref, xdata); } return 0; @@ -986,15 +1871,15 @@ out: int32_t afr_readv (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset) + fd_t *fd, size_t size, off_t offset, uint32_t flags, dict_t *xdata) { afr_private_t * priv = NULL; afr_local_t * local = NULL; xlator_t ** children = NULL; int call_child = 0; - int32_t op_ret = -1; int32_t op_errno = 0; int32_t read_child = -1; + int ret = -1; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -1004,13 +1889,14 @@ afr_readv (call_frame_t *frame, xlator_t *this, priv = this->private; children = priv->children; - ALLOC_OR_GOTO (local, afr_local_t, out); - frame->local = local; - op_ret = AFR_LOCAL_INIT (local, priv); - if (op_ret < 0) { - op_errno = -op_ret; + AFR_SBRAIN_CHECK_FD (fd, out); + + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; + + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } local->fresh_children = afr_children_create (priv->child_count); if (!local->fresh_children) { @@ -1019,13 +1905,12 @@ afr_readv (call_frame_t *frame, xlator_t *this, } read_child = afr_inode_get_read_ctx (this, fd->inode, local->fresh_children); - op_ret = afr_get_call_child (this, local->child_up, read_child, + ret = afr_get_call_child (this, local->child_up, read_child, local->fresh_children, &call_child, &local->cont.readv.last_index); - if (op_ret < 0) { - op_errno = -op_ret; 
- op_ret = -1; + if (ret < 0) { + op_errno = -ret; goto out; } @@ -1033,24 +1918,21 @@ afr_readv (call_frame_t *frame, xlator_t *this, local->cont.readv.size = size; local->cont.readv.offset = offset; + local->cont.readv.flags = flags; + + afr_open_fd_fix (fd, this); - op_ret = afr_open_fd_fix (frame, this, _gf_false); - if (op_ret) { - op_errno = -op_ret; - op_ret = -1; - goto out; - } STACK_WIND_COOKIE (frame, afr_readv_cbk, (void *) (long) call_child, children[call_child], children[call_child]->fops->readv, - fd, size, offset); + fd, size, offset, flags, xdata); - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (readv, frame, op_ret, op_errno, NULL, 0, NULL, - NULL); + if (ret < 0) { + AFR_STACK_UNWIND (readv, frame, -1, op_errno, NULL, 0, NULL, + NULL, NULL); } return 0; } diff --git a/xlators/cluster/afr/src/afr-inode-read.h b/xlators/cluster/afr/src/afr-inode-read.h index 5479cfbd5..e4091a793 100644 --- a/xlators/cluster/afr/src/afr-inode-read.h +++ b/xlators/cluster/afr/src/afr-inode-read.h @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. 
+ + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef __INODE_READ_H__ @@ -22,26 +13,30 @@ int32_t afr_access (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t mask); + loc_t *loc, int32_t mask, dict_t *xdata); int32_t afr_stat (call_frame_t *frame, xlator_t *this, - loc_t *loc); + loc_t *loc, dict_t *xdata); int32_t afr_fstat (call_frame_t *frame, xlator_t *this, - fd_t *fd); + fd_t *fd, dict_t *xdata); int32_t afr_readlink (call_frame_t *frame, xlator_t *this, - loc_t *loc, size_t size); + loc_t *loc, size_t size, dict_t *xdata); int32_t afr_readv (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset); + fd_t *fd, size_t size, off_t offset, uint32_t flags, dict_t *xdata); int32_t afr_getxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name); + loc_t *loc, const char *name, dict_t *xdata); + +int32_t +afr_fgetxattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *name, dict_t *xdata); #endif /* __INODE_READ_H__ */ diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index 3f78c7b36..c1ec69a55 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. 
- - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ @@ -48,45 +39,151 @@ #include "afr-transaction.h" #include "afr-self-heal-common.h" +void +__inode_write_fop_cbk (call_frame_t *frame, int child_index, int read_child, + xlator_t *this, int32_t *op_ret, int32_t *op_errno, + struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) +{ + afr_local_t *local = NULL; + + local = frame->local; + + if (afr_fop_failed (*op_ret, *op_errno)) { + local->child_errno[child_index] = *op_errno; + + switch (local->op) { + case GF_FOP_TRUNCATE: + case GF_FOP_FTRUNCATE: + if (*op_errno != EFBIG) + afr_transaction_fop_failed (frame, this, + child_index); + break; + default: + afr_transaction_fop_failed (frame, this, child_index); + break; + } + local->op_errno = *op_errno; + goto out; + } + + if ((local->success_count == 0) || (read_child == child_index)) { + local->op_ret = *op_ret; + if (prebuf) + local->cont.inode_wfop.prebuf = *prebuf; + if (postbuf) + local->cont.inode_wfop.postbuf = *postbuf; + } + + local->success_count++; +out: + return; +} + /* {{{ writev */ -int +void +afr_writev_copy_outvars (call_frame_t *src_frame, call_frame_t *dst_frame) +{ + afr_local_t *src_local = NULL; + afr_local_t *dst_local = NULL; + + src_local = src_frame->local; + dst_local = dst_frame->local; + + dst_local->op_ret = src_local->op_ret; + dst_local->op_errno = src_local->op_errno; + 
dst_local->cont.inode_wfop.prebuf = src_local->cont.inode_wfop.prebuf; + dst_local->cont.inode_wfop.postbuf = src_local->cont.inode_wfop.postbuf; +} + +void afr_writev_unwind (call_frame_t *frame, xlator_t *this) { afr_local_t * local = NULL; - call_frame_t *main_frame = NULL; + local = frame->local; + + AFR_STACK_UNWIND (writev, frame, + local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, + NULL); +} + +call_frame_t* +afr_transaction_detach_fop_frame (call_frame_t *frame) +{ + afr_local_t * local = NULL; + call_frame_t *fop_frame = NULL; local = frame->local; LOCK (&frame->lock); { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; + fop_frame = local->transaction.main_frame; local->transaction.main_frame = NULL; } UNLOCK (&frame->lock); - if (main_frame) { - AFR_STACK_UNWIND (writev, main_frame, - local->op_ret, local->op_errno, - &local->cont.writev.prebuf, - &local->cont.writev.postbuf); + return fop_frame; +} + +int +afr_transaction_writev_unwind (call_frame_t *frame, xlator_t *this) +{ + call_frame_t *fop_frame = NULL; + + fop_frame = afr_transaction_detach_fop_frame (frame); + + if (fop_frame) { + afr_writev_copy_outvars (frame, fop_frame); + afr_writev_unwind (fop_frame, this); } return 0; } +static void +afr_writev_handle_short_writes (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + + local = frame->local; + priv = this->private; + /* + * We already have the best case result of the writev calls staged + * as the return value. Any writev that returns some value less + * than the best case is now out of sync, so mark the fop as + * failed. Note that fops that have returned with errors have + * already been marked as failed. 
+ */ + for (i = 0; i < priv->child_count; i++) { + if ((!local->replies[i].valid) || + (local->replies[i].op_ret == -1)) + continue; + + if (local->replies[i].op_ret < local->op_ret) + afr_transaction_fop_failed(frame, this, i); + } +} int afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) + struct iatt *postbuf, dict_t *xdata) { afr_local_t * local = NULL; + afr_private_t *priv = NULL; + call_frame_t *fop_frame = NULL; int child_index = (long) cookie; int call_count = -1; int read_child = 0; + int ret = 0; + uint32_t open_fd_count = 0; + uint32_t write_is_append = 0; local = frame->local; + priv = this->private; read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); @@ -96,32 +193,81 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->read_child_returned = _gf_true; } - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.writev.prebuf = *prebuf; - local->cont.writev.postbuf = *postbuf; - } - - if (child_index == read_child) { - local->cont.writev.prebuf = *prebuf; - local->cont.writev.postbuf = *postbuf; + __inode_write_fop_cbk (frame, child_index, read_child, this, + &op_ret, &op_errno, prebuf, postbuf, + xdata); + + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + + + /* stage the best case return value for unwind */ + if ((local->success_count == 0) || (op_ret > local->op_ret)) { + local->op_ret = op_ret; + local->op_errno = op_errno; + } + + if (op_ret != -1) { + if (xdata) { + ret = dict_get_uint32 (xdata, + GLUSTERFS_OPEN_FD_COUNT, + &open_fd_count); + if ((ret == 0) && + (open_fd_count > local->open_fd_count)) { + local->open_fd_count = open_fd_count; + local->update_open_fd_count = 
_gf_true; + } + + write_is_append = 0; + ret = dict_get_uint32 (xdata, + GLUSTERFS_WRITE_IS_APPEND, + &write_is_append); + if (ret || !write_is_append) + local->append_write = _gf_false; } - } - local->op_errno = op_errno; + } } UNLOCK (&frame->lock); call_count = afr_frame_return (frame); if (call_count == 0) { - local->transaction.unwind (frame, this); - local->transaction.resume (frame, this); + if (local->update_open_fd_count) + afr_handle_open_fd_count (frame, this); + + if (!local->stable_write && !local->append_write) + /* An appended write removes the necessity to + fsync() the file. This is because self-heal + has the logic to check for larger file when + the xattrs are not reliably pointing at + a stale file. + */ + afr_fd_report_unstable_write (this, local->fd); + + afr_writev_handle_short_writes (frame, this); + if (afr_any_fops_failed (local, priv)) { + //Don't unwind until post-op is complete + local->transaction.resume (frame, this); + } else { + /* + * Generally inode-write fops do transaction.unwind then + * transaction.resume, but writev needs to make sure that + * delayed post-op frame is placed in fdctx before unwind + * happens. This prevents the race of flush doing the + * changelog wakeup first in fuse thread and then this + * writev placing its delayed post-op frame in fdctx. + * This helps flush make sure all the delayed post-ops are + * completed. 
+ */ + + fop_frame = afr_transaction_detach_fop_frame (frame); + afr_writev_copy_outvars (frame, fop_frame); + local->transaction.resume (frame, this); + afr_writev_unwind (fop_frame, this); + } } return 0; } @@ -133,6 +279,8 @@ afr_writev_wind (call_frame_t *frame, xlator_t *this) afr_private_t *priv = NULL; int i = 0; int call_count = -1; + dict_t *xdata = NULL; + GF_UNUSED int ret = 0; local = frame->local; priv = this->private; @@ -146,6 +294,28 @@ afr_writev_wind (call_frame_t *frame, xlator_t *this) } local->call_count = call_count; + local->replies = GF_CALLOC(priv->child_count, sizeof(*local->replies), + gf_afr_mt_reply_t); + if (!local->replies) { + local->op_ret = -1; + local->op_errno = ENOMEM; + local->transaction.unwind(frame, this); + local->transaction.resume(frame, this); + return 0; + } + + xdata = dict_new (); + if (xdata) { + ret = dict_set_uint32 (xdata, GLUSTERFS_OPEN_FD_COUNT, + sizeof (uint32_t)); + ret = dict_set_uint32 (xdata, GLUSTERFS_WRITE_IS_APPEND, + 0); + /* Set append_write to be true speculatively. If on any + server it turns not be true, we unset it in the + callback. 
+ */ + local->append_write = _gf_true; + } for (i = 0; i < priv->child_count; i++) { if (local->transaction.pre_op[i]) { @@ -157,13 +327,18 @@ afr_writev_wind (call_frame_t *frame, xlator_t *this) local->cont.writev.vector, local->cont.writev.count, local->cont.writev.offset, - local->cont.writev.iobref); + local->cont.writev.flags, + local->cont.writev.iobref, + xdata); if (!--call_count) break; } } + if (xdata) + dict_unref (xdata); + return 0; } @@ -203,7 +378,7 @@ afr_do_writev (call_frame_t *frame, xlator_t *this) } transaction_frame->local = local; - frame->local = NULL; + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); local->op = GF_FOP_WRITE; @@ -211,10 +386,17 @@ afr_do_writev (call_frame_t *frame, xlator_t *this) local->transaction.fop = afr_writev_wind; local->transaction.done = afr_writev_done; - local->transaction.unwind = afr_writev_unwind; + local->transaction.unwind = afr_transaction_writev_unwind; local->transaction.main_frame = frame; if (local->fd->flags & O_APPEND) { + /* + * Backend vfs ignores the 'offset' for append mode fd so + * locking just the region provided for the writev does not + * give consistency gurantee. The actual write may happen at a + * completely different range than the one provided by the + * offset, len in the fop. So lock the entire file. 
+ */ local->transaction.start = 0; local->transaction.len = 0; } else { @@ -223,156 +405,91 @@ afr_do_writev (call_frame_t *frame, xlator_t *this) local->cont.writev.count); } - afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + op_ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + if (op_ret < 0) { + op_errno = -op_ret; + goto out; + } op_ret = 0; out: - if (op_ret == -1) { + if (op_ret < 0) { if (transaction_frame) AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (writev, frame, op_ret, op_errno, NULL, NULL); + AFR_STACK_UNWIND (writev, frame, op_ret, op_errno, NULL, NULL, NULL); } return 0; } -static int -afr_prepare_loc (call_frame_t *frame, fd_t *fd) +static void +afr_trigger_open_fd_self_heal (fd_t *fd, xlator_t *this) { - afr_local_t *local = NULL; - char *name = NULL; - char *path = NULL; - int ret = 0; - - if ((!fd) || (!fd->inode)) - return -1; - - local = frame->local; - ret = inode_path (fd->inode, NULL, (char **)&path); - if (ret <= 0) { - gf_log (frame->this->name, GF_LOG_DEBUG, - "Unable to get path for gfid: %s", - uuid_utoa (fd->inode->gfid)); - return -1; - } - - if (local->loc.path) { - if (strcmp (path, local->loc.path)) - gf_log (frame->this->name, GF_LOG_DEBUG, - "overwriting old loc->path %s with %s", - local->loc.path, path); - GF_FREE ((char *)local->loc.path); - } - local->loc.path = path; - - name = strrchr (local->loc.path, '/'); - if (name) - name++; - local->loc.name = name; - - if (local->loc.inode) { - inode_unref (local->loc.inode); - } - local->loc.inode = inode_ref (fd->inode); - - if (local->loc.parent) { - inode_unref (local->loc.parent); + call_frame_t *frame = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + char *reason = NULL; + int32_t op_errno = 0; + int ret = 0; + + if (!fd || !fd->inode || uuid_is_null (fd->inode->gfid)) { + gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid args: " + "fd: %p, inode: %p", fd, + fd ? 
fd->inode : NULL); + goto out; } - local->loc.parent = inode_parent (local->loc.inode, 0, NULL); - - return 0; -} - -afr_fd_paused_call_t* -afr_paused_call_create (call_frame_t *frame) -{ - afr_local_t *local = NULL; - afr_fd_paused_call_t *paused_call = NULL; + frame = create_frame (this, this->ctx->pool); + if (!frame) + goto out; + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); local = frame->local; - GF_ASSERT (local->fop_call_continue); - - paused_call = GF_CALLOC (1, sizeof (*paused_call), - gf_afr_fd_paused_call_t); - if (paused_call) { - INIT_LIST_HEAD (&paused_call->call_list); - paused_call->frame = frame; - } - - return paused_call; -} - -static int -afr_pause_fd_fop (call_frame_t *frame, xlator_t *this, afr_fd_ctx_t *fd_ctx) -{ - afr_fd_paused_call_t *paused_call = NULL; - int ret = 0; - - paused_call = afr_paused_call_create (frame); - if (paused_call) - list_add (&paused_call->call_list, &fd_ctx->paused_calls); - else - ret = -ENOMEM; - - return ret; -} + ret = afr_local_init (local, this->private, &op_errno); + if (ret < 0) + goto out; -static void -afr_trigger_open_fd_self_heal (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - inode_t *inode = NULL; - char *reason = NULL; + local->loc.inode = inode_ref (fd->inode); + ret = loc_path (&local->loc, NULL); + if (ret < 0) + goto out; - local = frame->local; sh = &local->self_heal; - inode = local->fd->inode; - - sh->do_missing_entry_self_heal = _gf_true; - sh->do_gfid_self_heal = _gf_true; - sh->do_data_self_heal = _gf_true; + sh->do_metadata_self_heal = _gf_true; + if (fd->inode->ia_type == IA_IFREG) + sh->do_data_self_heal = _gf_true; + else if (fd->inode->ia_type == IA_IFDIR) + sh->do_entry_self_heal = _gf_true; reason = "subvolume came online"; - afr_launch_self_heal (frame, this, inode, _gf_true, inode->ia_type, - reason, NULL, NULL); + afr_launch_self_heal (frame, this, fd->inode, _gf_true, + fd->inode->ia_type, reason, NULL, NULL); + return; +out: 
+ AFR_STACK_DESTROY (frame); } -int -afr_open_fd_fix (call_frame_t *frame, xlator_t *this, gf_boolean_t pause_fop) -{ - int ret = 0; - int i = 0; - afr_fd_ctx_t *fd_ctx = NULL; - gf_boolean_t need_self_heal = _gf_false; - int *need_open = NULL; - int need_open_count = 0; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - gf_boolean_t fop_continue = _gf_true; +void +afr_open_fd_fix (fd_t *fd, xlator_t *this) +{ + int ret = 0; + int i = 0; + afr_fd_ctx_t *fd_ctx = NULL; + gf_boolean_t need_self_heal = _gf_false; + int *need_open = NULL; + size_t need_open_count = 0; + afr_private_t *priv = NULL; - local = frame->local; priv = this->private; - GF_ASSERT (local->fd); - if (pause_fop) - GF_ASSERT (local->fop_call_continue); - - ret = afr_prepare_loc (frame, local->fd); - if (ret < 0) { - //File does not exist we cant open it. - ret = 0; + if (!afr_is_fd_fixable (fd)) goto out; - } - fd_ctx = afr_fd_ctx_get (local->fd, this); - if (!fd_ctx) { - ret = -EINVAL; - goto unlock; - } + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) + goto out; - LOCK (&local->fd->lock); + LOCK (&fd->lock); { if (fd_ctx->up_count < priv->up_count) { need_self_heal = _gf_true; @@ -380,67 +497,44 @@ afr_open_fd_fix (call_frame_t *frame, xlator_t *this, gf_boolean_t pause_fop) fd_ctx->down_count = priv->down_count; } + need_open = alloca (priv->child_count * sizeof (*need_open)); for (i = 0; i < priv->child_count; i++) { - if ((fd_ctx->opened_on[i] == AFR_FD_NOT_OPENED) && - local->child_up[i]) { - fd_ctx->opened_on[i] = AFR_FD_OPENING; - if (!need_open) - need_open = GF_CALLOC (priv->child_count, - sizeof (*need_open), - gf_afr_mt_int32_t); - need_open[i] = 1; - need_open_count++; - } else if (pause_fop && local->child_up[i] && - (fd_ctx->opened_on[i] == AFR_FD_OPENING)) { - local->fop_paused = _gf_true; - } - } + need_open[i] = 0; + if (fd_ctx->opened_on[i] != AFR_FD_NOT_OPENED) + continue; - if (local->fop_paused) { - GF_ASSERT (pause_fop); - gf_log (this->name, GF_LOG_INFO, 
"Pause fd %p", - local->fd); - ret = afr_pause_fd_fop (frame, this, fd_ctx); - if (ret) - goto unlock; - fop_continue = _gf_false; + if (!priv->child_up[i]) + continue; + + fd_ctx->opened_on[i] = AFR_FD_OPENING; + + need_open[i] = 1; + need_open_count++; } } -unlock: - UNLOCK (&local->fd->lock); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Failed to fix fd for %s", - local->loc.path); - fop_continue = _gf_false; + UNLOCK (&fd->lock); + if (ret) goto out; - } if (need_self_heal) - afr_trigger_open_fd_self_heal (frame, this); + afr_trigger_open_fd_self_heal (fd, this); if (!need_open_count) goto out; - gf_log (this->name, GF_LOG_INFO, "Opening fd %p", local->fd); - afr_fix_open (frame, this, fd_ctx, need_open_count, need_open); - fop_continue = _gf_false; + afr_fix_open (this, fd, need_open_count, need_open); out: - if (need_open) - GF_FREE (need_open); - if (fop_continue && local->fop_call_continue) - local->fop_call_continue (frame, this); - return ret; + return; } int afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, int32_t count, off_t offset, - struct iobref *iobref) + uint32_t flags, struct iobref *iobref, dict_t *xdata) { afr_private_t * priv = NULL; afr_local_t * local = NULL; int ret = -1; - int op_ret = -1; int op_errno = 0; VALIDATE_OR_GOTO (frame, out); @@ -449,35 +543,41 @@ afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + if (afr_is_split_brain (this, fd->inode)) { + op_errno = EIO; goto out; } - frame->local = local; + QUORUM_CHECK(writev,out); + + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; + + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) + goto out; local->cont.writev.vector = iov_dup (vector, count); local->cont.writev.count = count; local->cont.writev.offset = offset; + local->cont.writev.flags = flags; 
local->cont.writev.iobref = iobref_ref (iobref); local->fd = fd_ref (fd); - local->fop_call_continue = afr_do_writev; - ret = afr_open_fd_fix (frame, this, _gf_true); - if (ret) { - op_errno = -ret; - goto out; - } + /* detect here, but set it in writev_wind_cbk *after* the unstable + write is performed + */ + local->stable_write = !!((fd->flags|flags)&(O_SYNC|O_DSYNC)); - op_ret = 0; + afr_open_fd_fix (fd, this); + + afr_do_writev (frame, this); + + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (writev, frame, op_ret, op_errno, NULL, NULL); - } + if (ret < 0) + AFR_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL); return 0; } @@ -506,8 +606,9 @@ afr_truncate_unwind (call_frame_t *frame, xlator_t *this) if (main_frame) { AFR_STACK_UNWIND (truncate, main_frame, local->op_ret, local->op_errno, - &local->cont.truncate.prebuf, - &local->cont.truncate.postbuf); + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, + NULL); } return 0; @@ -517,17 +618,14 @@ afr_truncate_unwind (call_frame_t *frame, xlator_t *this) int afr_truncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) + struct iatt *postbuf, dict_t *xdata) { afr_local_t * local = NULL; - afr_private_t * priv = NULL; int child_index = (long) cookie; int read_child = 0; int call_count = -1; - int need_unwind = 0; local = frame->local; - priv = this->private; read_child = afr_inode_get_read_ctx (this, local->loc.inode, NULL); @@ -537,38 +635,22 @@ afr_truncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->read_child_returned = _gf_true; } - if (afr_fop_failed (op_ret, op_errno) && op_errno != EFBIG) - afr_transaction_fop_failed (frame, this, child_index); - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.truncate.prebuf = *prebuf; - local->cont.truncate.postbuf = *postbuf; - } - - if (child_index == read_child) { - 
local->cont.truncate.prebuf = *prebuf; - local->cont.truncate.postbuf = *postbuf; - } - - local->success_count++; - - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } + if (prebuf->ia_size != postbuf->ia_size) + local->stable_write = _gf_false; } - local->op_errno = op_errno; + __inode_write_fop_cbk (frame, child_index, read_child, this, + &op_ret, &op_errno, prebuf, postbuf, + xdata); } UNLOCK (&frame->lock); - if (need_unwind) - local->transaction.unwind (frame, this); - call_count = afr_frame_return (frame); if (call_count == 0) { + if (local->stable_write && afr_txn_nothing_failed (frame, this)) + local->transaction.unwind (frame, this); + local->transaction.resume (frame, this); } @@ -596,6 +678,7 @@ afr_truncate_wind (call_frame_t *frame, xlator_t *this) } local->call_count = call_count; + local->stable_write = _gf_true; for (i = 0; i < priv->child_count; i++) { if (local->transaction.pre_op[i]) { @@ -604,7 +687,8 @@ afr_truncate_wind (call_frame_t *frame, xlator_t *this) priv->children[i], priv->children[i]->fops->truncate, &local->loc, - local->cont.truncate.offset); + local->cont.truncate.offset, + NULL); if (!--call_count) break; @@ -632,13 +716,12 @@ afr_truncate_done (call_frame_t *frame, xlator_t *this) int afr_truncate (call_frame_t *frame, xlator_t *this, - loc_t *loc, off_t offset) + loc_t *loc, off_t offset, dict_t *xdata) { afr_private_t * priv = NULL; afr_local_t * local = NULL; call_frame_t *transaction_frame = NULL; int ret = -1; - int op_ret = -1; int op_errno = 0; VALIDATE_OR_GOTO (frame, out); @@ -647,22 +730,20 @@ afr_truncate (call_frame_t *frame, xlator_t *this, priv = this->private; + QUORUM_CHECK(truncate,out); + transaction_frame = copy_frame (frame); if (!transaction_frame) { + op_errno = ENOMEM; goto out; } - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); + local = transaction_frame->local; - ret = AFR_LOCAL_INIT (local, 
priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } - - transaction_frame->local = local; - - local->op_ret = -1; local->cont.truncate.offset = offset; @@ -676,14 +757,18 @@ afr_truncate (call_frame_t *frame, xlator_t *this, local->transaction.start = offset; local->transaction.len = 0; - afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { + if (ret < 0) { if (transaction_frame) AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (truncate, frame, op_ret, op_errno, NULL, NULL); + AFR_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL, NULL); } return 0; @@ -714,8 +799,9 @@ afr_ftruncate_unwind (call_frame_t *frame, xlator_t *this) if (main_frame) { AFR_STACK_UNWIND (ftruncate, main_frame, local->op_ret, local->op_errno, - &local->cont.ftruncate.prebuf, - &local->cont.ftruncate.postbuf); + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, + NULL); } return 0; } @@ -724,17 +810,14 @@ afr_ftruncate_unwind (call_frame_t *frame, xlator_t *this) int afr_ftruncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) + struct iatt *postbuf, dict_t *xdata) { afr_local_t * local = NULL; - afr_private_t * priv = NULL; int child_index = (long) cookie; int call_count = -1; - int need_unwind = 0; int read_child = 0; local = frame->local; - priv = this->private; read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); @@ -744,38 +827,22 @@ afr_ftruncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->read_child_returned = _gf_true; } - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - if (op_ret != -1) { - if (local->success_count == 
0) { - local->op_ret = op_ret; - local->cont.ftruncate.prebuf = *prebuf; - local->cont.ftruncate.postbuf = *postbuf; - } - - if (child_index == read_child) { - local->cont.ftruncate.prebuf = *prebuf; - local->cont.ftruncate.postbuf = *postbuf; - } - - local->success_count++; - - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } + if (prebuf->ia_size != postbuf->ia_size) + local->stable_write = _gf_false; } - local->op_errno = op_errno; + __inode_write_fop_cbk (frame, child_index, read_child, this, + &op_ret, &op_errno, prebuf, postbuf, + xdata); } UNLOCK (&frame->lock); - if (need_unwind) - local->transaction.unwind (frame, this); - call_count = afr_frame_return (frame); if (call_count == 0) { + if (local->stable_write && afr_txn_nothing_failed (frame, this)) + local->transaction.unwind (frame, this); + local->transaction.resume (frame, this); } @@ -803,6 +870,7 @@ afr_ftruncate_wind (call_frame_t *frame, xlator_t *this) } local->call_count = call_count; + local->stable_write = _gf_true; for (i = 0; i < priv->child_count; i++) { if (local->transaction.pre_op[i]) { @@ -810,7 +878,9 @@ afr_ftruncate_wind (call_frame_t *frame, xlator_t *this) (void *) (long) i, priv->children[i], priv->children[i]->fops->ftruncate, - local->fd, local->cont.ftruncate.offset); + local->fd, + local->cont.ftruncate.offset, + NULL); if (!--call_count) break; @@ -865,14 +935,19 @@ afr_do_ftruncate (call_frame_t *frame, xlator_t *this) local->transaction.start = local->cont.ftruncate.offset; local->transaction.len = 0; - afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + op_ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + if (op_ret < 0) { + op_errno = -op_ret; + goto out; + } op_ret = 0; out: - if (op_ret == -1) { + if (op_ret < 0) { if (transaction_frame) AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (ftruncate, frame, op_ret, op_errno, NULL, NULL); + AFR_STACK_UNWIND (ftruncate, 
frame, op_ret, op_errno, NULL, + NULL, NULL); } return 0; @@ -881,13 +956,12 @@ out: int afr_ftruncate (call_frame_t *frame, xlator_t *this, - fd_t *fd, off_t offset) + fd_t *fd, off_t offset, dict_t *xdata) { afr_private_t * priv = NULL; afr_local_t * local = NULL; call_frame_t *transaction_frame = NULL; int ret = -1; - int op_ret = -1; int op_errno = 0; VALIDATE_OR_GOTO (frame, out); @@ -896,33 +970,33 @@ afr_ftruncate (call_frame_t *frame, xlator_t *this, priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); - ret = AFR_LOCAL_INIT (local, priv); - - if (ret < 0) { - op_errno = -ret; + if (afr_is_split_brain (this, fd->inode)) { + op_errno = EIO; goto out; } + QUORUM_CHECK(ftruncate,out); + + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; - frame->local = local; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) + goto out; local->cont.ftruncate.offset = offset; local->fd = fd_ref (fd); - local->fop_call_continue = afr_do_ftruncate; - ret = afr_open_fd_fix (frame, this, _gf_true); - if (ret) { - op_errno = -ret; - goto out; - } + afr_open_fd_fix (fd, this); - op_ret = 0; + afr_do_ftruncate (frame, this); + + ret = 0; out: - if (op_ret == -1) { + if (ret < 0) { if (transaction_frame) AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (ftruncate, frame, op_ret, op_errno, NULL, NULL); + AFR_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL, NULL); } return 0; @@ -951,8 +1025,9 @@ afr_setattr_unwind (call_frame_t *frame, xlator_t *this) if (main_frame) { AFR_STACK_UNWIND (setattr, main_frame, local->op_ret, local->op_errno, - &local->cont.setattr.preop_buf, - &local->cont.setattr.postop_buf); + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, + NULL); } return 0; @@ -962,7 +1037,7 @@ afr_setattr_unwind (call_frame_t *frame, xlator_t *this) int afr_setattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop) 
+ struct iatt *preop, struct iatt *postop, dict_t *xdata) { afr_local_t * local = NULL; afr_private_t * priv = NULL; @@ -982,29 +1057,14 @@ afr_setattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->read_child_returned = _gf_true; } - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.setattr.preop_buf = *preop; - local->cont.setattr.postop_buf = *postop; - } - - if (child_index == read_child) { - local->cont.setattr.preop_buf = *preop; - local->cont.setattr.postop_buf = *postop; - } - - local->success_count++; + __inode_write_fop_cbk (frame, child_index, read_child, this, + &op_ret, &op_errno, preop, postop, + xdata); - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } + if ((local->success_count >= priv->wait_count) + && local->read_child_returned) { + need_unwind = 1; } - local->op_errno = op_errno; } UNLOCK (&frame->lock); @@ -1050,7 +1110,8 @@ afr_setattr_wind (call_frame_t *frame, xlator_t *this) priv->children[i]->fops->setattr, &local->loc, &local->cont.setattr.in_buf, - local->cont.setattr.valid); + local->cont.setattr.valid, + NULL); if (!--call_count) break; @@ -1078,13 +1139,12 @@ afr_setattr_done (call_frame_t *frame, xlator_t *this) int afr_setattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, struct iatt *buf, int32_t valid) + loc_t *loc, struct iatt *buf, int32_t valid, dict_t *xdata) { afr_private_t * priv = NULL; afr_local_t * local = NULL; call_frame_t *transaction_frame = NULL; int ret = -1; - int op_ret = -1; int op_errno = 0; VALIDATE_OR_GOTO (frame, out); @@ -1093,22 +1153,20 @@ afr_setattr (call_frame_t *frame, xlator_t *this, priv = this->private; + QUORUM_CHECK(setattr,out); + transaction_frame = copy_frame (frame); if (!transaction_frame) { + op_errno = ENOMEM; goto out; } - ALLOC_OR_GOTO (local, afr_local_t, out); + 
AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); + local = transaction_frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } - - transaction_frame->local = local; - - local->op_ret = -1; local->cont.setattr.in_buf = *buf; local->cont.setattr.valid = valid; @@ -1123,14 +1181,18 @@ afr_setattr (call_frame_t *frame, xlator_t *this, local->transaction.start = LLONG_MAX - 1; local->transaction.len = 0; - afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { + if (ret < 0) { if (transaction_frame) AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (setattr, frame, op_ret, op_errno, NULL, NULL); + AFR_STACK_UNWIND (setattr, frame, -1, op_errno, NULL, NULL, NULL); } return 0; @@ -1157,8 +1219,9 @@ afr_fsetattr_unwind (call_frame_t *frame, xlator_t *this) if (main_frame) { AFR_STACK_UNWIND (fsetattr, main_frame, local->op_ret, local->op_errno, - &local->cont.fsetattr.preop_buf, - &local->cont.fsetattr.postop_buf); + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, + NULL); } return 0; @@ -1168,7 +1231,7 @@ afr_fsetattr_unwind (call_frame_t *frame, xlator_t *this) int afr_fsetattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop) + struct iatt *preop, struct iatt *postop, dict_t *xdata) { afr_local_t * local = NULL; afr_private_t * priv = NULL; @@ -1188,29 +1251,14 @@ afr_fsetattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->read_child_returned = _gf_true; } - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = 
op_ret; - local->cont.fsetattr.preop_buf = *preop; - local->cont.fsetattr.postop_buf = *postop; - } - - if (child_index == read_child) { - local->cont.fsetattr.preop_buf = *preop; - local->cont.fsetattr.postop_buf = *postop; - } - - local->success_count++; + __inode_write_fop_cbk (frame, child_index, read_child, this, + &op_ret, &op_errno, preop, postop, + xdata); - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } + if ((local->success_count >= priv->wait_count) + && local->read_child_returned) { + need_unwind = 1; } - local->op_errno = op_errno; } UNLOCK (&frame->lock); @@ -1256,7 +1304,8 @@ afr_fsetattr_wind (call_frame_t *frame, xlator_t *this) priv->children[i]->fops->fsetattr, local->fd, &local->cont.fsetattr.in_buf, - local->cont.fsetattr.valid); + local->cont.fsetattr.valid, + NULL); if (!--call_count) break; @@ -1283,13 +1332,12 @@ afr_fsetattr_done (call_frame_t *frame, xlator_t *this) int afr_fsetattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, struct iatt *buf, int32_t valid) + fd_t *fd, struct iatt *buf, int32_t valid, dict_t *xdata) { afr_private_t * priv = NULL; afr_local_t * local = NULL; call_frame_t *transaction_frame = NULL; int ret = -1; - int op_ret = -1; int op_errno = 0; VALIDATE_OR_GOTO (frame, out); @@ -1298,22 +1346,25 @@ afr_fsetattr (call_frame_t *frame, xlator_t *this, priv = this->private; + if (afr_is_split_brain (this, fd->inode)) { + op_errno = EIO; + goto out; + } + + QUORUM_CHECK(fsetattr,out); + transaction_frame = copy_frame (frame); if (!transaction_frame) { op_errno = ENOMEM; goto out; } - ALLOC_OR_GOTO (local, afr_local_t, out); - transaction_frame->local = local; + AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); + local = transaction_frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } - - local->op_ret = -1; local->cont.fsetattr.in_buf = *buf; 
local->cont.fsetattr.valid = valid; @@ -1324,25 +1375,24 @@ afr_fsetattr (call_frame_t *frame, xlator_t *this, local->fd = fd_ref (fd); - op_ret = afr_open_fd_fix (transaction_frame, this, _gf_false); - if (ret) { - op_errno = -op_ret; - op_ret = -1; - goto out; - } + afr_open_fd_fix (fd, this); local->transaction.main_frame = frame; local->transaction.start = LLONG_MAX - 1; local->transaction.len = 0; - afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { + if (ret < 0) { if (transaction_frame) AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (fsetattr, frame, op_ret, op_errno, NULL, NULL); + AFR_STACK_UNWIND (fsetattr, frame, -1, op_errno, NULL, NULL, NULL); } return 0; @@ -1370,38 +1420,34 @@ afr_setxattr_unwind (call_frame_t *frame, xlator_t *this) if (main_frame) { AFR_STACK_UNWIND (setxattr, main_frame, - local->op_ret, local->op_errno) - } + local->op_ret, local->op_errno, + NULL); + } return 0; } int afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int call_count = -1; - int need_unwind = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int need_unwind = 0; + int child_index = (long) cookie; local = frame->local; priv = this->private; LOCK (&frame->lock); { - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - } - local->success_count++; - - if (local->success_count == priv->child_count) { - need_unwind = 1; - } + __inode_write_fop_cbk (frame, child_index, -1, this, + &op_ret, &op_errno, NULL, NULL, + xdata); + if (local->success_count == priv->child_count) { + need_unwind = 1; } - - local->op_errno = op_errno; } 
UNLOCK (&frame->lock); @@ -1421,10 +1467,10 @@ afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int afr_setxattr_wind (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int i = 0; local = frame->local; priv = this->private; @@ -1447,7 +1493,8 @@ afr_setxattr_wind (call_frame_t *frame, xlator_t *this) priv->children[i]->fops->setxattr, &local->loc, local->cont.setxattr.dict, - local->cont.setxattr.flags); + local->cont.setxattr.flags, + NULL); if (!--call_count) break; @@ -1461,7 +1508,7 @@ afr_setxattr_wind (call_frame_t *frame, xlator_t *this) int afr_setxattr_done (call_frame_t *frame, xlator_t *this) { - afr_local_t * local = frame->local; + afr_local_t *local = frame->local; local->transaction.unwind (frame, this); @@ -1472,37 +1519,40 @@ afr_setxattr_done (call_frame_t *frame, xlator_t *this) int afr_setxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *dict, int32_t flags) + loc_t *loc, dict_t *dict, int32_t flags, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_ret = -1; - int op_errno = 0; + int ret = -1; + int op_errno = EINVAL; - VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - priv = this->private; + GF_IF_INTERNAL_XATTR_GOTO ("trusted.afr.*", dict, + op_errno, out); - ALLOC_OR_GOTO (local, afr_local_t, out); + GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.afr.*", dict, + op_errno, out); - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + QUORUM_CHECK(setxattr,out); transaction_frame = copy_frame (frame); if 
(!transaction_frame) { + op_errno = ENOMEM; goto out; } - transaction_frame->local = local; + AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); + local = transaction_frame->local; - local->op_ret = -1; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) + goto out; local->cont.setxattr.dict = dict_ref (dict); local->cont.setxattr.flags = flags; @@ -1517,14 +1567,211 @@ afr_setxattr (call_frame_t *frame, xlator_t *this, local->transaction.start = LLONG_MAX - 1; local->transaction.len = 0; - afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - op_ret = 0; + ret = 0; +out: + if (ret < 0) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); + } + + return 0; +} + +/* {{{ fsetxattr */ + + +int +afr_fsetxattr_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + AFR_STACK_UNWIND (fsetxattr, main_frame, + local->op_ret, local->op_errno, + NULL); + } + return 0; +} + + +int +afr_fsetxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int need_unwind = 0; + int child_index = (long) cookie; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + + __inode_write_fop_cbk (frame, child_index, -1, this, + &op_ret, &op_errno, NULL, NULL, + xdata); + if (local->success_count == priv->child_count) { + need_unwind = 1; + } + } + UNLOCK (&frame->lock); + + if (need_unwind) + 
local->transaction.unwind (frame, this); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_fsetxattr_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i]) { + STACK_WIND_COOKIE (frame, afr_fsetxattr_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->fsetxattr, + local->fd, + local->cont.fsetxattr.dict, + local->cont.fsetxattr.flags, + NULL); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_fsetxattr_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + +int +afr_fsetxattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, dict_t *dict, int32_t flags, dict_t *xdata) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = EINVAL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + GF_IF_INTERNAL_XATTR_GOTO ("trusted.afr.*", dict, + op_errno, out); + + GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.afr.*", dict, + op_errno, out); + + priv = this->private; + + if (afr_is_split_brain (this, fd->inode)) { + op_errno = EIO; + goto out; + } + + QUORUM_CHECK(fsetxattr,out); + + AFR_LOCAL_ALLOC_OR_GOTO (local, out); + + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) + goto out; + + transaction_frame = copy_frame (frame); + if 
(!transaction_frame) { + goto out; + } + + transaction_frame->local = local; + + local->op_ret = -1; + + local->cont.fsetxattr.dict = dict_ref (dict); + local->cont.fsetxattr.flags = flags; + + local->transaction.fop = afr_fsetxattr_wind; + local->transaction.done = afr_fsetxattr_done; + local->transaction.unwind = afr_fsetxattr_unwind; + + local->fd = fd_ref (fd); + + local->transaction.main_frame = frame; + local->transaction.start = LLONG_MAX - 1; + local->transaction.len = 0; + + ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + ret = 0; out: - if (op_ret == -1) { + if (ret < 0) { if (transaction_frame) AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (setxattr, frame, op_ret, op_errno); + AFR_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL); } return 0; @@ -1532,6 +1779,7 @@ out: /* }}} */ + /* {{{ removexattr */ @@ -1553,38 +1801,34 @@ afr_removexattr_unwind (call_frame_t *frame, xlator_t *this) if (main_frame) { AFR_STACK_UNWIND (removexattr, main_frame, - local->op_ret, local->op_errno) - } + local->op_ret, local->op_errno, + NULL); + } return 0; } int afr_removexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int call_count = -1; - int need_unwind = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int need_unwind = 0; + int child_index = (long) cookie; local = frame->local; priv = this->private; LOCK (&frame->lock); { - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - } - local->success_count++; - - if (local->success_count == priv->wait_count) { - need_unwind = 1; - } + __inode_write_fop_cbk (frame, child_index, -1, this, + &op_ret, &op_errno, NULL, NULL, + xdata); + if (local->success_count == priv->wait_count) { + 
need_unwind = 1; } - - local->op_errno = op_errno; } UNLOCK (&frame->lock); @@ -1629,7 +1873,8 @@ afr_removexattr_wind (call_frame_t *frame, xlator_t *this) priv->children[i], priv->children[i]->fops->removexattr, &local->loc, - local->cont.removexattr.name); + local->cont.removexattr.name, + NULL); if (!--call_count) break; @@ -1655,7 +1900,192 @@ afr_removexattr_done (call_frame_t *frame, xlator_t *this) int afr_removexattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name) + loc_t *loc, const char *name, dict_t *xdata) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (this, out); + + GF_IF_NATIVE_XATTR_GOTO ("trusted.afr.*", + name, op_errno, out); + + GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.afr.*", + name, op_errno, out); + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this->private, out); + VALIDATE_OR_GOTO (loc, out); + + priv = this->private; + + QUORUM_CHECK(removexattr,out); + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + op_errno = ENOMEM; + goto out; + } + + AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); + local = transaction_frame->local; + + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) + goto out; + + local->cont.removexattr.name = gf_strdup (name); + + local->transaction.fop = afr_removexattr_wind; + local->transaction.done = afr_removexattr_done; + local->transaction.unwind = afr_removexattr_unwind; + + loc_copy (&local->loc, loc); + + local->transaction.main_frame = frame; + local->transaction.start = LLONG_MAX - 1; + local->transaction.len = 0; + + ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + ret = 0; +out: + if (ret < 0) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (removexattr, frame, -1, op_errno, NULL); + } + + return 0; +} + 
+/* ffremovexattr */ +int +afr_fremovexattr_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + AFR_STACK_UNWIND (fremovexattr, main_frame, + local->op_ret, local->op_errno, + NULL); + } + return 0; +} + + +int +afr_fremovexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + int call_count = -1; + int need_unwind = 0; + int child_index = (long) cookie; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + __inode_write_fop_cbk (frame, child_index, -1, this, + &op_ret, &op_errno, NULL, NULL, + xdata); + + if (local->success_count == priv->wait_count) { + need_unwind = 1; + } + } + UNLOCK (&frame->lock); + + if (need_unwind) + local->transaction.unwind (frame, this); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + + +int32_t +afr_fremovexattr_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i]) { + STACK_WIND_COOKIE (frame, afr_fremovexattr_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->fremovexattr, + local->fd, + local->cont.removexattr.name, + NULL); + + if (!--call_count) 
+ break; + } + } + + return 0; +} + + +int +afr_fremovexattr_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_fremovexattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *name, dict_t *xdata) { afr_private_t * priv = NULL; afr_local_t * local = NULL; @@ -1664,21 +2094,33 @@ afr_removexattr (call_frame_t *frame, xlator_t *this, int op_ret = -1; int op_errno = 0; - VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); + + GF_IF_NATIVE_XATTR_GOTO ("trusted.afr.*", + name, op_errno, out); + + GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.afr.*", + name, op_errno, out); + + VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (loc, out); priv = this->private; + if (afr_is_split_brain (this, fd->inode)) { + op_errno = EIO; + goto out; + } + + QUORUM_CHECK(fremovexattr, out); transaction_frame = copy_frame (frame); if (!transaction_frame) { goto out; } - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (local, out); - ret = AFR_LOCAL_INIT (local, priv); + ret = afr_local_init (local, priv, &op_errno); if (ret < 0) { op_errno = -ret; goto out; @@ -1690,25 +2132,730 @@ afr_removexattr (call_frame_t *frame, xlator_t *this, local->cont.removexattr.name = gf_strdup (name); - local->transaction.fop = afr_removexattr_wind; - local->transaction.done = afr_removexattr_done; - local->transaction.unwind = afr_removexattr_unwind; + local->transaction.fop = afr_fremovexattr_wind; + local->transaction.done = afr_fremovexattr_done; + local->transaction.unwind = afr_fremovexattr_unwind; - loc_copy (&local->loc, loc); + local->fd = fd_ref (fd); local->transaction.main_frame = frame; local->transaction.start = LLONG_MAX - 1; local->transaction.len = 0; - afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + op_ret = afr_transaction (transaction_frame, this, 
AFR_METADATA_TRANSACTION); + if (op_ret < 0) { + op_errno = -op_ret; + goto out; + } + + op_ret = 0; +out: + if (op_ret < 0) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (fremovexattr, frame, op_ret, op_errno, NULL); + } + + return 0; +} + +static int +afr_fallocate_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + AFR_STACK_UNWIND (fallocate, main_frame, local->op_ret, + local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, + NULL); + } + return 0; +} + +static int +afr_fallocate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + int child_index = (long) cookie; + int call_count = -1; + int need_unwind = 0; + int read_child = 0; + + local = frame->local; + priv = this->private; + + read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); + + LOCK (&frame->lock); + { + if (child_index == read_child) { + local->read_child_returned = _gf_true; + } + + __inode_write_fop_cbk (frame, child_index, read_child, this, + &op_ret, &op_errno, prebuf, postbuf, + xdata); + + if ((local->success_count >= priv->wait_count) + && local->read_child_returned) { + need_unwind = 1; + } + } + UNLOCK (&frame->lock); + + if (need_unwind) + local->transaction.unwind (frame, this); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + +static int +afr_fallocate_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int 
call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i]) { + STACK_WIND_COOKIE (frame, afr_fallocate_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->fallocate, + local->fd, + local->cont.fallocate.mode, + local->cont.fallocate.offset, + local->cont.fallocate.len, + NULL); + + if (!--call_count) + break; + } + } + + return 0; +} + +static int +afr_fallocate_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + +static int +afr_do_fallocate (call_frame_t *frame, xlator_t *this) +{ + call_frame_t * transaction_frame = NULL; + afr_local_t * local = NULL; + int op_ret = -1; + int op_errno = 0; + + local = frame->local; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + goto out; + } + + transaction_frame->local = local; + frame->local = NULL; + + local->op = GF_FOP_FALLOCATE; + + local->transaction.fop = afr_fallocate_wind; + local->transaction.done = afr_fallocate_done; + local->transaction.unwind = afr_fallocate_unwind; + + local->transaction.main_frame = frame; + + local->transaction.start = local->cont.fallocate.offset; + local->transaction.len = 0; + + /* fallocate can modify the file size */ + op_ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + if (op_ret < 0) { + op_errno = -op_ret; + goto out; + } op_ret = 0; out: - if (op_ret == -1) { + if (op_ret < 0) { if (transaction_frame) AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (removexattr, frame, op_ret, op_errno); + AFR_STACK_UNWIND (fallocate, frame, op_ret, 
op_errno, NULL, + NULL, NULL); } return 0; } + +int +afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + if (afr_is_split_brain (this, fd->inode)) { + op_errno = EIO; + goto out; + } + QUORUM_CHECK(fallocate,out); + + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; + + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) + goto out; + + local->cont.fallocate.mode = mode; + local->cont.fallocate.offset = offset; + local->cont.fallocate.len = len; + + local->fd = fd_ref (fd); + + afr_open_fd_fix (fd, this); + + afr_do_fallocate (frame, this); + + ret = 0; +out: + if (ret < 0) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL); + } + + return 0; +} + +/* }}} */ + +/* {{{ discard */ + +static int +afr_discard_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + AFR_STACK_UNWIND (discard, main_frame, local->op_ret, + local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, + NULL); + } + return 0; +} + +static int +afr_discard_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + int child_index = (long) cookie; + int call_count = -1; 
+ int need_unwind = 0; + int read_child = 0; + + local = frame->local; + priv = this->private; + + read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); + + LOCK (&frame->lock); + { + if (child_index == read_child) { + local->read_child_returned = _gf_true; + } + + __inode_write_fop_cbk (frame, child_index, read_child, this, + &op_ret, &op_errno, prebuf, postbuf, + xdata); + + if ((local->success_count >= priv->wait_count) + && local->read_child_returned) { + need_unwind = 1; + } + } + UNLOCK (&frame->lock); + + if (need_unwind) + local->transaction.unwind (frame, this); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + +static int +afr_discard_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i]) { + STACK_WIND_COOKIE (frame, afr_discard_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->discard, + local->fd, + local->cont.discard.offset, + local->cont.discard.len, + NULL); + + if (!--call_count) + break; + } + } + + return 0; +} + +static int +afr_discard_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + +static int +afr_do_discard (call_frame_t *frame, xlator_t *this) +{ + call_frame_t * transaction_frame = NULL; + afr_local_t * local = NULL; + int op_ret = -1; + int op_errno = 0; + + local = frame->local; + + transaction_frame = copy_frame (frame); + if 
(!transaction_frame) { + goto out; + } + + transaction_frame->local = local; + frame->local = NULL; + + local->op = GF_FOP_DISCARD; + + local->transaction.fop = afr_discard_wind; + local->transaction.done = afr_discard_done; + local->transaction.unwind = afr_discard_unwind; + + local->transaction.main_frame = frame; + + local->transaction.start = local->cont.discard.offset; + local->transaction.len = 0; + + op_ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + if (op_ret < 0) { + op_errno = -op_ret; + goto out; + } + + op_ret = 0; +out: + if (op_ret < 0) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (discard, frame, op_ret, op_errno, NULL, + NULL, NULL); + } + + return 0; +} + +int +afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + if (afr_is_split_brain (this, fd->inode)) { + op_errno = EIO; + goto out; + } + QUORUM_CHECK(discard, out); + + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; + + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) + goto out; + + local->cont.discard.offset = offset; + local->cont.discard.len = len; + + local->fd = fd_ref (fd); + + afr_open_fd_fix (fd, this); + + afr_do_discard(frame, this); + + ret = 0; +out: + if (ret < 0) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL); + } + + return 0; +} + + +/* {{{ zerofill */ + +static int +afr_zerofill_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if 
(local->transaction.main_frame) { + main_frame = local->transaction.main_frame; + } + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + AFR_STACK_UNWIND (zerofill, main_frame, local->op_ret, + local->op_errno, + &local->cont.zerofill.prebuf, + &local->cont.zerofill.postbuf, + NULL); + } + return 0; +} + +static int +afr_zerofill_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int child_index = (long) cookie; + int call_count = -1; + int need_unwind = 0; + int read_child = 0; + + local = frame->local; + priv = this->private; + + read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); + + LOCK (&frame->lock); + { + if (child_index == read_child) { + local->read_child_returned = _gf_true; + } + + if (afr_fop_failed (op_ret, op_errno)) { + afr_transaction_fop_failed (frame, this, child_index); + } + + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + local->cont.zerofill.prebuf = *prebuf; + local->cont.zerofill.postbuf = *postbuf; + } + + if (child_index == read_child) { + local->cont.zerofill.prebuf = *prebuf; + local->cont.zerofill.postbuf = *postbuf; + } + + local->success_count++; + + if ((local->success_count >= priv->wait_count) + && local->read_child_returned) { + need_unwind = 1; + } + } + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + if (need_unwind) { + local->transaction.unwind (frame, this); + } + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + +static int +afr_zerofill_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_pre_op_done_children_count 
(local->transaction.pre_op, + priv->child_count); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i]) { + STACK_WIND_COOKIE (frame, afr_zerofill_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->zerofill, + local->fd, + local->cont.zerofill.offset, + local->cont.zerofill.len, + NULL); + + if (!--call_count) + break; + } + } + + return 0; +} + +static int +afr_zerofill_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + +static int +afr_do_zerofill(call_frame_t *frame, xlator_t *this) +{ + call_frame_t *transaction_frame = NULL; + afr_local_t *local = NULL; + int op_ret = -1; + int op_errno = 0; + + local = frame->local; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + goto out; + } + + transaction_frame->local = local; + frame->local = NULL; + + local->op = GF_FOP_ZEROFILL; + + local->transaction.fop = afr_zerofill_wind; + local->transaction.done = afr_zerofill_done; + local->transaction.unwind = afr_zerofill_unwind; + + local->transaction.main_frame = frame; + + local->transaction.start = local->cont.zerofill.offset; + local->transaction.len = 0; + + op_ret = afr_transaction (transaction_frame, this, + AFR_DATA_TRANSACTION); + if (op_ret < 0) { + op_errno = -op_ret; + goto out; + } + + op_ret = 0; +out: + if (op_ret < 0) { + if (transaction_frame) { + AFR_STACK_DESTROY (transaction_frame); + } + AFR_STACK_UNWIND (zerofill, frame, op_ret, op_errno, NULL, + NULL, NULL); + } + + return 0; +} + +int +afr_zerofill (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + 
int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + if (afr_is_split_brain (this, fd->inode)) { + op_errno = EIO; + goto out; + } + QUORUM_CHECK(zerofill, out); + + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; + + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) { + goto out; + } + local->cont.zerofill.offset = offset; + local->cont.zerofill.len = len; + + local->fd = fd_ref (fd); + + afr_open_fd_fix (fd, this); + + afr_do_zerofill(frame, this); + + ret = 0; +out: + if (ret < 0) { + if (transaction_frame) { + AFR_STACK_DESTROY (transaction_frame); + } + AFR_STACK_UNWIND (zerofill, frame, -1, op_errno, NULL, + NULL, NULL); + } + + return 0; +} + +/* }}} */ + + diff --git a/xlators/cluster/afr/src/afr-inode-write.h b/xlators/cluster/afr/src/afr-inode-write.h index f9aa7bd36..8e93ca44a 100644 --- a/xlators/cluster/afr/src/afr-inode-write.h +++ b/xlators/cluster/afr/src/afr-inode-write.h @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. 
+ + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef __INODE_WRITE_H__ @@ -22,51 +13,70 @@ int32_t afr_chmod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode); + loc_t *loc, mode_t mode, dict_t *xdata); int32_t afr_chown (call_frame_t *frame, xlator_t *this, - loc_t *loc, uid_t uid, gid_t gid); + loc_t *loc, uid_t uid, gid_t gid, dict_t *xdata); int afr_fchown (call_frame_t *frame, xlator_t *this, - fd_t *fd, uid_t uid, gid_t gid); + fd_t *fd, uid_t uid, gid_t gid, dict_t *xdata); int32_t afr_fchmod (call_frame_t *frame, xlator_t *this, - fd_t *fd, mode_t mode); + fd_t *fd, mode_t mode, dict_t *xdata); int32_t -afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, +afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, int32_t count, off_t offset, - struct iobref *iobref); + uint32_t flags, struct iobref *iobref, dict_t *xdata); int32_t afr_truncate (call_frame_t *frame, xlator_t *this, - loc_t *loc, off_t offset); + loc_t *loc, off_t offset, dict_t *xdata); int32_t afr_ftruncate (call_frame_t *frame, xlator_t *this, - fd_t *fd, off_t offset); + fd_t *fd, off_t offset, dict_t *xdata); int32_t afr_utimens (call_frame_t *frame, xlator_t *this, - loc_t *loc, struct timespec tv[2]); + loc_t *loc, struct timespec tv[2], dict_t *xdata); int afr_setattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, struct iatt *buf, int32_t valid); + loc_t *loc, struct iatt *buf, int32_t valid, dict_t *xdata); int afr_fsetattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, struct iatt *buf, int32_t valid); + fd_t *fd, struct iatt *buf, int32_t valid, dict_t *xdata); int32_t afr_setxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *dict, int32_t flags); + loc_t *loc, dict_t *dict, int32_t flags, dict_t 
*xdata); + +int32_t +afr_fsetxattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, dict_t *dict, int32_t flags, dict_t *xdata); int32_t afr_removexattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name); + loc_t *loc, const char *name, dict_t *xdata); + +int32_t +afr_fremovexattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *name, dict_t *xdata); +int +afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata); + +int +afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata); + +int +afr_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata); #endif /* __INODE_WRITE_H__ */ diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c index 33ddb9db1..060d78f35 100644 --- a/xlators/cluster/afr/src/afr-lk-common.c +++ b/xlators/cluster/afr/src/afr-lk-common.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. 
+ This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #include "dict.h" @@ -31,8 +22,69 @@ #define LOCKED_YES 0x1 /* for DATA, METADATA, ENTRY and higher_path */ #define LOCKED_LOWER 0x2 /* for lower path */ +#define AFR_TRACE_INODELK_IN(frame, this, params ...) \ + do { \ + afr_private_t *_priv = this->private; \ + if (!_priv->inodelk_trace) \ + break; \ + afr_trace_inodelk_in (frame, this, params); \ + } while (0); + +#define AFR_TRACE_INODELK_OUT(frame, this, params ...) \ + do { \ + afr_private_t *_priv = this->private; \ + if (!_priv->inodelk_trace) \ + break; \ + afr_trace_inodelk_out (frame, this, params); \ + } while (0); + +#define AFR_TRACE_ENTRYLK_IN(frame, this, params ...) \ + do { \ + afr_private_t *_priv = this->private; \ + if (!_priv->entrylk_trace) \ + break; \ + afr_trace_entrylk_in (frame, this, params); \ + } while (0); + +#define AFR_TRACE_ENTRYLK_OUT(frame, this, params ...) 
\ + do { \ + afr_private_t *_priv = this->private; \ + if (!_priv->entrylk_trace) \ + break; \ + afr_trace_entrylk_out (frame, this, params); \ + } while (0); + int -afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index); +afr_entry_lockee_cmp (const void *l1, const void *l2) +{ + const afr_entry_lockee_t *r1 = l1; + const afr_entry_lockee_t *r2 = l2; + int ret = 0; + uuid_t gfid1 = {0}; + uuid_t gfid2 = {0}; + + loc_gfid ((loc_t*)&r1->loc, gfid1); + loc_gfid ((loc_t*)&r2->loc, gfid2); + ret = uuid_compare (gfid1, gfid2); + /*Entrylks with NULL basename are the 'smallest'*/ + if (ret == 0) { + if (!r1->basename) + return -1; + if (!r2->basename) + return 1; + ret = strcmp (r1->basename, r2->basename); + } + + if (ret <= 0) + return -1; + else + return 1; +} + +int afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index); + +static int +afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this); static uint64_t afr_lock_number = 1; @@ -57,12 +109,13 @@ afr_set_lock_number (call_frame_t *frame, xlator_t *this) } void -afr_set_lk_owner (call_frame_t *frame, xlator_t *this) +afr_set_lk_owner (call_frame_t *frame, xlator_t *this, void *lk_owner) { gf_log (this->name, GF_LOG_TRACE, "Setting lk-owner=%llu", - (unsigned long long) (unsigned long)frame->root); - frame->root->lk_owner = (uint64_t) (unsigned long)frame->root; + (unsigned long long) (unsigned long)lk_owner); + + set_lk_owner_from_ptr (&frame->root->lk_owner, lk_owner); } static int @@ -98,16 +151,9 @@ internal_lock_count (call_frame_t *frame, xlator_t *this) local = frame->local; priv = this->private; - if (local->fd) { - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] && local->fd_open_on[i]) - ++call_count; - } - } else { - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) - ++call_count; - } + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) + ++call_count; } return call_count; @@ -115,7 +161,7 @@ internal_lock_count 
(call_frame_t *frame, xlator_t *this) static void afr_print_inodelk (char *str, int size, int cmd, - struct gf_flock *flock, uint64_t owner) + struct gf_flock *flock, gf_lkowner_t *owner) { char *cmd_str = NULL; char *type_str = NULL; @@ -163,11 +209,11 @@ afr_print_inodelk (char *str, int size, int cmd, } snprintf (str, size, "lock=INODELK, cmd=%s, type=%s, " - "start=%llu, len=%llu, pid=%llu, lk-owner=%llu", + "start=%llu, len=%llu, pid=%llu, lk-owner=%s", cmd_str, type_str, (unsigned long long) flock->l_start, (unsigned long long) flock->l_len, (unsigned long long) flock->l_pid, - (unsigned long long) owner); + lkowner_utoa (owner)); } @@ -183,11 +229,11 @@ afr_print_lockee (char *str, int size, loc_t *loc, fd_t *fd, void afr_print_entrylk (char *str, int size, const char *basename, - uint64_t owner) + gf_lkowner_t *owner) { - snprintf (str, size, "Basename=%s, lk-owner=%llu", + snprintf (str, size, "Basename=%s, lk-owner=%s", basename ? basename : "<nul>", - (unsigned long long)owner); + lkowner_utoa (owner)); } static void @@ -241,27 +287,20 @@ afr_set_lock_call_type (afr_lock_call_type_t lock_call_type, } static void -afr_trace_inodelk_out (call_frame_t *frame, afr_lock_call_type_t lock_call_type, +afr_trace_inodelk_out (call_frame_t *frame, xlator_t *this, + afr_lock_call_type_t lock_call_type, afr_lock_op_type_t lk_op_type, struct gf_flock *flock, int op_ret, int op_errno, int32_t child_index) { - xlator_t *this = NULL; afr_internal_lock_t *int_lock = NULL; afr_local_t *local = NULL; - afr_private_t *priv = NULL; char lockee[256]; char lock_call_type_str[256]; char verdict[16]; - this = THIS; local = frame->local; int_lock = &local->internal_lock; - priv = this->private; - - if (!priv->inodelk_trace) { - return; - } afr_print_lockee (lockee, 256, &local->loc, local->fd, child_index); @@ -270,39 +309,31 @@ afr_trace_inodelk_out (call_frame_t *frame, afr_lock_call_type_t lock_call_type, afr_print_verdict (op_ret, op_errno, verdict); gf_log (this->name, 
GF_LOG_INFO, - "[%s %s] [%s] Lockee={%s} Number={%llu}", + "[%s %s] [%s] lk-owner=%s Lockee={%s} Number={%llu}", lock_call_type_str, lk_op_type == AFR_LOCK_OP ? "LOCK REPLY" : "UNLOCK REPLY", - verdict, - lockee, + verdict, lkowner_utoa (&frame->root->lk_owner), lockee, (unsigned long long) int_lock->lock_number); } static void -afr_trace_inodelk_in (call_frame_t *frame, afr_lock_call_type_t lock_call_type, +afr_trace_inodelk_in (call_frame_t *frame, xlator_t *this, + afr_lock_call_type_t lock_call_type, afr_lock_op_type_t lk_op_type, struct gf_flock *flock, int32_t cmd, int32_t child_index) { - xlator_t *this = NULL; afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; - afr_private_t *priv = NULL; char lock[256]; char lockee[256]; char lock_call_type_str[256]; - this = THIS; local = frame->local; int_lock = &local->internal_lock; - priv = this->private; - if (!priv->inodelk_trace) { - return; - } - - afr_print_inodelk (lock, 256, cmd, flock, frame->root->lk_owner); + afr_print_inodelk (lock, 256, cmd, flock, &frame->root->lk_owner); afr_print_lockee (lockee, 256, &local->loc, local->fd, child_index); afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock); @@ -317,20 +348,21 @@ afr_trace_inodelk_in (call_frame_t *frame, afr_lock_call_type_t lock_call_type, } static void -afr_trace_entrylk_in (call_frame_t *frame, afr_lock_call_type_t lock_call_type, +afr_trace_entrylk_in (call_frame_t *frame, xlator_t *this, + afr_lock_call_type_t lock_call_type, afr_lock_op_type_t lk_op_type, const char *basename, - int32_t child_index) + int32_t cookie) { - xlator_t *this = NULL; afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; afr_private_t *priv = NULL; + int child_index = 0; + int lockee_no = 0; char lock[256]; char lockee[256]; char lock_call_type_str[256]; - this = THIS; local = frame->local; int_lock = &local->internal_lock; priv = this->private; @@ -338,36 +370,41 @@ afr_trace_entrylk_in (call_frame_t *frame, afr_lock_call_type_t 
lock_call_type, if (!priv->entrylk_trace) { return; } + lockee_no = cookie / priv->child_count; + child_index = cookie % priv->child_count; - afr_print_entrylk (lock, 256, basename, frame->root->lk_owner); - afr_print_lockee (lockee, 256, &local->loc, local->fd, child_index); + afr_print_entrylk (lock, 256, basename, &frame->root->lk_owner); + afr_print_lockee (lockee, 256, &int_lock->lockee[lockee_no].loc, local->fd, + child_index); afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock); gf_log (this->name, GF_LOG_INFO, - "[%s %s] Lock={%s} Lockee={%s} Number={%llu}", + "[%s %s] Lock={%s} Lockee={%s} Number={%llu}, Cookie={%d}", lock_call_type_str, lk_op_type == AFR_LOCK_OP ? "LOCK REQUEST" : "UNLOCK REQUEST", lock, lockee, - (unsigned long long) int_lock->lock_number); + (unsigned long long) int_lock->lock_number, + cookie); } static void -afr_trace_entrylk_out (call_frame_t *frame, afr_lock_call_type_t lock_call_type, - afr_lock_op_type_t lk_op_type, const char *basename, int op_ret, - int op_errno, int32_t child_index) +afr_trace_entrylk_out (call_frame_t *frame, xlator_t *this, + afr_lock_call_type_t lock_call_type, + afr_lock_op_type_t lk_op_type, const char *basename, + int op_ret, int op_errno, int32_t cookie) { - xlator_t *this = NULL; afr_internal_lock_t *int_lock = NULL; afr_local_t *local = NULL; afr_private_t *priv = NULL; + int lockee_no = 0; + int child_index = 0; char lock[256]; char lockee[256]; char lock_call_type_str[256]; char verdict[16]; - this = THIS; local = frame->local; int_lock = &local->internal_lock; priv = this->private; @@ -375,20 +412,25 @@ afr_trace_entrylk_out (call_frame_t *frame, afr_lock_call_type_t lock_call_type, if (!priv->entrylk_trace) { return; } + lockee_no = cookie / priv->child_count; + child_index = cookie % priv->child_count; - afr_print_lockee (lockee, 256, &local->loc, local->fd, child_index); + afr_print_entrylk (lock, 256, basename, &frame->root->lk_owner); + afr_print_lockee (lockee, 256, 
&int_lock->lockee[lockee_no].loc, local->fd, + child_index); afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock); afr_print_verdict (op_ret, op_errno, verdict); gf_log (this->name, GF_LOG_INFO, - "[%s %s] [%s] Lock={%s} Lockee={%s} Number={%llu}", + "[%s %s] [%s] Lock={%s} Lockee={%s} Number={%llu} Cookie={%d}", lock_call_type_str, lk_op_type == AFR_LOCK_OP ? "LOCK REPLY" : "UNLOCK REPLY", verdict, lock, lockee, - (unsigned long long) int_lock->lock_number); + (unsigned long long) int_lock->lock_number, + cookie); } @@ -441,6 +483,47 @@ is_afr_lock_transaction (afr_local_t *local) return ret; } +int +afr_init_entry_lockee (afr_entry_lockee_t *lockee, afr_local_t *local, + loc_t *loc, char *basename, int child_count) +{ + int ret = -1; + + loc_copy (&lockee->loc, loc); + lockee->basename = (basename)? gf_strdup (basename): NULL; + if (basename && !lockee->basename) + goto out; + + lockee->locked_count = 0; + lockee->locked_nodes = GF_CALLOC (child_count, + sizeof (*lockee->locked_nodes), + gf_afr_mt_afr_node_character); + + if (!lockee->locked_nodes) + goto out; + + ret = 0; +out: + return ret; + +} + +void +afr_entry_lockee_cleanup (afr_internal_lock_t *int_lock) +{ + int i = 0; + + for (i = 0; i < int_lock->lockee_count; i++) { + loc_wipe (&int_lock->lockee[i].loc); + if (int_lock->lockee[i].basename) + GF_FREE (int_lock->lockee[i].basename); + if (int_lock->lockee[i].locked_nodes) + GF_FREE (int_lock->lockee[i].locked_nodes); + } + + return; +} + static int initialize_entrylk_variables (call_frame_t *frame, xlator_t *this) { @@ -458,8 +541,13 @@ initialize_entrylk_variables (call_frame_t *frame, xlator_t *this) int_lock->lock_op_ret = -1; int_lock->lock_op_errno = 0; - for (i = 0; i < priv->child_count; i++) { - int_lock->entry_locked_nodes[i] = 0; + for (i = 0; i < AFR_LOCKEE_COUNT_MAX; i++) { + if (!int_lock->lockee[i].locked_nodes) + break; + int_lock->lockee[i].locked_count = 0; + memset (int_lock->lockee[i].locked_nodes, 0, + sizeof 
(*int_lock->lockee[i].locked_nodes) * + priv->child_count); } return 0; @@ -471,19 +559,23 @@ initialize_inodelk_variables (call_frame_t *frame, xlator_t *this) afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; afr_private_t *priv = NULL; - int i = 0; + afr_inodelk_t *inodelk = NULL; priv = this->private; local = frame->local; int_lock = &local->internal_lock; - int_lock->inodelk_lock_count = 0; - int_lock->lock_op_ret = -1; - int_lock->lock_op_errno = 0; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); - for (i = 0; i < priv->child_count; i++) { - int_lock->inode_locked_nodes[i] = 0; - } + inodelk->lock_count = 0; + int_lock->lk_attempted_count = 0; + int_lock->lock_op_ret = -1; + int_lock->lock_op_errno = 0; + + memset (inodelk->locked_nodes, 0, + sizeof (*inodelk->locked_nodes) * priv->child_count); + memset (int_lock->locked_nodes, 0, + sizeof (*int_lock->locked_nodes) * priv->child_count); return 0; } @@ -493,7 +585,7 @@ lower_path (loc_t *l1, const char *b1, loc_t *l2, const char *b2) { int ret = 0; - ret = strcmp (l1->path, l2->path); + ret = uuid_compare (l1->inode->gfid, l2->inode->gfid); if (ret == 0) ret = strcmp (b1, b2); @@ -505,6 +597,18 @@ lower_path (loc_t *l1, const char *b1, loc_t *l2, const char *b2) } int +afr_lockee_locked_nodes_count (afr_internal_lock_t *int_lock) +{ + int call_count = 0; + int i = 0; + + for (i = 0; i < int_lock->lockee_count; i++) + call_count += int_lock->lockee[i].locked_count; + + return call_count; +} + +int afr_locked_nodes_count (unsigned char *locked_nodes, int child_count) { @@ -522,7 +626,7 @@ afr_locked_nodes_count (unsigned char *locked_nodes, int child_count) /* FIXME: What if UNLOCK fails */ static int32_t afr_unlock_common_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; @@ -548,33 +652,37 @@ afr_unlock_common_cbk (call_frame_t 
*frame, void *cookie, xlator_t *this, static int32_t afr_unlock_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; int32_t child_index = (long)cookie; + afr_private_t *priv = NULL; local = frame->local; int_lock = &local->internal_lock; - afr_trace_inodelk_out (frame, AFR_INODELK_TRANSACTION, + AFR_TRACE_INODELK_OUT (frame, this, AFR_INODELK_TRANSACTION, AFR_UNLOCK_OP, NULL, op_ret, op_errno, child_index); + priv = this->private; + if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) { - gf_log (this->name, GF_LOG_ERROR, - "%s: unlock failed on %d, reason: %s", - local->loc.path, child_index, strerror (op_errno)); + gf_log (this->name, GF_LOG_INFO, "%s: unlock failed on subvolume %s " + "with lock owner %s", local->loc.path, + priv->children[child_index]->name, + lkowner_utoa (&frame->root->lk_owner)); } - int_lock->inode_locked_nodes[child_index] &= LOCKED_NO; - - if (op_ret == 1) { + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + inodelk->locked_nodes[child_index] &= LOCKED_NO; + if (local->transaction.eager_lock) local->transaction.eager_lock[child_index] = 0; - } - afr_unlock_common_cbk (frame, cookie, this, op_ret, op_errno); + afr_unlock_common_cbk (frame, cookie, this, op_ret, op_errno, xdata); return 0; @@ -584,9 +692,12 @@ static int afr_unlock_inodelk (call_frame_t *frame, xlator_t *this) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; afr_private_t *priv = NULL; struct gf_flock flock = {0,}; + struct gf_flock full_flock = {0,}; + struct gf_flock *flock_use = NULL; int call_count = 0; int i = 0; int piggyback = 0; @@ -597,15 +708,14 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this) int_lock = &local->internal_lock; priv = this->private; - flock.l_start = 
int_lock->lk_flock.l_start; - flock.l_len = int_lock->lk_flock.l_len; - flock.l_type = F_UNLCK; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); - gf_log (this->name, GF_LOG_DEBUG, "attempting data unlock range %"PRIu64 - " %"PRIu64" by %"PRIu64, flock.l_start, flock.l_len, - frame->root->lk_owner); + flock.l_start = inodelk->flock.l_start; + flock.l_len = inodelk->flock.l_len; + flock.l_type = F_UNLCK; - call_count = afr_locked_nodes_count (int_lock->inode_locked_nodes, + full_flock.l_type = F_UNLCK; + call_count = afr_locked_nodes_count (inodelk->locked_nodes, priv->child_count); int_lock->lk_call_count = call_count; @@ -621,11 +731,11 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this) fd_ctx = afr_fd_ctx_get (local->fd, this); for (i = 0; i < priv->child_count; i++) { - if ((int_lock->inode_locked_nodes[i] & LOCKED_YES) - != LOCKED_YES) + if ((inodelk->locked_nodes[i] & LOCKED_YES) != LOCKED_YES) continue; if (local->fd) { + flock_use = &flock; if (!local->transaction.eager_lock[i]) { goto wind; } @@ -637,43 +747,48 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this) if (fd_ctx->lock_piggyback[i]) { fd_ctx->lock_piggyback[i]--; piggyback = 1; + } else { + fd_ctx->lock_acquired[i]--; } } UNLOCK (&local->fd->lock); if (piggyback) { afr_unlock_inodelk_cbk (frame, (void *) (long) i, - this, 1, 0); + this, 1, 0, NULL); if (!--call_count) break; continue; } - fd_ctx->lock_acquired[i]--; + flock_use = &full_flock; wind: - afr_trace_inodelk_in (frame, AFR_INODELK_TRANSACTION, - AFR_UNLOCK_OP, &flock, F_SETLK, i); + AFR_TRACE_INODELK_IN (frame, this, + AFR_INODELK_TRANSACTION, + AFR_UNLOCK_OP, flock_use, F_SETLK, + i); STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk, (void *) (long)i, priv->children[i], priv->children[i]->fops->finodelk, - this->name, local->fd, - F_SETLK, &flock); + int_lock->domain, local->fd, + F_SETLK, flock_use, NULL); if (!--call_count) break; } else { - afr_trace_inodelk_in (frame, AFR_INODELK_TRANSACTION, + 
AFR_TRACE_INODELK_IN (frame, this, + AFR_INODELK_TRANSACTION, AFR_UNLOCK_OP, &flock, F_SETLK, i); STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk, (void *) (long)i, priv->children[i], priv->children[i]->fops->inodelk, - this->name, &local->loc, - F_SETLK, &flock); + int_lock->domain, &local->loc, + F_SETLK, &flock, NULL); if (!--call_count) break; @@ -685,24 +800,34 @@ out: static int32_t afr_unlock_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t *local = NULL; - int32_t child_index = (long)cookie; + afr_private_t *priv = NULL; + afr_internal_lock_t *int_lock = NULL; + int32_t child_index = 0; + int lockee_no = 0; + + priv = this->private; + lockee_no = (int)((long) cookie) / priv->child_count; + child_index = (int) ((long) cookie) % priv->child_count; local = frame->local; + int_lock = &local->internal_lock; - afr_trace_entrylk_out (frame, AFR_ENTRYLK_TRANSACTION, - AFR_UNLOCK_OP, NULL, op_ret, - op_errno, child_index); + AFR_TRACE_ENTRYLK_OUT (frame, this, AFR_ENTRYLK_TRANSACTION, + AFR_UNLOCK_OP, + int_lock->lockee[lockee_no].basename, op_ret, + op_errno, (int) ((long)cookie)); - if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) { + if (op_ret < 0) { gf_log (this->name, GF_LOG_ERROR, "%s: unlock failed on %d, reason: %s", local->loc.path, child_index, strerror (op_errno)); } - afr_unlock_common_cbk (frame, cookie, this, op_ret, op_errno); + int_lock->lockee[lockee_no].locked_nodes[child_index] &= LOCKED_NO; + afr_unlock_common_cbk (frame, cookie, this, op_ret, op_errno, NULL); return 0; } @@ -710,24 +835,22 @@ afr_unlock_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, static int afr_unlock_entrylk (call_frame_t *frame, xlator_t *this) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - const char *basename = NULL; - loc_t *loc = NULL; - int call_count = 0; - int i = 
-1; + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int index = 0; + int lockee_no = 0; + int copies = 0; + int i = -1; local = frame->local; int_lock = &local->internal_lock; priv = this->private; + copies = priv->child_count; - basename = int_lock->lk_basename; - if (int_lock->lk_loc) - loc = int_lock->lk_loc; + call_count = afr_lockee_locked_nodes_count (int_lock); - call_count = afr_locked_nodes_count (int_lock->entry_locked_nodes, - priv->child_count); int_lock->lk_call_count = call_count; if (!call_count){ @@ -737,18 +860,23 @@ afr_unlock_entrylk (call_frame_t *frame, xlator_t *this) goto out; } - for (i = 0; i < priv->child_count; i++) { - if (int_lock->entry_locked_nodes[i] & LOCKED_YES) { - afr_trace_entrylk_in (frame, AFR_ENTRYLK_NB_TRANSACTION, - AFR_UNLOCK_OP, basename, i); + for (i = 0; i < int_lock->lockee_count * priv->child_count; i++) { + lockee_no = i / copies; + index = i % copies; + if (int_lock->lockee[lockee_no].locked_nodes[index] & LOCKED_YES) { + AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_NB_TRANSACTION, + AFR_UNLOCK_OP, + int_lock->lockee[lockee_no].basename, + i); STACK_WIND_COOKIE (frame, afr_unlock_entrylk_cbk, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->entrylk, - this->name, - loc, basename, - ENTRYLK_UNLOCK, ENTRYLK_WRLCK); + priv->children[index], + priv->children[index]->fops->entrylk, + int_lock->domain, + &int_lock->lockee[lockee_no].loc, + int_lock->lockee[lockee_no].basename, + ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL); if (!--call_count) break; @@ -762,15 +890,22 @@ out: static int32_t afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - int child_index = (long) cookie; + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = 
NULL; + int cky = (long) cookie; + int child_index = 0; + int lockee_no = 0; + priv = this->private; local = frame->local; int_lock = &local->internal_lock; + child_index = ((int)cky) % priv->child_count; + lockee_no = ((int)cky) / priv->child_count; + LOCK (&frame->lock); { if (op_ret == -1) { @@ -786,6 +921,8 @@ afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->op_errno = op_errno; int_lock->lock_op_errno = op_errno; } + + int_lock->lk_attempted_count++; } UNLOCK (&frame->lock); @@ -794,10 +931,17 @@ afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, afr_unlock (frame, this); } else { if (op_ret == 0) { - int_lock->locked_nodes[child_index] |= LOCKED_YES; - int_lock->lock_count++; + if (local->transaction.type == AFR_ENTRY_TRANSACTION || + local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) { + int_lock->lockee[lockee_no].locked_nodes[child_index] |= LOCKED_YES; + int_lock->lockee[lockee_no].locked_count++; + int_lock->entrylk_lock_count++; + } else { + int_lock->locked_nodes[child_index] |= LOCKED_YES; + int_lock->lock_count++; + } } - afr_lock_blocking (frame, this, child_index + 1); + afr_lock_blocking (frame, this, cky + 1); } return 0; @@ -805,98 +949,26 @@ afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, static int32_t afr_blocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_trace_inodelk_out (frame, AFR_INODELK_TRANSACTION, + AFR_TRACE_INODELK_OUT (frame, this, AFR_INODELK_TRANSACTION, AFR_LOCK_OP, NULL, op_ret, op_errno, (long) cookie); - afr_lock_cbk (frame, cookie, this, op_ret, op_errno); + afr_lock_cbk (frame, cookie, this, op_ret, op_errno, xdata); return 0; } static int32_t -afr_lock_lower_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_internal_lock_t *int_lock = NULL; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; 
- loc_t *lower = NULL; - loc_t *higher = NULL; - const char *higher_name = NULL; - int child_index = (long) cookie; - - priv = this->private; - local = frame->local; - int_lock = &local->internal_lock; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - if (op_errno == ENOSYS) { - /* return ENOTSUP */ - - gf_log (this->name, GF_LOG_ERROR, - "subvolume does not support locking. " - "please load features/locks xlator on server"); - - local->op_ret = op_ret; - } - - local->op_errno = op_errno; - } - } - UNLOCK (&frame->lock); - - if (op_ret != 0) { - afr_unlock (frame, this); - goto out; - } else { - int_lock->lower_locked_nodes[child_index] |= LOCKED_LOWER; - int_lock->lock_count++; - } - - /* The lower path has been locked. Now lock the higher path */ - - lower = lower_path (&local->transaction.parent_loc, - local->transaction.basename, - &local->transaction.new_parent_loc, - local->transaction.new_basename); - - higher = (lower == &local->transaction.parent_loc ? - &local->transaction.new_parent_loc : - &local->transaction.parent_loc); - - higher_name = (higher == &local->transaction.parent_loc ? 
- local->transaction.basename : - local->transaction.new_basename); - - afr_trace_entrylk_in (frame, AFR_ENTRYLK_TRANSACTION, - AFR_LOCK_OP, higher_name, child_index); - - - STACK_WIND_COOKIE (frame, afr_lock_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->entrylk, - this->name, higher, higher_name, - ENTRYLK_LOCK, ENTRYLK_WRLCK); - -out: - return 0; -} - -static int32_t afr_blocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_trace_entrylk_out (frame, AFR_ENTRYLK_TRANSACTION, + AFR_TRACE_ENTRYLK_OUT (frame, this, AFR_ENTRYLK_TRANSACTION, AFR_LOCK_OP, NULL, op_ret, op_errno, (long)cookie); - afr_lock_cbk (frame, cookie, this, op_ret, op_errno); + afr_lock_cbk (frame, cookie, this, op_ret, op_errno, xdata); return 0; } @@ -904,6 +976,7 @@ static int afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; afr_private_t *priv = NULL; @@ -914,18 +987,16 @@ afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this) switch (local->transaction.type) { case AFR_DATA_TRANSACTION: case AFR_METADATA_TRANSACTION: - memcpy (int_lock->inode_locked_nodes, - int_lock->locked_nodes, - priv->child_count); - int_lock->inodelk_lock_count = int_lock->lock_count; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + memcpy (inodelk->locked_nodes, int_lock->locked_nodes, + sizeof (*inodelk->locked_nodes) * priv->child_count); + inodelk->lock_count = int_lock->lock_count; break; case AFR_ENTRY_RENAME_TRANSACTION: case AFR_ENTRY_TRANSACTION: - memcpy (int_lock->entry_locked_nodes, - int_lock->locked_nodes, - priv->child_count); - int_lock->entrylk_lock_count = int_lock->lock_count; + /*entrylk_count is being used in both non-blocking and blocking + * modes */ break; } @@ -933,25 +1004,67 @@ afr_copy_locked_nodes 
(call_frame_t *frame, xlator_t *this) } +static inline gf_boolean_t +afr_is_entrylk (afr_internal_lock_t *int_lock, + afr_transaction_type trans_type) +{ + gf_boolean_t is_entrylk = _gf_false; + + if ((int_lock->transaction_lk_type == AFR_SELFHEAL_LK) && + int_lock->selfheal_lk_type == AFR_ENTRY_SELF_HEAL_LK) { + + is_entrylk = _gf_true; + + } else if ((int_lock->transaction_lk_type == AFR_TRANSACTION_LK) && + (trans_type == AFR_ENTRY_TRANSACTION || + trans_type == AFR_ENTRY_RENAME_TRANSACTION)) { + + is_entrylk = _gf_true; + + } else { + is_entrylk = _gf_false; + } + + return is_entrylk; +} + +static gf_boolean_t +_is_lock_wind_needed (afr_local_t *local, int child_index) +{ + if (!local->child_up[child_index]) + return _gf_false; + + return _gf_true; +} + int -afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index) +afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; afr_private_t *priv = NULL; - loc_t *lower = NULL; - const char *lower_name = NULL; struct gf_flock flock = {0,}; uint64_t ctx = 0; int ret = 0; + int child_index = 0; + int lockee_no = 0; + gf_boolean_t is_entrylk = _gf_false; + + local = frame->local; + int_lock = &local->internal_lock; + priv = this->private; + child_index = cookie % priv->child_count; + lockee_no = cookie / priv->child_count; + is_entrylk = afr_is_entrylk (int_lock, local->transaction.type); - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; - flock.l_start = int_lock->lk_flock.l_start; - flock.l_len = int_lock->lk_flock.l_len; - flock.l_type = int_lock->lk_flock.l_type; + if (!is_entrylk) { + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + flock.l_start = inodelk->flock.l_start; + flock.l_len = inodelk->flock.l_len; + flock.l_type = inodelk->flock.l_type; + } if (local->fd) { ret = fd_ctx_get (local->fd, this, &ctx); @@ -970,42 +1083,26 @@ 
afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index) return 0; } - - /* skip over children that or down - or don't have the fd open */ - - while ((child_index < priv->child_count) - && (!local->child_up[child_index] || - !local->fd_open_on[child_index])) - - child_index++; - } else { - /* skip over children that are down */ - while ((child_index < priv->child_count) - && !local->child_up[child_index]) - child_index++; } - if ((child_index == priv->child_count) && - int_lock->lock_count == 0) { - - gf_log (this->name, GF_LOG_INFO, - "unable to lock on even one child"); - - local->op_ret = -1; - int_lock->lock_op_ret = -1; + if (int_lock->lk_expected_count == int_lock->lk_attempted_count) { + if ((is_entrylk && int_lock->entrylk_lock_count == 0) || + (!is_entrylk && int_lock->lock_count == 0)) { + gf_log (this->name, GF_LOG_INFO, + "unable to lock on even one child"); - afr_copy_locked_nodes (frame, this); + local->op_ret = -1; + int_lock->lock_op_ret = -1; - afr_unlock(frame, this); + afr_copy_locked_nodes (frame, this); - return 0; + afr_unlock(frame, this); + return 0; + } } - if ((child_index == priv->child_count) - || (int_lock->lock_count == int_lock->lk_expected_count)) { - + if (int_lock->lk_expected_count == int_lock->lk_attempted_count) { /* we're done locking */ gf_log (this->name, GF_LOG_DEBUG, @@ -1018,12 +1115,18 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index) return 0; } + if (!_is_lock_wind_needed (local, child_index)) { + afr_lock_blocking (frame, this, cookie + 1); + return 0; + } + switch (local->transaction.type) { case AFR_DATA_TRANSACTION: case AFR_METADATA_TRANSACTION: if (local->fd) { - afr_trace_inodelk_in (frame, AFR_INODELK_TRANSACTION, + AFR_TRACE_INODELK_IN (frame, this, + AFR_INODELK_TRANSACTION, AFR_LOCK_OP, &flock, F_SETLKW, child_index); @@ -1031,11 +1134,12 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index) (void *) (long) child_index, 
priv->children[child_index], priv->children[child_index]->fops->finodelk, - this->name, local->fd, - F_SETLKW, &flock); + int_lock->domain, local->fd, + F_SETLKW, &flock, NULL); } else { - afr_trace_inodelk_in (frame, AFR_INODELK_TRANSACTION, + AFR_TRACE_INODELK_IN (frame, this, + AFR_INODELK_TRANSACTION, AFR_LOCK_OP, &flock, F_SETLKW, child_index); @@ -1043,63 +1147,44 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index) (void *) (long) child_index, priv->children[child_index], priv->children[child_index]->fops->inodelk, - this->name, &local->loc, - F_SETLKW, &flock); + int_lock->domain, &local->loc, + F_SETLKW, &flock, NULL); } break; case AFR_ENTRY_RENAME_TRANSACTION: - { - lower = lower_path (&local->transaction.parent_loc, - local->transaction.basename, - &local->transaction.new_parent_loc, - local->transaction.new_basename); - - lower_name = (lower == &local->transaction.parent_loc ? - local->transaction.basename : - local->transaction.new_basename); - - afr_trace_entrylk_in (frame, AFR_ENTRYLK_TRANSACTION, - AFR_LOCK_OP, lower_name, child_index); - - - STACK_WIND_COOKIE (frame, afr_lock_lower_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->entrylk, - this->name, lower, lower_name, - ENTRYLK_LOCK, ENTRYLK_WRLCK); - - break; - } - case AFR_ENTRY_TRANSACTION: + /*Accounting for child_index increments on 'down' + *and 'fd-less' children */ + if (local->fd) { - afr_trace_entrylk_in (frame, AFR_ENTRYLK_TRANSACTION, - AFR_LOCK_OP, local->transaction.basename, - child_index); + AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_TRANSACTION, + AFR_LOCK_OP, + int_lock->lockee[lockee_no].basename, + cookie); STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk, - (void *) (long) child_index, + (void *) (long) cookie, priv->children[child_index], priv->children[child_index]->fops->fentrylk, - this->name, local->fd, - local->transaction.basename, - ENTRYLK_LOCK, ENTRYLK_WRLCK); + int_lock->domain, 
local->fd, + int_lock->lockee[lockee_no].basename, + ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); } else { - afr_trace_entrylk_in (frame, AFR_ENTRYLK_TRANSACTION, + AFR_TRACE_ENTRYLK_IN (frame, this, + AFR_ENTRYLK_TRANSACTION, AFR_LOCK_OP, local->transaction.basename, child_index); STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk, - (void *) (long) child_index, + (void *) (long) cookie, priv->children[child_index], priv->children[child_index]->fops->entrylk, - this->name, - &local->transaction.parent_loc, - local->transaction.basename, - ENTRYLK_LOCK, ENTRYLK_WRLCK); + int_lock->domain, + &int_lock->lockee[lockee_no].loc, + int_lock->lockee[lockee_no].basename, + ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); } break; @@ -1127,11 +1212,12 @@ afr_blocking_lock (call_frame_t *frame, xlator_t *this) break; case AFR_ENTRY_RENAME_TRANSACTION: + case AFR_ENTRY_TRANSACTION: up_count = afr_up_children_count (local->child_up, priv->child_count); - int_lock->lk_expected_count = 2 * up_count; - //fallthrough - case AFR_ENTRY_TRANSACTION: + int_lock->lk_call_count = int_lock->lk_expected_count + = (int_lock->lockee_count * + up_count); initialize_entrylk_variables (frame, this); break; } @@ -1143,47 +1229,60 @@ afr_blocking_lock (call_frame_t *frame, xlator_t *this) static int32_t afr_nonblocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_internal_lock_t *int_lock = NULL; afr_local_t *local = NULL; int call_count = 0; int child_index = (long) cookie; + int copies = 0; + int index = 0; + int lockee_no = 0; + afr_private_t *priv = NULL; + + priv = this->private; + + copies = priv->child_count; + index = child_index % copies; + lockee_no = child_index / copies; local = frame->local; int_lock = &local->internal_lock; - afr_trace_entrylk_out (frame, AFR_ENTRYLK_TRANSACTION, - AFR_LOCK_OP, NULL, op_ret, + AFR_TRACE_ENTRYLK_OUT (frame, this, AFR_ENTRYLK_TRANSACTION, + AFR_LOCK_OP, + 
int_lock->lockee[lockee_no].basename, op_ret, op_errno, (long) cookie); - LOCK (&frame->lock); - { - call_count = --int_lock->lk_call_count; - } - UNLOCK (&frame->lock); - - if (op_ret < 0 ) { - if (op_errno == ENOSYS) { + LOCK (&frame->lock); + { + if (op_ret < 0 ) { + if (op_errno == ENOSYS) { /* return ENOTSUP */ - gf_log (this->name, GF_LOG_ERROR, - "subvolume does not support locking. " - "please load features/locks xlator on server"); - local->op_ret = op_ret; - int_lock->lock_op_ret = op_ret; + gf_log (this->name, GF_LOG_ERROR, + "subvolume does not support locking. " + "please load features/locks xlator on server"); + local->op_ret = op_ret; + int_lock->lock_op_ret = op_ret; + + int_lock->lock_op_errno = op_errno; + local->op_errno = op_errno; + } + } else if (op_ret == 0) { + int_lock->lockee[lockee_no].locked_nodes[index] |= \ + LOCKED_YES; + int_lock->lockee[lockee_no].locked_count++; + int_lock->entrylk_lock_count++; + } - int_lock->lock_op_errno = op_errno; - local->op_errno = op_errno; - } - } else if (op_ret == 0) { - int_lock->entry_locked_nodes[child_index] |= LOCKED_YES; - int_lock->entrylk_lock_count++; + call_count = --int_lock->lk_call_count; } + UNLOCK (&frame->lock); if (call_count == 0) { gf_log (this->name, GF_LOG_TRACE, "Last locking reply received"); - /* all locks successfull. Proceed to call FOP */ + /* all locks successful. Proceed to call FOP */ if (int_lock->entrylk_lock_count == int_lock->lk_expected_count) { gf_log (this->name, GF_LOG_TRACE, @@ -1191,7 +1290,7 @@ afr_nonblocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int_lock->lock_op_ret = 0; int_lock->lock_cbk (frame, this); } - /* Not all locks were successfull. Unlock and try locking + /* Not all locks were successful. 
Unlock and try locking again, this time with serially blocking locks */ else { gf_log (this->name, GF_LOG_TRACE, @@ -1205,42 +1304,26 @@ afr_nonblocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, return 0; } -void -afr_mark_fd_open_on (afr_local_t *local, afr_fd_ctx_t *fd_ctx, - size_t child_count) -{ - int i = 0; - - GF_ASSERT (local->fd_open_on); - - memset (local->fd_open_on, 0, sizeof (*local->fd_open_on)*child_count); - for (i = 0; i < child_count; i++) - if (fd_ctx->opened_on[i] == AFR_FD_OPENED) - local->fd_open_on[i] = 1; -} - int afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_fd_ctx_t *fd_ctx = NULL; - const char *basename = NULL; - loc_t *loc = NULL; - int32_t call_count = 0; + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + int copies = 0; + int index = 0; + int lockee_no = 0; + int32_t call_count = 0; int i = 0; local = frame->local; int_lock = &local->internal_lock; priv = this->private; + copies = priv->child_count; initialize_entrylk_variables (frame, this); - basename = int_lock->lk_basename; - if (int_lock->lk_loc) - loc = int_lock->lk_loc; - if (local->fd) { fd_ctx = afr_fd_ctx_get (local->fd, this); if (!fd_ctx) { @@ -1253,11 +1336,11 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this) local->op_errno = EINVAL; int_lock->lock_op_errno = EINVAL; + afr_unlock (frame, this); return -1; } - afr_mark_fd_open_on (local, fd_ctx, priv->child_count); - call_count = internal_lock_count (frame, this); + call_count = int_lock->lockee_count * internal_lock_count (frame, this); int_lock->lk_call_count = call_count; int_lock->lk_expected_count = call_count; @@ -1270,42 +1353,52 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this) /* Send non-blocking entrylk calls only on up children and where the fd has been opened 
*/ - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] && local->fd_open_on[i]) { - afr_trace_entrylk_in (frame, AFR_ENTRYLK_NB_TRANSACTION, - AFR_LOCK_OP, basename, i); + for (i = 0; i < int_lock->lockee_count*priv->child_count; i++) { + index = i%copies; + lockee_no = i/copies; + if (local->child_up[index]) { + AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_NB_TRANSACTION, + AFR_LOCK_OP, + int_lock->lockee[lockee_no].basename, + i); STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fentrylk, + priv->children[index], + priv->children[index]->fops->fentrylk, this->name, local->fd, - basename, - ENTRYLK_LOCK_NB, ENTRYLK_WRLCK); + int_lock->lockee[lockee_no].basename, + ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, + NULL); + if (!--call_count) + break; } } } else { - GF_ASSERT (loc); - - call_count = internal_lock_count (frame, this); + call_count = int_lock->lockee_count * internal_lock_count (frame, this); int_lock->lk_call_count = call_count; int_lock->lk_expected_count = call_count; - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - afr_trace_entrylk_in (frame, AFR_ENTRYLK_NB_TRANSACTION, - AFR_LOCK_OP, basename, i); + for (i = 0; i < int_lock->lockee_count*priv->child_count; i++) { + index = i%copies; + lockee_no = i/copies; + if (local->child_up[index]) { + AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_NB_TRANSACTION, + AFR_LOCK_OP, + int_lock->lockee[lockee_no].basename, + i); STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->entrylk, - this->name, loc, basename, - ENTRYLK_LOCK_NB, ENTRYLK_WRLCK); + priv->children[index], + priv->children[index]->fops->entrylk, + this->name, &int_lock->lockee[lockee_no].loc, + int_lock->lockee[lockee_no].basename, + ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, + NULL); if (!--call_count) break; - } } } @@ -1315,76 +1408,75 @@ out: int32_t afr_nonblocking_inodelk_cbk 
(call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; int call_count = 0; int child_index = (long) cookie; afr_fd_ctx_t *fd_ctx = NULL; - afr_private_t *priv = NULL; - priv = this->private; local = frame->local; int_lock = &local->internal_lock; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); - afr_trace_inodelk_out (frame, AFR_INODELK_NB_TRANSACTION, + AFR_TRACE_INODELK_OUT (frame, this, AFR_INODELK_NB_TRANSACTION, AFR_LOCK_OP, NULL, op_ret, op_errno, (long) cookie); + if (local->fd) + fd_ctx = afr_fd_ctx_get (local->fd, this); + LOCK (&frame->lock); { + if (op_ret < 0) { + if (op_errno == ENOSYS) { + /* return ENOTSUP */ + gf_log (this->name, GF_LOG_ERROR, + "subvolume does not support locking. " + "please load features/locks xlator on " + "server"); + local->op_ret = op_ret; + int_lock->lock_op_ret = op_ret; + int_lock->lock_op_errno = op_errno; + local->op_errno = op_errno; + } + if (local->transaction.eager_lock) + local->transaction.eager_lock[child_index] = 0; + } else { + inodelk->locked_nodes[child_index] |= LOCKED_YES; + inodelk->lock_count++; + + if (local->transaction.eager_lock && + local->transaction.eager_lock[child_index] && + local->fd) { + /* piggybacked */ + if (op_ret == 1) { + /* piggybacked */ + } else if (op_ret == 0) { + /* lock acquired from server */ + fd_ctx->lock_acquired[child_index]++; + } + } + } + call_count = --int_lock->lk_call_count; } UNLOCK (&frame->lock); - if (op_ret < 0) { - if (op_errno == ENOSYS) { - /* return ENOTSUP */ - gf_log (this->name, GF_LOG_ERROR, - "subvolume does not support locking. 
" - "please load features/locks xlator on server"); - local->op_ret = op_ret; - int_lock->lock_op_ret = op_ret; - int_lock->lock_op_errno = op_errno; - local->op_errno = op_errno; - } - } else { - int_lock->inode_locked_nodes[child_index] - |= LOCKED_YES; - int_lock->inodelk_lock_count++; - - if (priv->eager_lock && local->fd) { - fd_ctx = afr_fd_ctx_get (local->fd, this); - local->transaction.eager_lock[child_index] = 1; - /* piggybacked */ - - if (op_ret == 1) { - /* piggybacked */ - } else if (op_ret == 0) { - /* lock acquired from server */ - LOCK (&local->fd->lock); - { - fd_ctx->lock_acquired[child_index]++; - } - UNLOCK (&local->fd->lock); - } - } - } - if (call_count == 0) { gf_log (this->name, GF_LOG_TRACE, "Last inode locking reply received"); - /* all locks successfull. Proceed to call FOP */ - if (int_lock->inodelk_lock_count == - int_lock->lk_expected_count) { + /* all locks successful. Proceed to call FOP */ + if (inodelk->lock_count == int_lock->lk_expected_count) { gf_log (this->name, GF_LOG_TRACE, "All servers locked. Calling the cbk"); int_lock->lock_op_ret = 0; int_lock->lock_cbk (frame, this); } - /* Not all locks were successfull. Unlock and try locking + /* Not all locks were successful. 
Unlock and try locking again, this time with serially blocking locks */ else { gf_log (this->name, GF_LOG_TRACE, @@ -1402,30 +1494,29 @@ int afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; afr_private_t *priv = NULL; afr_fd_ctx_t *fd_ctx = NULL; - int32_t call_count = 0; - int i = 0; - int ret = 0; - struct gf_flock flock = {0,}; - struct gf_flock full_flock = {0,}; - struct gf_flock *flock_use = &flock; - int piggyback = 0; + int32_t call_count = 0; + int i = 0; + int ret = 0; + struct gf_flock flock = {0,}; + struct gf_flock full_flock = {0,}; + struct gf_flock *flock_use = NULL; + int piggyback = 0; local = frame->local; int_lock = &local->internal_lock; priv = this->private; - flock.l_start = int_lock->lk_flock.l_start; - flock.l_len = int_lock->lk_flock.l_len; - flock.l_type = int_lock->lk_flock.l_type; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); - gf_log (this->name, GF_LOG_DEBUG, "attempting data lock range %"PRIu64 - " %"PRIu64" by %"PRIu64, flock.l_start, flock.l_len, - frame->root->lk_owner); + flock.l_start = inodelk->flock.l_start; + flock.l_len = inodelk->flock.l_len; + flock.l_type = inodelk->flock.l_type; - full_flock.l_type = int_lock->lk_flock.l_type; + full_flock.l_type = inodelk->flock.l_type; initialize_inodelk_variables (frame, this); @@ -1441,11 +1532,11 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) local->op_errno = EINVAL; int_lock->lock_op_errno = EINVAL; + afr_unlock (frame, this); ret = -1; goto out; } - afr_mark_fd_open_on (local, fd_ctx, priv->child_count); call_count = internal_lock_count (frame, this); int_lock->lk_call_count = call_count; int_lock->lk_expected_count = call_count; @@ -1460,14 +1551,18 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) /* Send non-blocking inodelk calls only on up children and where the fd has been opened */ for (i = 0; i < priv->child_count; 
i++) { - if (!local->child_up[i] || !local->fd_open_on[i]) + if (!local->child_up[i]) continue; - if (!priv->eager_lock) + flock_use = &flock; + if (!local->transaction.eager_lock_on) { goto wind; + } - flock_use = &full_flock; piggyback = 0; + local->transaction.eager_lock[i] = 1; + + afr_set_delayed_post_op (frame, this); LOCK (&local->fd->lock); { @@ -1481,21 +1576,23 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) if (piggyback) { /* (op_ret == 1) => indicate piggybacked lock */ afr_nonblocking_inodelk_cbk (frame, (void *) (long) i, - this, 1, 0); + this, 1, 0, NULL); if (!--call_count) break; continue; } + flock_use = &full_flock; wind: - afr_trace_inodelk_in (frame, AFR_INODELK_NB_TRANSACTION, + AFR_TRACE_INODELK_IN (frame, this, + AFR_INODELK_NB_TRANSACTION, AFR_LOCK_OP, flock_use, F_SETLK, i); STACK_WIND_COOKIE (frame, afr_nonblocking_inodelk_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->finodelk, - this->name, local->fd, - F_SETLK, flock_use); + int_lock->domain, local->fd, + F_SETLK, flock_use, NULL); if (!--call_count) break; @@ -1508,15 +1605,16 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) for (i = 0; i < priv->child_count; i++) { if (!local->child_up[i]) continue; - afr_trace_inodelk_in (frame, AFR_INODELK_NB_TRANSACTION, + AFR_TRACE_INODELK_IN (frame, this, + AFR_INODELK_NB_TRANSACTION, AFR_LOCK_OP, &flock, F_SETLK, i); STACK_WIND_COOKIE (frame, afr_nonblocking_inodelk_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->inodelk, - this->name, &local->loc, - F_SETLK, &flock); + int_lock->domain, &local->loc, + F_SETLK, &flock, NULL); if (!--call_count) break; @@ -1526,200 +1624,6 @@ out: return ret; } -static int -__is_lower_locked (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int count = 0; - int i = 0; - - local = frame->local; - int_lock = &local->internal_lock; - priv = 
this->private; - - for (i = 0; i < priv->child_count; i++) { - if (int_lock->lower_locked_nodes[i] & LOCKED_LOWER) - count++; - } - - return count; - -} - -static int -__is_higher_locked (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int count = 0; - int i = 0; - - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; - - for (i = 0; i < priv->child_count; i++) { - if (int_lock->locked_nodes[i] & LOCKED_YES) - count++; - } - - return count; - -} - -static int -afr_unlock_lower_entrylk (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - const char *basename = NULL; - loc_t *loc = NULL; - int call_count = 0; - int i = -1; - - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; - - basename = int_lock->lk_basename; - if (int_lock->lk_loc) - loc = int_lock->lk_loc; - - call_count = __is_lower_locked (frame, this); - int_lock->lk_call_count = call_count; - - if (!call_count){ - gf_log (this->name, GF_LOG_TRACE, - "No internal locks unlocked"); - int_lock->lock_cbk (frame, this); - goto out; - } - - for (i = 0; i < priv->child_count; i++) { - if (int_lock->lower_locked_nodes[i] & LOCKED_LOWER) { - afr_trace_entrylk_in (frame, AFR_ENTRYLK_NB_TRANSACTION, - AFR_UNLOCK_OP, basename, i); - - STACK_WIND_COOKIE (frame, afr_unlock_entrylk_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->entrylk, - this->name, - loc, basename, - ENTRYLK_UNLOCK, ENTRYLK_WRLCK); - - if (!--call_count) - break; - - } - } - -out: - return 0; - -} - - -static int -afr_post_unlock_higher_cbk (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - - local = frame->local; - - local->transaction.done (frame, this); - return 0; -} - -static int -afr_post_unlock_lower_cbk (call_frame_t *frame, xlator_t *this) -{ - 
afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - loc_t *lower = NULL; - loc_t *higher = NULL; - const char *higher_name = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - - lower = lower_path (&local->transaction.parent_loc, - local->transaction.basename, - &local->transaction.new_parent_loc, - local->transaction.new_basename); - - higher = (lower == &local->transaction.parent_loc ? - &local->transaction.new_parent_loc : - &local->transaction.parent_loc); - - higher_name = (higher == &local->transaction.parent_loc ? - local->transaction.basename : - local->transaction.new_basename); - - if (__is_higher_locked (frame, this)) { - gf_log (this->name, GF_LOG_DEBUG, - "unlocking higher"); - int_lock->lk_basename = higher_name; - int_lock->lk_loc = higher; - int_lock->lock_cbk = afr_post_unlock_higher_cbk; - - afr_unlock_entrylk (frame, this); - } else - local->transaction.done (frame, this); - - return 0; -} - -static int -afr_rename_unlock (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - loc_t *lower = NULL; - const char *lower_name = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - - lower = lower_path (&local->transaction.parent_loc, - local->transaction.basename, - &local->transaction.new_parent_loc, - local->transaction.new_basename); - - lower_name = (lower == &local->transaction.parent_loc ? 
- local->transaction.basename : - local->transaction.new_basename); - - if (__is_lower_locked (frame, this)) { - gf_log (this->name, GF_LOG_DEBUG, - "unlocking lower"); - int_lock->lk_basename = lower_name; - int_lock->lk_loc = lower; - int_lock->lock_cbk = afr_post_unlock_lower_cbk; - - afr_unlock_lower_entrylk (frame, this); - } else - afr_post_unlock_lower_cbk (frame, this); - - return 0; -} - -static int -afr_rename_transaction (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - - local = frame->local; - - return (local->transaction.type == - AFR_ENTRY_RENAME_TRANSACTION); - -} - int32_t afr_unlock (call_frame_t *frame, xlator_t *this) { @@ -1731,10 +1635,8 @@ afr_unlock (call_frame_t *frame, xlator_t *this) if (is_afr_lock_transaction (local)) afr_unlock_inodelk (frame, this); else - if (!afr_rename_transaction (frame, this)) - afr_unlock_entrylk (frame, this); - else - afr_rename_unlock (frame, this); + afr_unlock_entrylk (frame, this); + } else { if (is_afr_lock_selfheal (local)) afr_unlock_inodelk (frame, this); @@ -1903,10 +1805,12 @@ out: int32_t afr_get_locks_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct gf_flock *lock); + int32_t op_ret, int32_t op_errno, struct gf_flock *lock, + dict_t *xdata); int32_t afr_recover_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct gf_flock *lock) + int32_t op_ret, int32_t op_errno, struct gf_flock *lock, + dict_t *xdata) { afr_local_t *local = NULL; afr_private_t *priv = NULL; @@ -1930,7 +1834,7 @@ afr_recover_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, (void *) (long) source_child, priv->children[source_child], priv->children[source_child]->fops->lk, - local->fd, F_GETLK_FD, &flock); + local->fd, F_GETLK_FD, &flock, NULL); return 0; @@ -1958,7 +1862,7 @@ afr_recover_lock (call_frame_t *frame, xlator_t *this, (void *) (long) lock_recovery_child, 
priv->children[lock_recovery_child], priv->children[lock_recovery_child]->fops->lk, - local->fd, F_SETLK, flock); + local->fd, F_SETLK, flock, NULL); return 0; } @@ -1976,7 +1880,8 @@ is_afr_lock_eol (struct gf_flock *lock) int32_t afr_get_locks_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct gf_flock *lock) + int32_t op_ret, int32_t op_errno, struct gf_flock *lock, + dict_t *xdata) { if (op_ret) { gf_log (this->name, GF_LOG_INFO, @@ -2036,7 +1941,7 @@ afr_lock_recovery (call_frame_t *frame, xlator_t *this) (void *) (long) source_child, priv->children[source_child], priv->children[source_child]->fops->lk, - local->fd, F_GETLK_FD, &flock); + local->fd, F_GETLK_FD, &flock, NULL); out: return ret; @@ -2064,7 +1969,8 @@ out: int32_t afr_lock_recovery_preopen_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd) + int32_t op_ret, int32_t op_errno, fd_t *fd, + dict_t *xdata) { int32_t child_index = (long )cookie; int ret = 0; @@ -2136,8 +2042,7 @@ afr_lock_recovery_preopen (call_frame_t *frame, xlator_t *this) (void *)(long) child_index, priv->children[child_index], priv->children[child_index]->fops->open, - &loc, fdctx->flags, local->fd, - fdctx->wbflags); + &loc, fdctx->flags, local->fd, NULL); return 0; } @@ -2165,13 +2070,14 @@ out: int afr_attempt_lock_recovery (xlator_t *this, int32_t child_index) { - call_frame_t *frame = NULL; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_locked_fd_t *locked_fd = NULL; + call_frame_t *frame = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_locked_fd_t *locked_fd = NULL; afr_locked_fd_t *tmp = NULL; - int ret = 0; + int ret = -1; struct list_head locks_list = {0,}; + int32_t op_errno = 0; priv = this->private; @@ -2185,15 +2091,10 @@ afr_attempt_lock_recovery (xlator_t *this, int32_t child_index) goto out; } - local = GF_CALLOC (1, sizeof (*local), - gf_afr_mt_afr_local_t); - if (!local) { 
- ret = -1; - goto out; - } - - AFR_LOCAL_INIT (local, priv); - if (!local) { + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) { ret = -1; goto out; } @@ -2231,5 +2132,43 @@ afr_attempt_lock_recovery (xlator_t *this, int32_t child_index) } out: + if ((ret < 0) && frame) + AFR_STACK_DESTROY (frame); + return ret; +} + +int +afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom, + unsigned int child_count) +{ + afr_local_t *dst_local = NULL; + afr_local_t *src_local = NULL; + afr_internal_lock_t *dst_lock = NULL; + afr_internal_lock_t *src_lock = NULL; + afr_inodelk_t *dst_inodelk = NULL; + afr_inodelk_t *src_inodelk = NULL; + int ret = -1; + + src_local = src->local; + src_lock = &src_local->internal_lock; + src_inodelk = afr_get_inodelk (src_lock, dom); + dst_local = dst->local; + dst_lock = &dst_local->internal_lock; + dst_inodelk = afr_get_inodelk (dst_lock, dom); + if (!dst_inodelk || !src_inodelk) + goto out; + if (src_inodelk->locked_nodes) { + memcpy (dst_inodelk->locked_nodes, src_inodelk->locked_nodes, + sizeof (*dst_inodelk->locked_nodes) * child_count); + memset (src_inodelk->locked_nodes, 0, + sizeof (*src_inodelk->locked_nodes) * child_count); + } + + dst_lock->transaction_lk_type = src_lock->transaction_lk_type; + dst_lock->selfheal_lk_type = src_lock->selfheal_lk_type; + dst_inodelk->lock_count = src_inodelk->lock_count; + src_inodelk->lock_count = 0; + ret = 0; +out: return ret; } diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h index ebe189c35..73594f265 100644 --- a/xlators/cluster/afr/src/afr-mem-types.h +++ b/xlators/cluster/afr/src/afr-mem-types.h @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. 
- GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ @@ -26,7 +17,6 @@ enum gf_afr_mem_types_ { gf_afr_mt_iovec = gf_common_mt_end + 1, gf_afr_mt_afr_fd_ctx_t, - gf_afr_mt_afr_local_t, gf_afr_mt_afr_private_t, gf_afr_mt_int32_t, gf_afr_mt_char, @@ -44,8 +34,17 @@ enum gf_afr_mem_types_ { gf_afr_mt_locked_fd, gf_afr_mt_inode_ctx_t, gf_afr_fd_paused_call_t, - gf_afr_mt_afr_crawl_data_t, - gf_afr_mt_afr_brick_pos_t, + gf_afr_mt_crawl_data_t, + gf_afr_mt_brick_pos_t, + gf_afr_mt_shd_bool_t, + gf_afr_mt_shd_timer_t, + gf_afr_mt_shd_event_t, + gf_afr_mt_time_t, + gf_afr_mt_pos_data_t, + gf_afr_mt_reply_t, + gf_afr_mt_stats_t, + gf_afr_mt_shd_crawl_event_t, + gf_afr_mt_uint64_t, gf_afr_mt_end }; #endif diff --git a/xlators/cluster/afr/src/afr-open.c b/xlators/cluster/afr/src/afr-open.c index 646d23ccb..643a5d692 100644 --- a/xlators/cluster/afr/src/afr-open.c +++ b/xlators/cluster/afr/src/afr-open.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. 
- GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #include <libgen.h> @@ -123,7 +114,7 @@ out: int afr_open_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) + struct iatt *postbuf, dict_t *xdata) { afr_local_t * local = frame->local; afr_private_t *priv = NULL; @@ -132,7 +123,7 @@ afr_open_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (afr_open_only_data_self_heal (priv->data_self_heal)) afr_perform_data_self_heal (frame, this); AFR_STACK_UNWIND (open, frame, local->op_ret, local->op_errno, - local->fd); + local->fd, xdata); return 0; } @@ -140,7 +131,7 @@ afr_open_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int afr_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - fd_t *fd) + fd_t *fd, dict_t *xdata) { afr_local_t * local = NULL; int ret = 0; @@ -162,8 +153,7 @@ afr_open_cbk (call_frame_t *frame, void *cookie, local->success_count++; ret = afr_child_fd_ctx_set (this, fd, child_index, - local->cont.open.flags, - local->cont.open.wbflags); + 
local->cont.open.flags); if (ret) { local->op_ret = -1; local->op_errno = -ret; @@ -181,12 +171,12 @@ unlock: && (local->op_ret >= 0)) { STACK_WIND (frame, afr_open_ftruncate_cbk, this, this->fops->ftruncate, - fd, 0); + fd, 0, NULL); } else { if (afr_open_only_data_self_heal (priv->data_self_heal)) afr_perform_data_self_heal (frame, this); AFR_STACK_UNWIND (open, frame, local->op_ret, - local->op_errno, local->fd); + local->op_errno, local->fd, xdata); } } @@ -195,14 +185,13 @@ unlock: int afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - fd_t *fd, int32_t wbflags) + fd_t *fd, dict_t *xdata) { afr_private_t * priv = NULL; afr_local_t * local = NULL; int i = 0; int ret = -1; int32_t call_count = 0; - int32_t op_ret = -1; int32_t op_errno = 0; int32_t wind_flags = flags & (~O_TRUNC); //We can't let truncation to happen outside transaction. @@ -214,6 +203,10 @@ afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, priv = this->private; + if (flags & (O_CREAT|O_TRUNC)) { + QUORUM_CHECK(open,out); + } + if (afr_is_split_brain (this, loc->inode)) { /* self-heal failed */ gf_log (this->name, GF_LOG_WARNING, @@ -222,20 +215,17 @@ afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, goto out; } - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } - frame->local = local; call_count = local->call_count; loc_copy (&local->loc, loc); local->cont.open.flags = flags; - local->cont.open.wbflags = wbflags; local->fd = fd_ref (fd); @@ -244,86 +234,45 @@ afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, STACK_WIND_COOKIE (frame, afr_open_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->open, - loc, wind_flags, fd, wbflags); + loc, wind_flags, fd, xdata); if 
(!--call_count) break; } } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (open, frame, op_ret, op_errno, fd); - } + if (ret < 0) + AFR_STACK_UNWIND (open, frame, -1, op_errno, fd, xdata); return 0; } -//NOTE: this function should be called with holding the lock on -//fd to which fd_ctx belongs -void -afr_get_resumable_calls (xlator_t *this, afr_fd_ctx_t *fd_ctx, - struct list_head *list) -{ - afr_fd_paused_call_t *paused_call = NULL; - afr_fd_paused_call_t *tmp = NULL; - afr_local_t *call_local = NULL; - afr_private_t *priv = NULL; - int i = 0; - gf_boolean_t call = _gf_false; - - priv = this->private; - list_for_each_entry_safe (paused_call, tmp, &fd_ctx->paused_calls, - call_list) { - call = _gf_true; - call_local = paused_call->frame->local; - for (i = 0; i < priv->child_count; i++) { - if (call_local->child_up[i] && - (fd_ctx->opened_on[i] == AFR_FD_OPENING)) - call = _gf_false; - } - - if (call) { - list_del_init (&paused_call->call_list); - list_add (&paused_call->call_list, list); - } - } -} - -void -afr_resume_calls (xlator_t *this, struct list_head *list) -{ - afr_fd_paused_call_t *paused_call = NULL; - afr_fd_paused_call_t *tmp = NULL; - afr_local_t *call_local = NULL; - - list_for_each_entry_safe (paused_call, tmp, list, call_list) { - list_del_init (&paused_call->call_list); - call_local = paused_call->frame->local; - call_local->fop_call_continue (paused_call->frame, this); - GF_FREE (paused_call); - } -} - int afr_openfd_fix_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd) + int32_t op_ret, int32_t op_errno, fd_t *fd, + dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_fd_ctx_t *fd_ctx = NULL; - int call_count = 0; - int child_index = (long) cookie; - struct list_head paused_calls = {0}; - gf_boolean_t fop_paused = _gf_false; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + int call_count = 0; + 
int child_index = (long) cookie; priv = this->private; local = frame->local; - call_count = afr_frame_return (frame); + if (op_ret >= 0) { + gf_log (this->name, GF_LOG_DEBUG, "fd for %s opened " + "successfully on subvolume %s", local->loc.path, + priv->children[child_index]->name); + } else { + gf_log (this->name, GF_LOG_ERROR, "Failed to open %s " + "on subvolume %s", local->loc.path, + priv->children[child_index]->name); + } - //Note: No frame locking needed for this block of code fd_ctx = afr_fd_ctx_get (local->fd, this); if (!fd_ctx) { gf_log (this->name, GF_LOG_WARNING, @@ -331,102 +280,103 @@ afr_openfd_fix_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto out; } - fop_paused = local->fop_paused; LOCK (&local->fd->lock); { if (op_ret >= 0) { fd_ctx->opened_on[child_index] = AFR_FD_OPENED; - gf_log (this->name, GF_LOG_INFO, "fd for %s opened " - "successfully on subvolume %s", local->loc.path, - priv->children[child_index]->name); } else { - //Change open status from OPENING to NOT OPENED. 
fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED; } - if (call_count == 0) { - INIT_LIST_HEAD (&paused_calls); - afr_get_resumable_calls (this, fd_ctx, &paused_calls); - } } UNLOCK (&local->fd->lock); out: - if (call_count == 0) { - afr_resume_calls (this, &paused_calls); - //If the fop is paused then resume_calls will continue the fop - if (fop_paused) - goto done; - - if (local->fop_call_continue) - local->fop_call_continue (frame, this); - else - AFR_STACK_DESTROY (frame); - } + call_count = afr_frame_return (frame); + if (call_count == 0) + AFR_STACK_DESTROY (frame); -done: return 0; } -int -afr_fix_open (call_frame_t *frame, xlator_t *this, afr_fd_ctx_t *fd_ctx, - int need_open_count, int *need_open) +void +afr_fix_open (xlator_t *this, fd_t *fd, size_t need_open_count, int *need_open) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int i = 0; - call_frame_t *open_frame = NULL; - afr_local_t *open_local = NULL; - int ret = -1; - GF_UNUSED int32_t op_errno = 0; - - GF_ASSERT (fd_ctx); - GF_ASSERT (need_open_count > 0); - GF_ASSERT (need_open); + afr_private_t *priv = NULL; + int i = 0; + call_frame_t *frame = NULL; + afr_local_t *local = NULL; + int ret = -1; + int32_t op_errno = 0; + afr_fd_ctx_t *fd_ctx = NULL; - local = frame->local; priv = this->private; - if (!local->fop_call_continue) { - open_frame = copy_frame (frame); - if (!open_frame) { - ret = -ENOMEM; - goto out; - } - ALLOC_OR_GOTO (open_local, afr_local_t, out); - open_frame->local = open_local; - ret = AFR_LOCAL_INIT (open_local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } - loc_copy (&open_local->loc, &local->loc); - open_local->fd = fd_ref (local->fd); - } else { - ret = 0; - open_frame = frame; - open_local = local; + + if (!afr_is_fd_fixable (fd) || !need_open || !need_open_count) + goto out; + + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) { + ret = -1; + goto out; } - open_local->call_count = need_open_count; + frame = create_frame (this, 
this->ctx->pool); + if (!frame) { + ret = -1; + goto out; + } - gf_log (this->name, GF_LOG_DEBUG, "need open count: %d", + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) + goto out; + + local->loc.inode = inode_ref (fd->inode); + ret = loc_path (&local->loc, NULL); + if (ret < 0) + goto out; + + local->fd = fd_ref (fd); + local->call_count = need_open_count; + + gf_log (this->name, GF_LOG_DEBUG, "need open count: %zd", need_open_count); for (i = 0; i < priv->child_count; i++) { - if (need_open[i]) { + if (!need_open[i]) + continue; + + if (IA_IFDIR == fd->inode->ia_type) { gf_log (this->name, GF_LOG_DEBUG, - "opening fd for %s on subvolume %s", + "opening fd for dir %s on subvolume %s", local->loc.path, priv->children[i]->name); - STACK_WIND_COOKIE (open_frame, afr_openfd_fix_open_cbk, + STACK_WIND_COOKIE (frame, afr_openfd_fix_open_cbk, + (void*) (long) i, + priv->children[i], + priv->children[i]->fops->opendir, + &local->loc, local->fd, + NULL); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "opening fd for file %s on subvolume %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_openfd_fix_open_cbk, (void *)(long) i, priv->children[i], priv->children[i]->fops->open, - &open_local->loc, fd_ctx->flags, - open_local->fd, fd_ctx->wbflags); - + &local->loc, + fd_ctx->flags & (~O_TRUNC), + local->fd, NULL); } + } + op_errno = 0; + ret = 0; out: - if (ret && open_frame) - AFR_STACK_DESTROY (open_frame); - return ret; + if (op_errno) + ret = -1; //For handling ALLOC_OR_GOTO + if (ret && frame) + AFR_STACK_DESTROY (frame); } diff --git a/xlators/cluster/afr/src/afr-self-heal-algorithm.c b/xlators/cluster/afr/src/afr-self-heal-algorithm.c index 4dfb85824..83846f152 100644 --- a/xlators/cluster/afr/src/afr-self-heal-algorithm.c +++ b/xlators/cluster/afr/src/afr-self-heal-algorithm.c @@ -1,23 +1,15 @@ /* - Copyright (c) 2010-2011 Gluster, Inc. 
<http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ +#include <openssl/md5.h> #include "glusterfs.h" #include "afr.h" #include "xlator.h" @@ -33,7 +25,6 @@ #include "compat-errno.h" #include "compat.h" #include "byte-order.h" -#include "md5.h" #include "afr-transaction.h" #include "afr-self-heal.h" @@ -72,8 +63,7 @@ sh_private_cleanup (call_frame_t *frame, xlator_t *this) sh = &local->self_heal; sh_priv = sh->private; - if (sh_priv) - GF_FREE (sh_priv); + GF_FREE (sh_priv); } static int @@ -104,17 +94,16 @@ sh_loop_driver_done (call_frame_t *sh_frame, xlator_t *this, local = sh_frame->local; sh = &local->self_heal; sh_priv = sh->private; - if (sh_priv) { total_blocks = sh_priv->total_blocks; diff_blocks = sh_priv->diff_blocks; } sh_private_cleanup (sh_frame, this); - if (sh->op_failed) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { GF_ASSERT (!last_loop_frame); //loop_finish should have happened and the old_loop should be NULL - gf_log (this->name, GF_LOG_INFO, + gf_log (this->name, GF_LOG_DEBUG, "self-heal aborting on %s", local->loc.path); @@ -122,20 +111,17 @@ sh_loop_driver_done (call_frame_t *sh_frame, xlator_t *this, } else { GF_ASSERT (last_loop_frame); if (diff_blocks == total_blocks) { - gf_log (this->name, GF_LOG_INFO, "full self-heal " + gf_log (this->name, GF_LOG_DEBUG, "full self-heal " "completed on %s",local->loc.path); } else { - gf_log (this->name, GF_LOG_INFO, + gf_log (this->name, GF_LOG_DEBUG, "diff self-heal on %s: completed. 
" "(%d blocks of %d were different (%.2f%%))", local->loc.path, diff_blocks, total_blocks, ((diff_blocks * 1.0)/total_blocks) * 100); } - if (sh_frame == last_loop_frame) - sh->old_loop_frame = NULL; - else - sh->old_loop_frame = last_loop_frame; + sh->old_loop_frame = last_loop_frame; local->self_heal.algo_completion_cbk (sh_frame, this); } @@ -156,17 +142,10 @@ sh_loop_finish (call_frame_t *loop_frame, xlator_t *this) loop_sh = &loop_local->self_heal; } - if (loop_sh && loop_sh->loop_completion_cbk) { - if (loop_sh->data_lock_held) { - afr_sh_data_unlock (loop_frame, this, - loop_sh->loop_completion_cbk); - } else { - loop_sh->loop_completion_cbk (loop_frame, this); - } + if (loop_sh && loop_sh->data_lock_held) { + afr_sh_data_unlock (loop_frame, this, this->name, + sh_destroy_frame); } else { - //default loop_completion_cbk destroys the loop_frame - if (loop_sh && !loop_sh->loop_completion_cbk) - GF_ASSERT (!loop_sh->data_lock_held); sh_destroy_frame (loop_frame, this); } out: @@ -185,7 +164,7 @@ sh_loop_lock_success (call_frame_t *loop_frame, xlator_t *this) sh_loop_finish (loop_sh->old_loop_frame, this); loop_sh->old_loop_frame = NULL; - gf_log (this->name, GF_LOG_DEBUG, "Aquired lock for range %"PRIu64 + gf_log (this->name, GF_LOG_DEBUG, "Acquired lock for range %"PRIu64 " %"PRIu64, loop_sh->offset, loop_sh->block_size); loop_sh->data_lock_held = _gf_true; loop_sh->sh_data_algo_start (loop_frame, this); @@ -205,16 +184,15 @@ sh_loop_lock_failure (call_frame_t *loop_frame, xlator_t *this) gf_log (this->name, GF_LOG_ERROR, "failed lock for range %"PRIu64 " %"PRIu64, loop_sh->offset, loop_sh->block_size); - if (loop_sh->old_loop_frame != loop_sh->sh_frame) - sh_loop_finish (loop_sh->old_loop_frame, this); + sh_loop_finish (loop_sh->old_loop_frame, this); loop_sh->old_loop_frame = NULL; sh_loop_return (sh_frame, this, loop_frame, -1, ENOTCONN); return 0; } static int -sh_loop_start (call_frame_t *sh_frame, xlator_t *this, off_t offset, - call_frame_t 
*old_loop_frame) +sh_loop_frame_create (call_frame_t *sh_frame, xlator_t *this, + call_frame_t *old_loop_frame, call_frame_t **loop_frame) { call_frame_t *new_loop_frame = NULL; afr_local_t *local = NULL; @@ -224,7 +202,9 @@ sh_loop_start (call_frame_t *sh_frame, xlator_t *this, off_t offset, afr_private_t *priv = NULL; GF_ASSERT (sh_frame); + GF_ASSERT (loop_frame); + *loop_frame = NULL; local = sh_frame->local; sh = &local->self_heal; priv = this->private; @@ -232,8 +212,9 @@ sh_loop_start (call_frame_t *sh_frame, xlator_t *this, off_t offset, new_loop_frame = copy_frame (sh_frame); if (!new_loop_frame) goto out; - //We want the frame to have same lk_oner as sh_frame - new_loop_local = afr_local_copy (local, this); + //We want the frame to have same lk_owner as sh_frame + //so that locks translator allows conflicting locks + new_loop_local = afr_self_heal_local_init (local, this); if (!new_loop_local) goto out; new_loop_frame->local = new_loop_local; @@ -248,30 +229,55 @@ sh_loop_start (call_frame_t *sh_frame, xlator_t *this, off_t offset, gf_afr_mt_char); if (!new_loop_sh->write_needed) goto out; - new_loop_sh->checksum = GF_CALLOC (priv->child_count, MD5_DIGEST_LEN, + new_loop_sh->checksum = GF_CALLOC (priv->child_count, MD5_DIGEST_LENGTH, gf_afr_mt_uint8_t); if (!new_loop_sh->checksum) goto out; - new_loop_sh->offset = offset; - new_loop_sh->block_size = sh->block_size; new_loop_sh->inode = inode_ref (sh->inode); new_loop_sh->sh_data_algo_start = sh->sh_data_algo_start; new_loop_sh->source = sh->source; new_loop_sh->active_sinks = sh->active_sinks; new_loop_sh->healing_fd = fd_ref (sh->healing_fd); new_loop_sh->file_has_holes = sh->file_has_holes; - new_loop_sh->loop_completion_cbk = sh_destroy_frame; new_loop_sh->old_loop_frame = old_loop_frame; new_loop_sh->sh_frame = sh_frame; + *loop_frame = new_loop_frame; + return 0; +out: + sh_destroy_frame (new_loop_frame, this); + return -ENOMEM; +} + +static int +sh_loop_start (call_frame_t *sh_frame, xlator_t *this, 
off_t offset, + call_frame_t *old_loop_frame) +{ + call_frame_t *new_loop_frame = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_local_t *new_loop_local = NULL; + afr_self_heal_t *new_loop_sh = NULL; + int ret = 0; + + GF_ASSERT (sh_frame); + + local = sh_frame->local; + sh = &local->self_heal; + + ret = sh_loop_frame_create (sh_frame, this, old_loop_frame, + &new_loop_frame); + if (ret) + goto out; + new_loop_local = new_loop_frame->local; + new_loop_sh = &new_loop_local->self_heal; + new_loop_sh->offset = offset; + new_loop_sh->block_size = sh->block_size; afr_sh_data_lock (new_loop_frame, this, offset, new_loop_sh->block_size, - sh_loop_lock_success, sh_loop_lock_failure); + _gf_true, this->name, sh_loop_lock_success, sh_loop_lock_failure); return 0; out: - sh->op_failed = 1; - if (new_loop_frame) { - new_loop_frame->local = new_loop_local; - } - if (old_loop_frame != sh_frame) + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + if (old_loop_frame) sh_loop_finish (old_loop_frame, this); sh_loop_return (sh_frame, this, new_loop_frame, -1, ENOMEM); return 0; @@ -281,7 +287,6 @@ static int sh_loop_driver (call_frame_t *sh_frame, xlator_t *this, gf_boolean_t is_first_call, call_frame_t *old_loop_frame) { - afr_private_t * priv = NULL; afr_local_t * local = NULL; afr_self_heal_t * sh = NULL; afr_sh_algo_private_t *sh_priv = NULL; @@ -289,6 +294,7 @@ sh_loop_driver (call_frame_t *sh_frame, xlator_t *this, blksize_t block_size = 0; int loop = 0; off_t offset = 0; + afr_private_t *priv = NULL; priv = this->private; local = sh_frame->local; @@ -297,19 +303,20 @@ sh_loop_driver (call_frame_t *sh_frame, xlator_t *this, LOCK (&sh_priv->lock); { - if (_gf_false == is_first_call) + if (!is_first_call) sh_priv->loops_running--; offset = sh_priv->offset; block_size = sh->block_size; - while ((!sh->eof_reached) && (0 == sh->op_failed) && - (sh_priv->loops_running < priv->data_self_heal_window_size) + while ((!sh->eof_reached) && + 
(!is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) && + (sh_priv->loops_running < priv->data_self_heal_window_size) && (sh_priv->offset < sh->file_size)) { loop++; sh_priv->offset += block_size; sh_priv->loops_running++; - if (_gf_false == is_first_call) + if (!is_first_call) break; } if (0 == sh_priv->loops_running) { @@ -321,7 +328,8 @@ sh_loop_driver (call_frame_t *sh_frame, xlator_t *this, if (0 == loop) { //loop finish does unlock, but the erasing of the pending //xattrs needs to happen before that so do not finish the loop - if (is_driver_done && !sh->op_failed) + if (is_driver_done && + !is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) goto driver_done; if (old_loop_frame) { sh_loop_finish (old_loop_frame, this); @@ -332,7 +340,7 @@ sh_loop_driver (call_frame_t *sh_frame, xlator_t *this, //If we have more loops to form we should finish previous loop after //the next loop lock while (loop--) { - if (sh->op_failed) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { // op failed in other loop, stop spawning more loops if (old_loop_frame) { sh_loop_finish (old_loop_frame, this); @@ -369,7 +377,6 @@ sh_loop_return (call_frame_t *sh_frame, xlator_t *this, call_frame_t *loop_frame sh = &sh_local->self_heal; if (loop_frame) { - GF_ASSERT (loop_frame != sh_frame); loop_local = loop_frame->local; if (loop_local) loop_sh = &loop_local->self_heal; @@ -379,7 +386,7 @@ sh_loop_return (call_frame_t *sh_frame, xlator_t *this, call_frame_t *loop_frame } if (op_ret == -1) { - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_set_error (sh, op_errno); if (loop_frame) { sh_loop_finish (loop_frame, this); @@ -395,7 +402,7 @@ sh_loop_return (call_frame_t *sh_frame, xlator_t *this, call_frame_t *loop_frame static int sh_loop_write_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *buf, - struct iatt *postbuf) + struct iatt *postbuf, dict_t *xdata) { afr_private_t * priv = NULL; afr_local_t * 
loop_local = NULL; @@ -427,13 +434,22 @@ sh_loop_write_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, priv->children[child_index]->name, strerror (op_errno)); - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_set_error (loop_sh, op_errno); + } else if (op_ret < loop_local->cont.writev.vector->iov_len) { + gf_log (this->name, GF_LOG_ERROR, + "incomplete write to %s on subvolume %s " + "(expected %lu, returned %d)", sh_local->loc.path, + priv->children[child_index]->name, + loop_local->cont.writev.vector->iov_len, op_ret); + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); } call_count = afr_frame_return (loop_frame); if (call_count == 0) { + iobref_unref(loop_local->cont.writev.iobref); + sh_loop_return (sh_frame, this, loop_frame, loop_sh->op_ret, loop_sh->op_errno); } @@ -441,12 +457,41 @@ sh_loop_write_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, return 0; } +static void +sh_prune_writes_needed (call_frame_t *sh_frame, call_frame_t *loop_frame, + afr_private_t *priv) +{ + afr_local_t *sh_local = NULL; + afr_self_heal_t *sh = NULL; + afr_local_t *loop_local = NULL; + afr_self_heal_t *loop_sh = NULL; + int i = 0; + + sh_local = sh_frame->local; + sh = &sh_local->self_heal; + + if (!strcmp (sh->algo->name, "diff")) + return; + + loop_local = loop_frame->local; + loop_sh = &loop_local->self_heal; + + /* full self-heal guarantees there exists atleast 1 file with size 0 + * That means for other files we can preserve holes that come after + * its size before 'trim' + */ + for (i = 0; i < priv->child_count; i++) { + if (loop_sh->write_needed[i] && + ((loop_sh->offset + 1) > sh->buf[i].ia_size)) + loop_sh->write_needed[i] = 0; + } +} static int sh_loop_read_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iovec *vector, int32_t count, struct iatt *buf, - struct iobref *iobref) + struct iobref *iobref, dict_t *xdata) { afr_private_t * priv = NULL; 
afr_local_t * loop_local = NULL; @@ -471,7 +516,7 @@ sh_loop_read_cbk (call_frame_t *loop_frame, void *cookie, if (op_ret <= 0) { if (op_ret < 0) { - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); gf_log (this->name, GF_LOG_ERROR, "read failed on %d " "for %s reason :%s", sh->source, sh_local->loc.path, strerror (errno)); @@ -484,18 +529,26 @@ sh_loop_read_cbk (call_frame_t *loop_frame, void *cookie, goto out; } - if (loop_sh->file_has_holes && iov_0filled (vector, count) == 0) { - gf_log (this->name, GF_LOG_DEBUG, "0 filled block"); - sh_loop_return (sh_frame, this, loop_frame, - op_ret, op_errno); - goto out; - } + if (loop_sh->file_has_holes && iov_0filled (vector, count) == 0) + sh_prune_writes_needed (sh_frame, loop_frame, priv); call_count = sh_number_of_writes_needed (loop_sh->write_needed, priv->child_count); - GF_ASSERT (call_count > 0); + if (call_count == 0) { + sh_loop_return (sh_frame, this, loop_frame, 0, 0); + goto out; + } + loop_local->call_count = call_count; + /* + * We only really need the request size at the moment, but the buffer + * is required if we want to issue a retry in the event of a short write. + * Therefore, we duplicate the vector and ref the iobref here... 
+ */ + loop_local->cont.writev.vector = iov_dup(vector, count); + loop_local->cont.writev.iobref = iobref_ref(iobref); + for (i = 0; i < priv->child_count; i++) { if (!loop_sh->write_needed[i]) continue; @@ -504,7 +557,7 @@ sh_loop_read_cbk (call_frame_t *loop_frame, void *cookie, priv->children[i], priv->children[i]->fops->writev, loop_sh->healing_fd, vector, count, - loop_sh->offset, iobref); + loop_sh->offset, 0, iobref, NULL); if (!--call_count) break; @@ -531,7 +584,7 @@ sh_loop_read (call_frame_t *loop_frame, xlator_t *this) priv->children[loop_sh->source], priv->children[loop_sh->source]->fops->readv, loop_sh->healing_fd, loop_sh->block_size, - loop_sh->offset); + loop_sh->offset, 0, NULL); return 0; } @@ -540,7 +593,8 @@ sh_loop_read (call_frame_t *loop_frame, xlator_t *this) static int sh_diff_checksum_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - uint32_t weak_checksum, uint8_t *strong_checksum) + uint32_t weak_checksum, uint8_t *strong_checksum, + dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *loop_local = NULL; @@ -572,10 +626,10 @@ sh_diff_checksum_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, "checksum on %s failed on subvolume %s (%s)", sh_local->loc.path, priv->children[child_index]->name, strerror (op_errno)); - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); } else { - memcpy (loop_sh->checksum + child_index * MD5_DIGEST_LEN, - strong_checksum, MD5_DIGEST_LEN); + memcpy (loop_sh->checksum + child_index * MD5_DIGEST_LENGTH, + strong_checksum, MD5_DIGEST_LENGTH); } call_count = afr_frame_return (loop_frame); @@ -585,9 +639,9 @@ sh_diff_checksum_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, if (sh->sources[i] || !sh_local->child_up[i]) continue; - if (memcmp (loop_sh->checksum + (i * MD5_DIGEST_LEN), - loop_sh->checksum + (sh->source * MD5_DIGEST_LEN), - MD5_DIGEST_LEN)) { + if (memcmp (loop_sh->checksum + (i * MD5_DIGEST_LENGTH), + 
loop_sh->checksum + (sh->source * MD5_DIGEST_LENGTH), + MD5_DIGEST_LENGTH)) { /* Checksums differ, so this block must be written to this sink @@ -610,7 +664,8 @@ sh_diff_checksum_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, } UNLOCK (&sh_priv->lock); - if (write_needed && !sh->op_failed) { + if (write_needed && + !is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { sh_loop_read (loop_frame, this); } else { sh_loop_return (sh_frame, this, loop_frame, @@ -643,7 +698,7 @@ sh_diff_checksum (call_frame_t *loop_frame, xlator_t *this) priv->children[loop_sh->source], priv->children[loop_sh->source]->fops->rchecksum, loop_sh->healing_fd, - loop_sh->offset, loop_sh->block_size); + loop_sh->offset, loop_sh->block_size, NULL); for (i = 0; i < priv->child_count; i++) { if (loop_sh->sources[i] || !loop_local->child_up[i]) @@ -654,7 +709,7 @@ sh_diff_checksum (call_frame_t *loop_frame, xlator_t *this) priv->children[i], priv->children[i]->fops->rchecksum, loop_sh->healing_fd, - loop_sh->offset, loop_sh->block_size); + loop_sh->offset, loop_sh->block_size, NULL); if (!--call_count) break; @@ -684,9 +739,42 @@ sh_full_read_write_to_sinks (call_frame_t *loop_frame, xlator_t *this) return 0; } -static int -sh_do_nothing (call_frame_t *frame, xlator_t *this) +afr_sh_algo_private_t* +afr_sh_priv_init () +{ + afr_sh_algo_private_t *sh_priv = NULL; + + sh_priv = GF_CALLOC (1, sizeof (*sh_priv), + gf_afr_mt_afr_private_t); + if (!sh_priv) + goto out; + + LOCK_INIT (&sh_priv->lock); +out: + return sh_priv; +} + +int +afr_sh_transfer_lock (call_frame_t *dst, call_frame_t *src, char *dom, + unsigned int child_count) { + afr_local_t *dst_local = NULL; + afr_self_heal_t *dst_sh = NULL; + afr_local_t *src_local = NULL; + afr_self_heal_t *src_sh = NULL; + int ret = -1; + + dst_local = dst->local; + dst_sh = &dst_local->self_heal; + src_local = src->local; + src_sh = &src_local->self_heal; + GF_ASSERT (src_sh->data_lock_held); + GF_ASSERT (!dst_sh->data_lock_held); + ret = 
afr_lk_transfer_datalock (dst, src, dom, child_count); + if (ret) + return ret; + src_sh->data_lock_held = _gf_false; + dst_sh->data_lock_held = _gf_true; return 0; } @@ -694,31 +782,37 @@ int afr_sh_start_loops (call_frame_t *sh_frame, xlator_t *this, afr_sh_algo_fn sh_data_algo_start) { - afr_local_t *sh_local = NULL; + call_frame_t *first_loop_frame = NULL; + afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; - afr_sh_algo_private_t *sh_priv = NULL; + int ret = 0; + afr_private_t *priv = NULL; - sh_local = sh_frame->local; - sh = &sh_local->self_heal; + local = sh_frame->local; + sh = &local->self_heal; + priv = this->private; - sh_priv = GF_CALLOC (1, sizeof (*sh_priv), - gf_afr_mt_afr_private_t); - if (!sh_priv) { - sh->op_failed = 1; - sh_loop_driver_done (sh_frame, this, NULL); + sh->sh_data_algo_start = sh_data_algo_start; + local->call_count = 0; + ret = sh_loop_frame_create (sh_frame, this, NULL, &first_loop_frame); + if (ret) + goto out; + ret = afr_sh_transfer_lock (first_loop_frame, sh_frame, this->name, + priv->child_count); + if (ret) + goto out; + sh->private = afr_sh_priv_init (); + if (!sh->private) { + ret = -1; goto out; } - - LOCK_INIT (&sh_priv->lock); - - sh->private = sh_priv; - sh->sh_data_algo_start = sh_data_algo_start; - - sh_local->call_count = 0; - - sh->loop_completion_cbk = sh_do_nothing; - sh_loop_driver (sh_frame, this, _gf_true, sh_frame); + sh_loop_driver (sh_frame, this, _gf_true, first_loop_frame); + ret = 0; out: + if (ret) { + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + sh_loop_driver_done (sh_frame, this, NULL); + } return 0; } diff --git a/xlators/cluster/afr/src/afr-self-heal-algorithm.h b/xlators/cluster/afr/src/afr-self-heal-algorithm.h index 04d8e8a6c..6b20789b1 100644 --- a/xlators/cluster/afr/src/afr-self-heal-algorithm.h +++ b/xlators/cluster/afr/src/afr-self-heal-algorithm.h @@ -1,26 +1,16 @@ /* - Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. 
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef __AFR_SELF_HEAL_ALGORITHM_H__ #define __AFR_SELF_HEAL_ALGORITHM_H__ - typedef int (*afr_sh_algo_fn) (call_frame_t *frame, xlator_t *this); diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index b11be3872..ef92b4205 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #include "glusterfs.h" @@ -27,6 +18,52 @@ #include "afr-self-heal.h" #include "pump.h" +#define ADD_FMT_STRING(msg, off, sh_str, status, print_log) \ + do { \ + if (AFR_SELF_HEAL_NOT_ATTEMPTED != status) { \ + off += snprintf (msg + off, sizeof (msg) - off, \ + " "sh_str" self heal %s,", \ + get_sh_completion_status (status));\ + print_log = 1; \ + } \ + } while (0) + +#define ADD_FMT_STRING_SYNC(msg, off, sh_str, status, print_log) \ + do { \ + if (AFR_SELF_HEAL_SYNC_BEGIN == status || \ + AFR_SELF_HEAL_FAILED == status) { \ + off += snprintf (msg + off, sizeof (msg) - off, \ + " "sh_str" self heal %s,", \ + get_sh_completion_status (status));\ + print_log = 1; \ + } \ + } while (0) + + +void +afr_sh_reset (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + memset (sh->child_errno, 0, + sizeof (*sh->child_errno) * priv->child_count); + memset (sh->buf, 0, sizeof (*sh->buf) * priv->child_count); + memset (sh->parentbufs, 0, + sizeof (*sh->parentbufs) * priv->child_count); + memset (sh->success, 0, sizeof (*sh->success) * priv->child_count); + memset (sh->locked_nodes, 0, + sizeof (*sh->locked_nodes) * priv->child_count); + sh->active_sinks = 0; + + afr_reset_xattr (sh->xattr, priv->child_count); +} + //Intersection[child]=1 if child is part of intersection void afr_children_intersection_get (int32_t *set1, int32_t *set2, @@ -81,21 +118,6 @@ 
afr_sh_mark_source_sinks (call_frame_t *frame, xlator_t *this) sh->active_sinks = active_sinks; } -/** - * sink_count - return number of sinks in sources array - */ - -int -afr_sh_sink_count (int sources[], int child_count) -{ - int i = 0; - int sinks = 0; - for (i = 0; i < child_count; i++) - if (!sources[i]) - sinks++; - return sinks; -} - int afr_sh_source_count (int sources[], int child_count) { @@ -112,8 +134,8 @@ void afr_sh_set_error (afr_self_heal_t *sh, int32_t op_errno) { sh->op_ret = -1; - if (afr_error_more_important (sh->op_errno, op_errno)) - sh->op_errno = op_errno; + sh->op_errno = afr_most_important_error(sh->op_errno, op_errno, + _gf_false); } void @@ -135,13 +157,85 @@ afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this) ptr += sprintf (ptr, "%d ", pending_matrix[i][j]); } sprintf (ptr, "]"); - gf_log (this->name, GF_LOG_DEBUG, - "pending_matrix: %s", buf); + gf_log (this->name, GF_LOG_DEBUG, "pending_matrix: %s", buf); } GF_FREE (buf); } +char* +afr_get_pending_matrix_str (int32_t *pending_matrix[], xlator_t *this) +{ + afr_private_t * priv = this->private; + char *buf = NULL; + char *ptr = NULL; + int i = 0; + int j = 0; + int child_count = priv->child_count; + char *matrix_begin = "[ [ "; + char *matrix_end = "] ]"; + char *seperator = "] [ "; + int pending_entry_strlen = 12; //Including space after entry + int matrix_begin_strlen = 0; + int matrix_end_strlen = 0; + int seperator_strlen = 0; + int string_length = 0; + char *msg = "- Pending matrix: "; + + /* + * for a list of lists of [ [ a b ] [ c d ] ] + * */ + + matrix_begin_strlen = strlen (matrix_begin); + matrix_end_strlen = strlen (matrix_end); + seperator_strlen = strlen (seperator); + string_length = matrix_begin_strlen + matrix_end_strlen + + (child_count -1) * seperator_strlen + + (child_count * child_count * pending_entry_strlen); + + buf = GF_CALLOC (1, 1 + strlen (msg) + string_length , gf_afr_mt_char); + if (!buf) + goto out; + + ptr = buf; + ptr += sprintf 
(ptr, "%s", msg); + ptr += sprintf (ptr, "%s", matrix_begin); + for (i = 0; i < priv->child_count; i++) { + for (j = 0; j < priv->child_count; j++) { + ptr += sprintf (ptr, "%d ", pending_matrix[i][j]); + } + if (i < priv->child_count -1) + ptr += sprintf (ptr, "%s", seperator); + } + + ptr += sprintf (ptr, "%s", matrix_end); + +out: + return buf; +} + +void +afr_sh_print_split_brain_log (int32_t *pending_matrix[], xlator_t *this, + const char *loc) +{ + char *buf = NULL; + char *free_ptr = NULL; + + buf = afr_get_pending_matrix_str (pending_matrix, this); + if (buf) + free_ptr = buf; + else + buf = ""; + + + gf_log (this->name, GF_LOG_ERROR, "Unable to self-heal contents of '%s'" + " (possible split-brain). Please delete the file from all but " + "the preferred subvolume.%s", loc, buf); + GF_FREE (free_ptr); + return; +} + + void afr_init_pending_matrix (int32_t **pending_matrix, size_t child_count) { @@ -180,6 +274,7 @@ afr_mark_ignorant_subvols_as_pending (int32_t **pending_matrix, int afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix, + unsigned char *ignorant_subvols, dict_t *xattr[], afr_transaction_type type, size_t child_count) { @@ -190,12 +285,6 @@ afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix, int i = 0; int j = 0; int k = 0; - unsigned char *ignorant_subvols = NULL; - - ignorant_subvols = GF_CALLOC (sizeof (*ignorant_subvols), child_count, - gf_afr_mt_char); - if (NULL == ignorant_subvols) - goto out; afr_init_pending_matrix (pending_matrix, child_count); @@ -213,7 +302,8 @@ afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix, * subvolume. 
*/ - ignorant_subvols[i] = 1; + if (ignorant_subvols) + ignorant_subvols[i] = 1; continue; } @@ -224,19 +314,14 @@ afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix, } } - afr_mark_ignorant_subvols_as_pending (pending_matrix, - ignorant_subvols, - child_count); - GF_FREE (ignorant_subvols); -out: return ret; } typedef enum { + AFR_NODE_INVALID, AFR_NODE_INNOCENT, AFR_NODE_FOOL, AFR_NODE_WISE, - AFR_NODE_INVALID = -1, } afr_node_type; typedef struct { @@ -316,7 +401,7 @@ afr_sh_wise_nodes_exist (afr_node_character *characters, int child_count) * It is 1 if no other wise node accuses it. * Only wise nodes with wisdom 1 are sources. * - * If no nodes with wisdom 1 exist, a split-brain has occured. + * If no nodes with wisdom 1 exist, a split-brain has occurred. */ static void @@ -416,6 +501,8 @@ afr_find_biggest_witness_among_fools (int32_t *witnesses, { int i = 0; int biggest_witness = -1; + int biggest_witness_idx = -1; + int biggest_witness_cnt = -1; GF_ASSERT (witnesses); GF_ASSERT (characters); @@ -425,10 +512,21 @@ afr_find_biggest_witness_among_fools (int32_t *witnesses, if (characters[i].type != AFR_NODE_FOOL) continue; - if (biggest_witness < witnesses[i]) + if (biggest_witness < witnesses[i]) { biggest_witness = witnesses[i]; + biggest_witness_idx = i; + biggest_witness_cnt = 1; + continue; + } + + if (biggest_witness == witnesses[i]) + biggest_witness_cnt++; } - return biggest_witness; + + if (biggest_witness_cnt != 1) + return -1; + + return biggest_witness_idx; } int @@ -456,10 +554,84 @@ afr_mark_fool_as_source_by_witness (int32_t *sources, int32_t *witnesses, return nsources; } + +int +afr_mark_fool_as_source_by_idx (int32_t *sources, int child_count, int idx) +{ + if (idx >= 0 && idx < child_count) { + sources[idx] = 1; + return 1; + } + return 0; +} + + +static int +afr_find_largest_file_size (struct iatt *bufs, int32_t *success_children, + int child_count) +{ + int idx = -1; + int i = -1; + int child = -1; + uint64_t max_size = 
0; + uint64_t min_size = 0; + int num_children = 0; + + for (i = 0; i < child_count; i++) { + if (success_children[i] == -1) + break; + + child = success_children[i]; + if (bufs[child].ia_size > max_size) { + max_size = bufs[child].ia_size; + idx = child; + } + + if ((num_children == 0) || (bufs[child].ia_size < min_size)) { + min_size = bufs[child].ia_size; + } + + num_children++; + } + + /* If sizes are same for all of them, finding sources will have to + * happen with pending changelog. So return -1 + */ + if ((num_children > 1) && (min_size == max_size)) + return -1; + return idx; +} + + +static int +afr_find_newest_file (struct iatt *bufs, int32_t *success_children, + int child_count) +{ + int idx = -1; + int i = -1; + int child = -1; + uint64_t max_ctime = 0; + + for (i = 0; i < child_count; i++) { + if (success_children[i] == -1) + break; + + child = success_children[i]; + if (bufs[child].ia_ctime > max_ctime) { + max_ctime = bufs[child].ia_ctime; + idx = child; + } + } + + return idx; +} + + static int afr_mark_biggest_of_fools_as_source (int32_t *sources, int32_t **pending_matrix, afr_node_character *characters, - int child_count) + int32_t *success_children, + int child_count, struct iatt *bufs) { int32_t biggest_witness = 0; int nsources = 0; @@ -467,6 +639,11 @@ afr_mark_biggest_of_fools_as_source (int32_t *sources, int32_t **pending_matrix, GF_ASSERT (child_count > 0); + biggest_witness = afr_find_largest_file_size (bufs, success_children, + child_count); + if (biggest_witness != -1) + goto found; + witnesses = GF_CALLOC (child_count, sizeof (*witnesses), gf_afr_mt_int32_t); if (NULL == witnesses) { @@ -479,34 +656,34 @@ afr_mark_biggest_of_fools_as_source (int32_t *sources, int32_t **pending_matrix, biggest_witness = afr_find_biggest_witness_among_fools (witnesses, characters, child_count); - nsources = afr_mark_fool_as_source_by_witness (sources, witnesses, - characters, child_count, - biggest_witness); + if (biggest_witness != -1) + goto found; + + 
biggest_witness = afr_find_newest_file (bufs, success_children, + child_count); + +found: + nsources = afr_mark_fool_as_source_by_idx (sources, child_count, + biggest_witness); out: - if (witnesses) - GF_FREE (witnesses); + GF_FREE (witnesses); return nsources; } int afr_mark_child_as_source_by_uid (int32_t *sources, struct iatt *bufs, - int32_t *valid_children, int child_count, - uint32_t uid) + int32_t *success_children, + unsigned int child_count, uint32_t uid) { int i = 0; int nsources = 0; int child = 0; - GF_ASSERT (bufs); - GF_ASSERT (valid_children); - GF_ASSERT (sources); - GF_ASSERT (child_count > 0); - for (i = 0; i < child_count; i++) { - if (-1 == valid_children[i]) - continue; + if (-1 == success_children[i]) + break; - child = valid_children[i]; + child = success_children[i]; if (uid == bufs[child].ia_uid) { sources[child] = 1; nsources++; @@ -516,21 +693,17 @@ afr_mark_child_as_source_by_uid (int32_t *sources, struct iatt *bufs, } int -afr_get_child_with_lowest_uid (struct iatt *bufs, int32_t *valid_children, - int child_count) +afr_get_child_with_lowest_uid (struct iatt *bufs, int32_t *success_children, + unsigned int child_count) { int i = 0; int smallest = -1; int child = 0; - GF_ASSERT (bufs); - GF_ASSERT (valid_children); - GF_ASSERT (child_count > 0); - for (i = 0; i < child_count; i++) { - if (-1 == valid_children[i]) - continue; - child = valid_children[i]; + if (-1 == success_children[i]) + break; + child = success_children[i]; if ((smallest == -1) || (bufs[child].ia_uid < bufs[smallest].ia_uid)) { smallest = child; @@ -540,25 +713,97 @@ afr_get_child_with_lowest_uid (struct iatt *bufs, int32_t *valid_children, } static int -afr_sh_mark_lowest_uid_as_source (struct iatt *bufs, int32_t *valid_children, +afr_sh_mark_lowest_uid_as_source (struct iatt *bufs, int32_t *success_children, int child_count, int32_t *sources) { int nsources = 0; int smallest = 0; - smallest = afr_get_child_with_lowest_uid (bufs, valid_children, + smallest = 
afr_get_child_with_lowest_uid (bufs, success_children, child_count); if (smallest < 0) { nsources = -1; goto out; } nsources = afr_mark_child_as_source_by_uid (sources, bufs, - valid_children, child_count, + success_children, child_count, bufs[smallest].ia_uid); out: return nsources; } +int +afr_get_no_xattr_dir_read_child (xlator_t *this, int32_t *success_children, + struct iatt *bufs) +{ + afr_private_t *priv = NULL; + int i = 0; + int child = -1; + int read_child = -1; + + priv = this->private; + for (i = 0; i < priv->child_count; i++) { + child = success_children[i]; + if (child < 0) + break; + if (read_child < 0) + read_child = child; + else if (bufs[read_child].ia_size < bufs[child].ia_size) + read_child = child; + } + return read_child; +} + +int +afr_sh_mark_zero_size_file_as_sink (struct iatt *bufs, int32_t *success_children, + int child_count, int32_t *sources) +{ + int nsources = 0; + int i = 0; + int child = 0; + gf_boolean_t sink_exists = _gf_false; + gf_boolean_t source_exists = _gf_false; + int source = -1; + + for (i = 0; i < child_count; i++) { + child = success_children[i]; + if (child < 0) + break; + if (!bufs[child].ia_size) { + sink_exists = _gf_true; + continue; + } + if (!source_exists) { + source_exists = _gf_true; + source = child; + continue; + } + if (bufs[source].ia_size != bufs[child].ia_size) { + nsources = -1; + goto out; + } + } + if (!source_exists && !sink_exists) { + nsources = -1; + goto out; + } + + if (!source_exists || !sink_exists) + goto out; + + for (i = 0; i < child_count; i++) { + child = success_children[i]; + if (child < 0) + break; + if (bufs[child].ia_size) { + sources[child] = 1; + nsources++; + } + } +out: + return nsources; +} + char * afr_get_character_str (afr_node_type type) { @@ -583,12 +828,10 @@ afr_get_character_str (afr_node_type type) afr_node_type afr_find_child_character_type (int32_t *pending_row, int32_t child, - int32_t child_count, const char *xlator_name) + unsigned int child_count) { afr_node_type 
type = AFR_NODE_INVALID; - GF_ASSERT (pending_row); - GF_ASSERT (child_count > 0); GF_ASSERT ((child >= 0) && (child < child_count)); if (afr_sh_is_innocent (pending_row, child_count)) @@ -597,44 +840,85 @@ afr_find_child_character_type (int32_t *pending_row, int32_t child, type = AFR_NODE_FOOL; else if (afr_sh_is_wise (pending_row, child, child_count)) type = AFR_NODE_WISE; - else - GF_ASSERT (0); - - gf_log (xlator_name, GF_LOG_DEBUG, "child %d character %s", - child, afr_get_character_str (type)); return type; } int afr_build_sources (xlator_t *this, dict_t **xattr, struct iatt *bufs, int32_t **pending_matrix, int32_t *sources, - int32_t *success_children, afr_transaction_type type) + int32_t *success_children, afr_transaction_type type, + int32_t *subvol_status, gf_boolean_t ignore_ignorant) { afr_private_t *priv = NULL; afr_self_heal_type sh_type = AFR_SELF_HEAL_INVALID; int nsources = -1; + unsigned char *ignorant_subvols = NULL; + unsigned int child_count = 0; priv = this->private; + child_count = priv->child_count; if (afr_get_children_count (success_children, priv->child_count) == 0) goto out; + if (!ignore_ignorant) { + ignorant_subvols = GF_CALLOC (sizeof (*ignorant_subvols), + child_count, gf_afr_mt_char); + if (NULL == ignorant_subvols) + goto out; + } + afr_build_pending_matrix (priv->pending_key, pending_matrix, - xattr, type, priv->child_count); + ignorant_subvols, xattr, type, + priv->child_count); + if (!ignore_ignorant) + afr_mark_ignorant_subvols_as_pending (pending_matrix, + ignorant_subvols, + priv->child_count); sh_type = afr_self_heal_type_for_transaction (type); if (AFR_SELF_HEAL_INVALID == sh_type) goto out; afr_sh_print_pending_matrix (pending_matrix, this); - nsources = afr_mark_sources (sources, pending_matrix, bufs, - priv->child_count, sh_type, - success_children, this->name); + nsources = afr_mark_sources (this, sources, pending_matrix, bufs, + sh_type, success_children, subvol_status); out: + GF_FREE (ignorant_subvols); return 
nsources; } +void +afr_find_character_types (afr_node_character *characters, + int32_t **pending_matrix, int32_t *success_children, + unsigned int child_count) +{ + afr_node_type type = AFR_NODE_INVALID; + int child = 0; + int i = 0; + + for (i = 0; i < child_count; i++) { + child = success_children[i]; + if (child == -1) + break; + type = afr_find_child_character_type (pending_matrix[child], + child, child_count); + characters[child].type = type; + } +} + +void +afr_mark_success_children_sources (int32_t *sources, int32_t *success_children, + unsigned int child_count) +{ + int i = 0; + for (i = 0; i < child_count; i++) { + if (success_children[i] == -1) + break; + sources[success_children[i]] = 1; + } +} /** * mark_sources: Mark all 'source' nodes and return number of source * nodes found @@ -660,17 +944,18 @@ out: */ int -afr_mark_sources (int32_t *sources, int32_t **pending_matrix, struct iatt *bufs, - int32_t child_count, afr_self_heal_type type, - int32_t *valid_children, const char *xlator_name) +afr_mark_sources (xlator_t *this, int32_t *sources, int32_t **pending_matrix, + struct iatt *bufs, afr_self_heal_type type, + int32_t *success_children, int32_t *subvol_status) { /* stores the 'characters' (innocent, fool, wise) of the nodes */ - afr_node_character *characters = NULL; - int i = 0; - int nsources = -1; - xlator_t *this = NULL; + int nsources = -1; + unsigned int child_count = 0; + afr_private_t *priv = NULL; + priv = this->private; + child_count = priv->child_count; characters = GF_CALLOC (sizeof (afr_node_character), child_count, gf_afr_mt_afr_node_character); if (!characters) @@ -679,28 +964,29 @@ afr_mark_sources (int32_t *sources, int32_t **pending_matrix, struct iatt *bufs, this = THIS; /* start clean */ - for (i = 0; i < child_count; i++) { - sources[i] = 0; - } - + memset (sources, 0, sizeof (*sources) * child_count); nsources = 0; - for (i = 0; i < child_count; i++) { - characters[i].type = - afr_find_child_character_type (pending_matrix[i], 
i, - child_count, - xlator_name); - if (AFR_NODE_INVALID == characters[i].type) - gf_log (xlator_name, GF_LOG_WARNING, - "child %d had invalid xattrs", i); - } - - if ((type == AFR_SELF_HEAL_METADATA) - && afr_sh_all_nodes_innocent (characters, child_count)) { - - nsources = afr_sh_mark_lowest_uid_as_source (bufs, - valid_children, + afr_find_character_types (characters, pending_matrix, success_children, + child_count); + if (afr_sh_all_nodes_innocent (characters, child_count)) { + switch (type) { + case AFR_SELF_HEAL_METADATA: + nsources = afr_sh_mark_lowest_uid_as_source (bufs, + success_children, + child_count, + sources); + break; + case AFR_SELF_HEAL_DATA: + nsources = afr_sh_mark_zero_size_file_as_sink (bufs, + success_children, child_count, sources); + if ((nsources < 0) && subvol_status) + *subvol_status |= SPLIT_BRAIN; + break; + default: + break; + } goto out; } @@ -708,32 +994,29 @@ afr_mark_sources (int32_t *sources, int32_t **pending_matrix, struct iatt *bufs, afr_sh_compute_wisdom (pending_matrix, characters, child_count); if (afr_sh_wise_nodes_conflict (characters, child_count)) { - /* split-brain */ - gf_log (this->name, GF_LOG_INFO, - "split-brain possible, no source detected"); + if (subvol_status) + *subvol_status |= SPLIT_BRAIN; nsources = -1; - } else { nsources = afr_sh_mark_wisest_as_sources (sources, characters, child_count); } } else { + if (subvol_status) + *subvol_status |= ALL_FOOLS; nsources = afr_mark_biggest_of_fools_as_source (sources, pending_matrix, characters, - child_count); + success_children, + child_count, bufs); } out: - if (nsources == 0) { - for (i = 0; i < child_count; i++) { - if (valid_children[i] != -1) - sources[valid_children[i]] = 1; - } - } - if (characters) - GF_FREE (characters); + if (nsources == 0) + afr_mark_success_children_sources (sources, success_children, + child_count); + GF_FREE (characters); gf_log (this->name, GF_LOG_DEBUG, "Number of sources: %d", nsources); return nsources; @@ -744,81 +1027,108 @@ 
afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr, int32_t *delta_matrix[], unsigned char success[], int child_count, afr_transaction_type type) { - /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */ - int32_t pending[3] = {0,}; - void *pending_raw = NULL; - int ret = 0; - int i = 0; - int j = 0; - int k = 0; - - /* start clean */ - for (i = 0; i < child_count; i++) { - for (j = 0; j < child_count; j++) { - delta_matrix[i][j] = 0; - } - } - - for (i = 0; i < child_count; i++) { - if (pending_raw) - pending_raw = NULL; + int tgt = 0; + int src = 0; + int value = 0; - for (j = 0; j < child_count; j++) { - ret = dict_get_ptr (xattr[i], priv->pending_key[j], - &pending_raw); - if (ret < 0) - gf_log (THIS->name, GF_LOG_DEBUG, - "Unable to get dict value."); - if (!success[j]) - continue; - - k = afr_index_for_transaction_type (type); + afr_build_pending_matrix (priv->pending_key, delta_matrix, NULL, + xattr, type, priv->child_count); - if (pending_raw != NULL) { - memcpy (pending, pending_raw, sizeof(pending)); - delta_matrix[i][j] = -(ntoh32 (pending[k])); - } else { - delta_matrix[i][j] = 0; + /* + * The algorithm here has two parts. First, for each subvol indexed + * as tgt, we try to figure out what count everyone should have for it. + * If the self-heal succeeded, that's easy; the value is zero. + * Otherwise, the value is the maximum of the succeeding nodes' counts. + * Once we know the value, we loop through (possibly for a second time) + * setting each count to the difference so that when we're done all + * succeeding nodes will have the same count for tgt. + */ + for (tgt = 0; tgt < priv->child_count; ++tgt) { + value = 0; + if (!success[tgt]) { + /* Find the maximum. */ + for (src = 0; src < priv->child_count; ++src) { + if (!success[src]) { + continue; + } + if (delta_matrix[src][tgt] > value) { + value = delta_matrix[src][tgt]; + } + } + } + /* Force everyone who succeeded to the chosen value. 
*/ + for (src = 0; src < priv->child_count; ++src) { + if (success[src]) { + delta_matrix[src][tgt] = value + - delta_matrix[src][tgt]; + } + else { + delta_matrix[src][tgt] = 0; } - } } } int -afr_sh_delta_to_xattr (afr_private_t *priv, +afr_sh_delta_to_xattr (xlator_t *this, int32_t *delta_matrix[], dict_t *xattr[], int child_count, afr_transaction_type type) { - int i = 0; - int j = 0; - int k = 0; - int ret = 0; - int32_t *pending = NULL; + int i = 0; + int j = 0; + int k = 0; + int ret = 0; + int32_t *pending = NULL; + int32_t *local_pending = NULL; + afr_private_t *priv = NULL; + priv = this->private; for (i = 0; i < child_count; i++) { if (!xattr[i]) continue; + local_pending = NULL; for (j = 0; j < child_count; j++) { pending = GF_CALLOC (sizeof (int32_t), 3, gf_afr_mt_int32_t); - if (!pending) + if (!pending) { + gf_log (this->name, GF_LOG_ERROR, + "failed to allocate pending entry " + "for %s[%d] on %s", + priv->pending_key[j], type, + priv->children[i]->name); continue; + } /* 3 = data+metadata+entry */ k = afr_index_for_transaction_type (type); pending[k] = hton32 (delta_matrix[i][j]); + if (j == i) { + local_pending = pending; + continue; + } ret = dict_set_bin (xattr[i], priv->pending_key[j], pending, - 3 * sizeof (int32_t)); - if (ret < 0) - gf_log (THIS->name, GF_LOG_WARNING, + AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, "Unable to set dict value."); + GF_FREE (pending); + } + } + if (local_pending) { + ret = dict_set_bin (xattr[i], priv->pending_key[i], + local_pending, + AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "Unable to set dict value."); + GF_FREE (local_pending); + } } } return 0; @@ -826,146 +1136,23 @@ afr_sh_delta_to_xattr (afr_private_t *priv, int -afr_sh_has_metadata_pending (dict_t *xattr, xlator_t *this) -{ - /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. 
*/ - int32_t pending[3] = {0,}; - void *pending_raw = NULL; - afr_private_t *priv = NULL; - int ret = -1; - int i = 0; - int j = 0; - - priv = this->private; - - for (i = 0; i < priv->child_count; i++) { - ret = dict_get_ptr (xattr, priv->pending_key[i], - &pending_raw); - - if (ret != 0) - return 0; - - memcpy (pending, pending_raw, sizeof(pending)); - j = afr_index_for_transaction_type (AFR_METADATA_TRANSACTION); - - if (pending[j]) - return 1; - } - - return 0; -} - - -int -afr_sh_has_data_pending (dict_t *xattr, xlator_t *this) -{ - /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */ - int32_t pending[3] = {0,}; - void *pending_raw = NULL; - afr_private_t *priv = NULL; - int ret = -1; - int i = 0; - int j = 0; - - priv = this->private; - - for (i = 0; i < priv->child_count; i++) { - ret = dict_get_ptr (xattr, priv->pending_key[i], - &pending_raw); - - if (ret != 0) - return 0; - - memcpy (pending, pending_raw, sizeof(pending)); - j = afr_index_for_transaction_type (AFR_DATA_TRANSACTION); - - if (pending[j]) - return 1; - } - - return 0; -} - - -int -afr_sh_has_entry_pending (dict_t *xattr, xlator_t *this) -{ - /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. 
*/ - int32_t pending[3] = {0,}; - void *pending_raw = NULL; - afr_private_t *priv = NULL; - int ret = -1; - int i = 0; - int j = 0; - - priv = this->private; - - for (i = 0; i < priv->child_count; i++) { - ret = dict_get_ptr (xattr, priv->pending_key[i], - &pending_raw); - - if (ret != 0) - return 0; - - memcpy (pending, pending_raw, sizeof(pending)); - j = afr_index_for_transaction_type (AFR_ENTRY_TRANSACTION); - - if (pending[j]) - return 1; - } - - return 0; -} - - -/** - * is_matrix_zero - return true if pending matrix is all zeroes - */ - -int -afr_sh_is_matrix_zero (int32_t *pending_matrix[], int child_count) -{ - int i = 0; - int j = 0; - - for (i = 0; i < child_count; i++) - for (j = 0; j < child_count; j++) - if (pending_matrix[i][j]) - return 0; - return 1; -} - - -int afr_sh_missing_entries_done (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int i = 0; local = frame->local; sh = &local->self_heal; - priv = this->private; - -// memset (sh->child_errno, 0, sizeof (int) * priv->child_count); - memset (sh->buf, 0, sizeof (struct iatt) * priv->child_count); - for (i = 0; i < priv->child_count; i++) { - sh->locked_nodes[i] = 0; - } - - for (i = 0; i < priv->child_count; i++) { - if (sh->xattr[i]) - dict_unref (sh->xattr[i]); - sh->xattr[i] = NULL; - } + afr_sh_reset (frame, this); - if (local->govinda_gOvinda || sh->op_failed) { - gf_log (this->name, GF_LOG_INFO, + if (local->unhealable) { + gf_log (this->name, GF_LOG_DEBUG, "split brain found, aborting selfheal of %s", local->loc.path); - sh->op_failed = 1; + } + + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { sh->completion_cbk (frame, this); } else { gf_log (this->name, GF_LOG_TRACE, @@ -1051,7 +1238,7 @@ afr_sh_common_lookup_resp_handler (call_frame_t *frame, void *cookie, sh->success_count++; sh->xattr[child_index] = dict_ref (xattr); } else { - gf_log (this->name, GF_LOG_ERROR, "path %s on subvolume" + gf_log 
(this->name, GF_LOG_DEBUG, "path %s on subvolume" " %s => -1 (%s)", loc->path, priv->children[child_index]->name, strerror (op_errno)); @@ -1082,8 +1269,7 @@ afr_valid_ia_type (ia_type_t ia_type) int afr_impunge_frame_create (call_frame_t *frame, xlator_t *this, - int active_source, int ret_child, mode_t entry_mode, - call_frame_t **impunge_frame) + int active_source, call_frame_t **impunge_frame) { afr_local_t *local = NULL; afr_local_t *impunge_local = NULL; @@ -1100,21 +1286,24 @@ afr_impunge_frame_create (call_frame_t *frame, xlator_t *this, goto out; } - ALLOC_OR_GOTO (impunge_local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (impunge_local, out); local = frame->local; new_frame->local = impunge_local; impunge_sh = &impunge_local->self_heal; impunge_sh->sh_frame = frame; impunge_sh->active_source = active_source; - impunge_sh->impunge_ret_child = ret_child; - impunge_sh->impunging_entry_mode = entry_mode; impunge_local->child_up = memdup (local->child_up, sizeof (*local->child_up) * priv->child_count); if (!impunge_local->child_up) goto out; + impunge_local->pending = afr_matrix_create (priv->child_count, + AFR_NUM_CHANGE_LOGS); + if (!impunge_local->pending) + goto out; + ret = afr_sh_common_create (impunge_sh, priv->child_count); if (ret) { op_errno = -ret; @@ -1129,54 +1318,89 @@ out: } void -afr_sh_call_entry_impunge_recreate (call_frame_t *frame, xlator_t *this, - int child_index, struct iatt *buf, - struct iatt *postparent, - afr_impunge_done_cbk_t impunge_done) +afr_sh_missing_entry_call_impunge_recreate (call_frame_t *frame, xlator_t *this, + struct iatt *buf, + struct iatt *postparent, + afr_impunge_done_cbk_t impunge_done) { call_frame_t *impunge_frame = NULL; afr_local_t *local = NULL; afr_local_t *impunge_local = NULL; afr_self_heal_t *sh = NULL; + afr_self_heal_t *impunge_sh = NULL; int ret = 0; - mode_t mode = 0; + unsigned int enoent_count = 0; + afr_private_t *priv = NULL; + int i = 0; + int32_t op_errno = 0; local = frame->local; sh = 
&local->self_heal; - mode = st_mode_from_ia (buf->ia_prot, buf->ia_type); - ret = afr_impunge_frame_create (frame, this, sh->source, child_index, - mode, &impunge_frame); + priv = this->private; + + enoent_count = afr_errno_count (NULL, sh->child_errno, + priv->child_count, ENOENT); + if (!enoent_count) { + gf_log (this->name, GF_LOG_INFO, + "no missing files - %s. proceeding to metadata check", + local->loc.path); + goto out; + } + sh->impunge_done = impunge_done; + ret = afr_impunge_frame_create (frame, this, sh->source, &impunge_frame); if (ret) goto out; impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; loc_copy (&impunge_local->loc, &local->loc); - sh->impunge_done = impunge_done; - impunge_local->call_count = 1; - afr_sh_entry_impunge_create (impunge_frame, this, child_index, buf, - postparent); + ret = afr_build_parent_loc (&impunge_sh->parent_loc, + &impunge_local->loc, &op_errno); + if (ret) { + ret = -op_errno; + goto out; + } + impunge_local->call_count = enoent_count; + impunge_sh->entrybuf = sh->buf[sh->source]; + impunge_sh->parentbuf = sh->parentbufs[sh->source]; + for (i = 0; i < priv->child_count; i++) { + if (!impunge_local->child_up[i]) { + impunge_sh->child_errno[i] = ENOTCONN; + continue; + } + if (sh->child_errno[i] != ENOENT) { + impunge_sh->child_errno[i] = EEXIST; + continue; + } + } + for (i = 0; i < priv->child_count; i++) { + if (sh->child_errno[i] != ENOENT) + continue; + afr_sh_entry_impunge_create (impunge_frame, this, i); + enoent_count--; + } + GF_ASSERT (!enoent_count); return; out: - gf_log (this->name, GF_LOG_ERROR, "impunge of %s failed, reason: %s", - local->loc.path, strerror (-ret)); - impunge_done (frame, this, child_index, -1, -ret); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "impunge of %s failed, " + "reason: %s", local->loc.path, strerror (-ret)); + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + } + afr_sh_missing_entries_finish (frame, this); } int -afr_sh_create_entry_cbk 
(call_frame_t *frame, xlator_t *this, int child, +afr_sh_create_entry_cbk (call_frame_t *frame, xlator_t *this, int32_t op_ret, int32_t op_errno) { - int call_count = 0; afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; local = frame->local; - - if (op_ret == -1) - gf_log (this->name, GF_LOG_ERROR, - "create entry %s failed, on child %d reason, %s", - local->loc.path, child, strerror (op_errno)); - call_count = afr_frame_return (frame); - if (call_count == 0) - afr_sh_missing_entries_finish (frame, this); + sh = &local->self_heal; + if (op_ret < 0) + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + afr_sh_missing_entries_finish (frame, this); return 0; } @@ -1186,26 +1410,11 @@ sh_missing_entries_create (call_frame_t *frame, xlator_t *this) afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; int type = 0; - afr_private_t *priv = NULL; - int enoent_count = 0; - int i = 0; struct iatt *buf = NULL; struct iatt *postparent = NULL; local = frame->local; sh = &local->self_heal; - priv = this->private; - - enoent_count = afr_errno_count (NULL, sh->child_errno, - priv->child_count, ENOENT); - if (enoent_count == 0) { - gf_log (this->name, GF_LOG_INFO, - "no missing files - %s. 
proceeding to metadata check", - local->loc.path); - /* proceed to next step - metadata self-heal */ - afr_sh_missing_entries_finish (frame, this); - return 0; - } buf = &sh->buf[sh->source]; postparent = &sh->parentbufs[sh->source]; @@ -1214,22 +1423,14 @@ sh_missing_entries_create (call_frame_t *frame, xlator_t *this) if (!afr_valid_ia_type (type)) { gf_log (this->name, GF_LOG_ERROR, "%s: unknown file type: 0%o", local->loc.path, type); - local->govinda_gOvinda = 1; + afr_set_local_for_unhealable (local); afr_sh_missing_entries_finish (frame, this); goto out; } - local->call_count = enoent_count; - for (i = 0; i < priv->child_count; i++) { - //If !child_up errno will be zero - if (sh->child_errno[i] != ENOENT) - continue; - afr_sh_call_entry_impunge_recreate (frame, this, i, + afr_sh_missing_entry_call_impunge_recreate (frame, this, buf, postparent, afr_sh_create_entry_cbk); - enoent_count--; - } - GF_ASSERT (enoent_count == 0); out: return 0; } @@ -1243,30 +1444,59 @@ afr_sh_missing_entries_lookup_done (call_frame_t *frame, xlator_t *this, afr_private_t *priv = NULL; ia_type_t ia_type = IA_INVAL; int32_t nsources = 0; + loc_t *loc = NULL; + int32_t subvol_status = 0; + afr_transaction_type txn_type = AFR_DATA_TRANSACTION; + gf_boolean_t split_brain = _gf_false; + int read_child = -1; local = frame->local; sh = &local->self_heal; priv = this->private; + loc = &local->loc; if (op_ret < 0) { - if (op_errno == EIO) - local->govinda_gOvinda = 1; + if (op_errno == EIO) { + afr_set_local_for_unhealable (local); + } // EIO can happen if finding the fresh parent dir failed goto out; } //now No chance for the ia_type to conflict ia_type = sh->buf[sh->success_children[0]].ia_type; + txn_type = afr_transaction_type_get (ia_type); nsources = afr_build_sources (this, sh->xattr, sh->buf, sh->pending_matrix, sh->sources, - sh->success_children, - afr_transaction_type_get (ia_type)); + sh->success_children, txn_type, + &subvol_status, _gf_false); if (nsources < 0) { gf_log 
(this->name, GF_LOG_INFO, "No sources for dir of %s," " in missing entry self-heal, continuing with the rest" " of the self-heals", local->loc.path); - op_errno = EIO; - goto out; + if (subvol_status & SPLIT_BRAIN) { + split_brain = _gf_true; + switch (txn_type) { + case AFR_DATA_TRANSACTION: + nsources = 1; + sh->sources[sh->success_children[0]] = 1; + break; + case AFR_ENTRY_TRANSACTION: + read_child = afr_get_no_xattr_dir_read_child + (this, + sh->success_children, + sh->buf); + sh->sources[read_child] = 1; + nsources = 1; + break; + default: + op_errno = EIO; + goto out; + } + } else { + op_errno = EIO; + goto out; + } } afr_get_fresh_children (sh->success_children, sh->sources, @@ -1281,10 +1511,16 @@ afr_sh_missing_entries_lookup_done (call_frame_t *frame, xlator_t *this, if (sh->gfid_sh_success_cbk) sh->gfid_sh_success_cbk (frame, this); sh->type = sh->buf[sh->source].ia_type; - sh_missing_entries_create (frame, this); + if (uuid_is_null (loc->inode->gfid)) + uuid_copy (loc->gfid, sh->buf[sh->source].ia_gfid); + if (split_brain) { + afr_sh_missing_entries_finish (frame, this); + } else { + sh_missing_entries_create (frame, this); + } return; out: - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_set_error (sh, op_errno); afr_sh_missing_entries_finish (frame, this); return; @@ -1368,7 +1604,7 @@ afr_sh_remove_entry_cbk (call_frame_t *frame, xlator_t *this, int child, LOCK (&frame->lock); { afr_sh_set_error (sh, EIO); - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); } UNLOCK (&frame->lock); } @@ -1381,6 +1617,7 @@ afr_sh_remove_entry_cbk (call_frame_t *frame, xlator_t *this, int child, void afr_sh_call_entry_expunge_remove (call_frame_t *frame, xlator_t *this, int child_index, struct iatt *buf, + struct iatt *parentbuf, afr_expunge_done_cbk_t expunge_done) { call_frame_t *expunge_frame = NULL; @@ -1389,13 +1626,14 @@ afr_sh_call_entry_expunge_remove (call_frame_t *frame, xlator_t *this, 
afr_self_heal_t *sh = NULL; afr_self_heal_t *expunge_sh = NULL; int32_t op_errno = 0; + int ret = 0; expunge_frame = copy_frame (frame); if (!expunge_frame) { goto out; } - ALLOC_OR_GOTO (expunge_local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (expunge_local, out); local = frame->local; sh = &local->self_heal; @@ -1403,8 +1641,15 @@ afr_sh_call_entry_expunge_remove (call_frame_t *frame, xlator_t *this, expunge_sh = &expunge_local->self_heal; expunge_sh->sh_frame = frame; loc_copy (&expunge_local->loc, &local->loc); + ret = afr_build_parent_loc (&expunge_sh->parent_loc, + &expunge_local->loc, &op_errno); + if (ret) { + ret = -op_errno; + goto out; + } sh->expunge_done = expunge_done; - afr_sh_entry_expunge_remove (expunge_frame, this, child_index, buf); + afr_sh_entry_expunge_remove (expunge_frame, this, child_index, buf, + parentbuf); return; out: gf_log (this->name, GF_LOG_ERROR, "Expunge of %s failed, reason: %s", @@ -1441,7 +1686,7 @@ afr_sh_purge_stale_entries_done (call_frame_t *frame, xlator_t *this) sh = &local->self_heal; priv = this->private; - if (sh->op_failed) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { afr_sh_missing_entries_finish (frame, this); } else { if (afr_gfid_missing_count (this->name, sh->fresh_children, @@ -1451,7 +1696,8 @@ afr_sh_purge_stale_entries_done (call_frame_t *frame, xlator_t *this) afr_sh_missing_entries_lookup_done, sh->sh_gfid_req, AFR_LOOKUP_FAIL_CONFLICTS| - AFR_LOOKUP_FAIL_MISSING_GFIDS); + AFR_LOOKUP_FAIL_MISSING_GFIDS, + NULL); } else { //No need to set gfid so goto missing entries lookup done //Behave as if you have done the lookup @@ -1533,9 +1779,10 @@ afr_sh_purge_entry_common (call_frame_t *frame, xlator_t *this, if (!purge_condition (local, priv, i)) continue; gf_log (this->name, GF_LOG_INFO, "purging the stale entry %s " - "on %d", local->loc.path, i); + "on %s", local->loc.path, priv->children[i]->name); afr_sh_call_entry_expunge_remove (frame, this, (long) i, &sh->buf[i], + &sh->parentbufs[i], 
afr_sh_remove_entry_cbk); } out: @@ -1652,10 +1899,8 @@ afr_sh_children_lookup_done (call_frame_t *frame, xlator_t *this, sh->child_errno, priv->child_count, ENOENT); if (fresh_child_enoents == fresh_parent_count) { - gf_log (this->name, GF_LOG_INFO, "Deleting stale file %s", - local->loc.path); afr_sh_set_error (sh, ENOENT); - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_purge_entry (frame, this); } else if (!afr_conflicting_iattrs (sh->buf, sh->fresh_children, priv->child_count, local->loc.path, @@ -1669,14 +1914,14 @@ afr_sh_children_lookup_done (call_frame_t *frame, xlator_t *this, afr_sh_purge_stale_entry (frame, this); } else { op_errno = EIO; - local->govinda_gOvinda = 1; + afr_set_local_for_unhealable (local); goto fail; } return; fail: - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_set_error (sh, op_errno); afr_sh_missing_entries_finish (frame, this); return; @@ -1692,40 +1937,42 @@ afr_sh_find_fresh_parents (call_frame_t *frame, xlator_t *this, int enoent_count = 0; int nsources = 0; int source = -1; + int32_t subvol_status = 0; local = frame->local; sh = &local->self_heal; priv = this->private; - /* If We can't find a fresh parent directory here, - * we wont know which subvol is correct without finding a parent dir - * upwards which has correct xattrs, for that we may have to - * do lookups till root, we dont wanna do that, - * instead make sure that if there are conflicting gfid - * parent dirs, self-heal thus lookup is failed with EIO. - * if there are missing entries we dont know whether to delete or - * create so fail with EIO, - * If there are conflicting xattr fail with EIO. 
- */ if (op_ret < 0) goto out; enoent_count = afr_errno_count (NULL, sh->child_errno, priv->child_count, ENOENT); if (enoent_count > 0) { - gf_log (this->name, GF_LOG_ERROR, "Parent dir missing for %s," - " in missing entry self-heal, aborting self-heal", + gf_log (this->name, GF_LOG_INFO, "Parent dir missing for %s," + " in missing entry self-heal, aborting missing-entry " + "self-heal", local->loc.path); - goto out; + afr_sh_missing_entries_finish (frame, this); + return; } nsources = afr_build_sources (this, sh->xattr, sh->buf, sh->pending_matrix, sh->sources, sh->success_children, - AFR_ENTRY_TRANSACTION); - if (nsources < 0) { - gf_log (this->name, GF_LOG_ERROR, "No sources for dir of %s," - " in missing entry self-heal, aborting self-heal", - local->loc.path); + AFR_ENTRY_TRANSACTION, &subvol_status, + _gf_true); + if ((subvol_status & ALL_FOOLS) || + (subvol_status & SPLIT_BRAIN)) { + gf_log (this->name, GF_LOG_INFO, "%s: Performing conservative " + "merge", sh->parent_loc.path); + afr_mark_success_children_sources (sh->sources, + sh->success_children, + priv->child_count); + } else if (nsources < 0) { + gf_log (this->name, GF_LOG_ERROR, "No sources for dir " + "of %s, in missing entry self-heal, aborting " + "self-heal", local->loc.path); + op_errno = EIO; goto out; } @@ -1733,18 +1980,20 @@ afr_sh_find_fresh_parents (call_frame_t *frame, xlator_t *this, if (source == -1) { GF_ASSERT (0); gf_log (this->name, GF_LOG_DEBUG, "No active sources found."); + op_errno = EIO; goto out; } afr_get_fresh_children (sh->success_children, sh->sources, sh->fresh_parent_dirs, priv->child_count); afr_sh_common_lookup (frame, this, &local->loc, - afr_sh_children_lookup_done, NULL, 0); + afr_sh_children_lookup_done, NULL, 0, + NULL); return; out: - afr_sh_set_error (sh, EIO); - sh->op_failed = 1; - afr_sh_missing_entries_finish (frame, this); + afr_sh_set_error (sh, op_errno); + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + afr_sh_missing_entries_finish (frame, 
this); return; } @@ -1772,7 +2021,7 @@ afr_sh_common_reset (afr_self_heal_t *sh, unsigned int child_count) int afr_sh_common_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, afr_lookup_done_cbk_t lookup_done , uuid_t gfid, - int32_t flags) + int32_t flags, dict_t *xdata) { afr_local_t *local = NULL; int i = 0; @@ -1833,7 +2082,8 @@ afr_sh_common_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, int -afr_sh_post_nb_entrylk_conflicting_sh_cbk (call_frame_t *frame, xlator_t *this) +afr_sh_post_nb_entrylk_missing_entry_sh_cbk (call_frame_t *frame, + xlator_t *this) { afr_internal_lock_t *int_lock = NULL; afr_local_t *local = NULL; @@ -1846,6 +2096,7 @@ afr_sh_post_nb_entrylk_conflicting_sh_cbk (call_frame_t *frame, xlator_t *this) if (int_lock->lock_op_ret < 0) { gf_log (this->name, GF_LOG_INFO, "Non blocking entrylks failed."); + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_missing_entries_done (frame, this); } else { @@ -1853,34 +2104,8 @@ afr_sh_post_nb_entrylk_conflicting_sh_cbk (call_frame_t *frame, xlator_t *this) "Non blocking entrylks done. Proceeding to FOP"); afr_sh_common_lookup (frame, this, &sh->parent_loc, afr_sh_find_fresh_parents, - NULL, AFR_LOOKUP_FAIL_CONFLICTS); - } - - return 0; -} - -int -afr_sh_post_nb_entrylk_gfid_sh_cbk (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - int_lock = &local->internal_lock; - - if (int_lock->lock_op_ret < 0) { - gf_log (this->name, GF_LOG_INFO, - "Non blocking entrylks failed."); - afr_sh_missing_entries_done (frame, this); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "Non blocking entrylks done. 
Proceeding to FOP"); - afr_sh_common_lookup (frame, this, &local->loc, - afr_sh_missing_entries_lookup_done, - sh->sh_gfid_req, AFR_LOOKUP_FAIL_CONFLICTS| - AFR_LOOKUP_FAIL_MISSING_GFIDS); + NULL, AFR_LOOKUP_FAIL_CONFLICTS, + NULL); } return 0; @@ -1892,7 +2117,9 @@ afr_sh_entrylk (call_frame_t *frame, xlator_t *this, loc_t *loc, { afr_internal_lock_t *int_lock = NULL; afr_local_t *local = NULL; + afr_private_t *priv = NULL; + priv = this->private; local = frame->local; int_lock = &local->internal_lock; @@ -1904,7 +2131,12 @@ afr_sh_entrylk (call_frame_t *frame, xlator_t *this, loc_t *loc, int_lock->lk_basename = base_name; int_lock->lk_loc = loc; int_lock->lock_cbk = lock_cbk; + int_lock->domain = this->name; + int_lock->lockee_count = 0; + afr_init_entry_lockee (&int_lock->lockee[0], local, loc, + base_name, priv->child_count); + int_lock->lockee_count++; afr_nonblocking_entrylk (frame, this); return 0; @@ -1916,6 +2148,9 @@ afr_self_heal_parent_entrylk (call_frame_t *frame, xlator_t *this, { afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; + afr_internal_lock_t *int_lock = NULL; + int ret = -1; + int32_t op_errno = 0; local = frame->local; sh = &local->self_heal; @@ -1924,43 +2159,52 @@ afr_self_heal_parent_entrylk (call_frame_t *frame, xlator_t *this, "attempting to recreate missing entries for path=%s", local->loc.path); - GF_ASSERT (local->loc.parent); - afr_build_parent_loc (&sh->parent_loc, &local->loc); + ret = afr_build_parent_loc (&sh->parent_loc, &local->loc, &op_errno); + if (ret) + goto out; afr_sh_entrylk (frame, this, &sh->parent_loc, NULL, lock_cbk); return 0; -} - -static int -afr_self_heal_conflicting_entries (call_frame_t *frame, xlator_t *this) -{ - afr_self_heal_parent_entrylk (frame, this, - afr_sh_post_nb_entrylk_conflicting_sh_cbk); +out: + int_lock = &local->internal_lock; + int_lock->lock_op_ret = -1; + lock_cbk (frame, this); return 0; } static int -afr_self_heal_gfids (call_frame_t *frame, xlator_t *this) 
+afr_self_heal_missing_entries (call_frame_t *frame, xlator_t *this) { + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + + local = frame->local; + sh = &local->self_heal; + + sh->sh_type_in_action = AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY; + + afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED); + afr_self_heal_parent_entrylk (frame, this, - afr_sh_post_nb_entrylk_gfid_sh_cbk); + afr_sh_post_nb_entrylk_missing_entry_sh_cbk); return 0; } -afr_local_t *afr_local_copy (afr_local_t *l, xlator_t *this) +afr_local_t* +afr_self_heal_local_init (afr_local_t *l, xlator_t *this) { - afr_private_t *priv = NULL; - afr_local_t *lc = NULL; - afr_self_heal_t *sh = NULL; - afr_self_heal_t *shc = NULL; + afr_private_t *priv = NULL; + afr_local_t *lc = NULL; + afr_self_heal_t *sh = NULL; + afr_self_heal_t *shc = NULL; + int ret = 0; priv = this->private; sh = &l->self_heal; - lc = GF_CALLOC (1, sizeof (afr_local_t), - gf_afr_mt_afr_local_t); + lc = mem_get0 (this->local_pool); if (!lc) goto out; @@ -1973,16 +2217,27 @@ afr_local_t *afr_local_copy (afr_local_t *l, xlator_t *this) shc->do_data_self_heal = sh->do_data_self_heal; shc->do_metadata_self_heal = sh->do_metadata_self_heal; shc->do_entry_self_heal = sh->do_entry_self_heal; + shc->force_confirm_spb = sh->force_confirm_spb; shc->forced_merge = sh->forced_merge; shc->background = sh->background; shc->type = sh->type; + shc->data_sh_info = ""; + shc->metadata_sh_info = ""; uuid_copy (shc->sh_gfid_req, sh->sh_gfid_req); - if (l->loc.path) - loc_copy (&lc->loc, &l->loc); + if (l->loc.path) { + ret = loc_copy (&lc->loc, &l->loc); + if (ret < 0) + goto out; + } lc->child_up = memdup (l->child_up, sizeof (*lc->child_up) * priv->child_count); + if (!lc->child_up) { + ret = -1; + goto out; + } + if (l->xattr_req) lc->xattr_req = dict_ref (l->xattr_req); @@ -1990,40 +2245,25 @@ afr_local_t *afr_local_copy (afr_local_t *l, xlator_t *this) lc->cont.lookup.inode = inode_ref (l->cont.lookup.inode); if (l->cont.lookup.xattr) 
lc->cont.lookup.xattr = dict_ref (l->cont.lookup.xattr); - if (l->internal_lock.inode_locked_nodes) - lc->internal_lock.inode_locked_nodes = - memdup (l->internal_lock.inode_locked_nodes, - sizeof (*lc->internal_lock.inode_locked_nodes) * priv->child_count); - else - lc->internal_lock.inode_locked_nodes = - GF_CALLOC (sizeof (*l->internal_lock.inode_locked_nodes), - priv->child_count, - gf_afr_mt_char); - if (l->internal_lock.entry_locked_nodes) - lc->internal_lock.entry_locked_nodes = - memdup (l->internal_lock.entry_locked_nodes, - sizeof (*lc->internal_lock.entry_locked_nodes) * priv->child_count); - else - lc->internal_lock.entry_locked_nodes = - GF_CALLOC (sizeof (*l->internal_lock.entry_locked_nodes), - priv->child_count, - gf_afr_mt_char); - if (l->internal_lock.locked_nodes) - lc->internal_lock.locked_nodes = - memdup (l->internal_lock.locked_nodes, - sizeof (*lc->internal_lock.locked_nodes) * priv->child_count); - else - lc->internal_lock.locked_nodes = - GF_CALLOC (sizeof (*l->internal_lock.locked_nodes), - priv->child_count, - gf_afr_mt_char); - lc->internal_lock.inodelk_lock_count = - l->internal_lock.inodelk_lock_count; - lc->internal_lock.entrylk_lock_count = - l->internal_lock.entrylk_lock_count; + lc->internal_lock.locked_nodes = + GF_CALLOC (sizeof (*l->internal_lock.locked_nodes), + priv->child_count, gf_afr_mt_char); + if (!lc->internal_lock.locked_nodes) { + ret = -1; + goto out; + } + + ret = afr_inodelk_init (&lc->internal_lock.inodelk[0], + this->name, priv->child_count); + if (ret) + goto out; out: + if (ret) { + afr_local_cleanup (lc, this); + lc = NULL; + } return lc; } @@ -2033,32 +2273,39 @@ afr_self_heal_completion_cbk (call_frame_t *bgsh_frame, xlator_t *this) afr_private_t * priv = NULL; afr_local_t * local = NULL; afr_self_heal_t * sh = NULL; + afr_local_t * orig_frame_local = NULL; + afr_self_heal_t * orig_frame_sh = NULL; char sh_type_str[256] = {0,}; - gf_boolean_t split_brain = _gf_false; + gf_loglevel_t loglevel = 0; priv = 
this->private; local = bgsh_frame->local; sh = &local->self_heal; - if (local->govinda_gOvinda) - split_brain = _gf_true; - - afr_set_split_brain (this, sh->inode, split_brain); + if (local->unhealable) { + afr_set_split_brain (this, sh->inode, SPB, SPB); + } afr_self_heal_type_str_get (sh, sh_type_str, sizeof(sh_type_str)); - if (sh->op_failed) { - gf_log (this->name, GF_LOG_ERROR, "background %s self-heal " - "failed on %s", sh_type_str, local->loc.path); + if (is_self_heal_failed (sh, AFR_CHECK_ALL) && !priv->shd.iamshd) { + loglevel = GF_LOG_ERROR; + } else if (!is_self_heal_failed (sh, AFR_CHECK_ALL)) { + loglevel = GF_LOG_INFO; } else { - gf_log (this->name, GF_LOG_INFO, "background %s self-heal " - "completed on %s", sh_type_str, local->loc.path); + loglevel = GF_LOG_DEBUG; } + afr_log_self_heal_completion_status (local, loglevel); + FRAME_SU_UNDO (bgsh_frame, afr_local_t); if (!sh->unwound && sh->unwind) { - sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno); + orig_frame_local = sh->orig_frame->local; + orig_frame_sh = &orig_frame_local->self_heal; + orig_frame_sh->actual_sh_started = _gf_true; + sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno, + is_self_heal_failed (sh, AFR_CHECK_ALL)); } if (sh->background) { @@ -2080,13 +2327,12 @@ afr_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode) afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; afr_private_t *priv = NULL; - int i = 0; int32_t op_errno = 0; int ret = 0; afr_self_heal_t *orig_sh = NULL; - - call_frame_t *sh_frame = NULL; - afr_local_t *sh_local = NULL; + call_frame_t *sh_frame = NULL; + afr_local_t *sh_local = NULL; + loc_t *loc = NULL; local = frame->local; orig_sh = &local->self_heal; @@ -2101,21 +2347,20 @@ afr_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode) local->self_heal.do_data_self_heal, local->self_heal.do_entry_self_heal); - op_errno = ENOMEM; + op_errno = ENOMEM; sh_frame = copy_frame (frame); if (!sh_frame) goto out; - 
afr_set_lk_owner (sh_frame, this); + afr_set_lk_owner (sh_frame, this, sh_frame->root); afr_set_low_priority (sh_frame); - sh_local = afr_local_copy (local, this); + sh_local = afr_self_heal_local_init (local, this); if (!sh_local) goto out; sh_frame->local = sh_local; sh = &sh_local->self_heal; sh->inode = inode_ref (inode); - sh->orig_frame = frame; sh->completion_cbk = afr_self_heal_completion_cbk; @@ -2134,30 +2379,16 @@ afr_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode) if (!sh->locked_nodes) goto out; - sh->pending_matrix = GF_CALLOC (sizeof (int32_t *), priv->child_count, - gf_afr_mt_int32_t); + sh->pending_matrix = afr_matrix_create (priv->child_count, + priv->child_count); if (!sh->pending_matrix) goto out; - for (i = 0; i < priv->child_count; i++) { - sh->pending_matrix[i] = GF_CALLOC (sizeof (int32_t), - priv->child_count, - gf_afr_mt_int32_t); - if (!sh->pending_matrix[i]) - goto out; - } - - sh->delta_matrix = GF_CALLOC (sizeof (int32_t *), priv->child_count, - gf_afr_mt_int32_t); + sh->delta_matrix = afr_matrix_create (priv->child_count, + priv->child_count); if (!sh->delta_matrix) goto out; - for (i = 0; i < priv->child_count; i++) { - sh->delta_matrix[i] = GF_CALLOC (sizeof (int32_t), - priv->child_count, - gf_afr_mt_int32_t); - if (!sh->delta_matrix) - goto out; - } + sh->fresh_parent_dirs = afr_children_create (priv->child_count); if (!sh->fresh_parent_dirs) goto out; @@ -2177,18 +2408,30 @@ afr_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode) } else { local->self_heal.background = _gf_false; + sh->background = _gf_false; } } UNLOCK (&priv->lock); } + if (!local->loc.parent) { + sh->do_missing_entry_self_heal = _gf_false; + sh->do_gfid_self_heal = _gf_false; + } + + sh->sh_type_in_action = AFR_SELF_HEAL_INVALID; + FRAME_SU_DO (sh_frame, afr_local_t); - if (sh->do_missing_entry_self_heal) { - afr_self_heal_conflicting_entries (sh_frame, this); - } else if (sh->do_gfid_self_heal) { - GF_ASSERT (!uuid_is_null 
(sh->sh_gfid_req)); - afr_self_heal_gfids (sh_frame, this); + if (sh->do_missing_entry_self_heal || sh->do_gfid_self_heal) { + afr_self_heal_missing_entries (sh_frame, this); } else { + loc = &sh_local->loc; + if (uuid_is_null (loc->inode->gfid) && uuid_is_null (loc->gfid)) { + if (!uuid_is_null (inode->gfid)) + GF_ASSERT (!uuid_compare (inode->gfid, + sh->sh_gfid_req)); + uuid_copy (loc->gfid, sh->sh_gfid_req); + } gf_log (this->name, GF_LOG_TRACE, "proceeding to metadata check on %s", local->loc.path); @@ -2199,7 +2442,9 @@ afr_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode) out: if (op_errno) { - orig_sh->unwind (frame, this, -1, op_errno); + orig_sh->unwind (frame, this, -1, op_errno, 1); + if (sh_frame) + AFR_STACK_DESTROY (sh_frame); } return 0; } @@ -2259,10 +2504,18 @@ int afr_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name) { int ret = -1; + uuid_t pargfid = {0}; - if (!child) { + if (!child) + goto out; + + if (!uuid_is_null (parent->inode->gfid)) + uuid_copy (pargfid, parent->inode->gfid); + else if (!uuid_is_null (parent->gfid)) + uuid_copy (pargfid, parent->gfid); + + if (uuid_is_null (pargfid)) goto out; - } if (strcmp (parent->path, "/") == 0) ret = gf_asprintf ((char **)&child->path, "/%s", name); @@ -2275,16 +2528,13 @@ afr_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name) "asprintf failed while setting child path"); } - if (!child->path) { - goto out; - } - child->name = strrchr (child->path, '/'); if (child->name) child->name++; child->parent = inode_ref (parent->inode); child->inode = inode_new (parent->inode->table); + uuid_copy (child->pargfid, pargfid); if (!child->inode) { ret = -1; @@ -2293,8 +2543,270 @@ afr_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name) ret = 0; out: - if (ret == -1) + if ((ret == -1) && child) loc_wipe (child); return ret; } + +int +afr_sh_erase_pending (call_frame_t *frame, xlator_t *this, + afr_transaction_type type, 
afr_fxattrop_cbk_t cbk, + int (*finish)(call_frame_t *frame, xlator_t *this)) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int i = 0; + dict_t **erase_xattr = NULL; + int ret = -1; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + afr_sh_pending_to_delta (priv, sh->xattr, sh->delta_matrix, + sh->success, priv->child_count, type); + + erase_xattr = GF_CALLOC (sizeof (*erase_xattr), priv->child_count, + gf_afr_mt_dict_t); + if (!erase_xattr) + goto out; + + for (i = 0; i < priv->child_count; i++) { + if (sh->xattr[i]) { + call_count++; + erase_xattr[i] = dict_new (); + if (!erase_xattr[i]) + goto out; + } + } + + afr_sh_delta_to_xattr (this, sh->delta_matrix, erase_xattr, + priv->child_count, type); + + gf_log (this->name, GF_LOG_DEBUG, "Delta matrix for: %s", + lkowner_utoa (&frame->root->lk_owner)); + afr_sh_print_pending_matrix (sh->delta_matrix, this); + local->call_count = call_count; + if (call_count == 0) { + ret = 0; + finish (frame, this); + goto out; + } + + for (i = 0; i < priv->child_count; i++) { + if (!erase_xattr[i]) + continue; + + if (sh->healing_fd) {//true for ENTRY, reg file DATA transaction + STACK_WIND_COOKIE (frame, cbk, (void *) (long) i, + priv->children[i], + priv->children[i]->fops->fxattrop, + sh->healing_fd, + GF_XATTROP_ADD_ARRAY, erase_xattr[i], + NULL); + } else { + STACK_WIND_COOKIE (frame, cbk, (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->loc, + GF_XATTROP_ADD_ARRAY, erase_xattr[i], + NULL); + } + } + + ret = 0; +out: + if (erase_xattr) { + for (i = 0; i < priv->child_count; i++) { + if (erase_xattr[i]) { + dict_unref (erase_xattr[i]); + } + } + } + + GF_FREE (erase_xattr); + + if (ret < 0) { + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + finish (frame, this); + } + + return 0; +} + +void +afr_set_self_heal_status(afr_self_heal_t *sh, afr_self_heal_status status) +{ + xlator_t *this = 
NULL; + afr_sh_status_for_all_type *sh_status = &(sh->afr_all_sh_status); + afr_self_heal_type sh_type_in_action = sh->sh_type_in_action; + this = THIS; + + if (!sh) { + gf_log_callingfn (this->name, GF_LOG_ERROR, "Null self heal" + "Structure"); + goto out; + } + + switch (sh_type_in_action) { + case AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY: + sh_status->gfid_or_missing_entry_self_heal = status; + break; + case AFR_SELF_HEAL_METADATA: + sh_status->metadata_self_heal = status; + break; + case AFR_SELF_HEAL_DATA: + sh_status->data_self_heal = status; + break; + case AFR_SELF_HEAL_ENTRY: + sh_status->entry_self_heal = status; + break; + case AFR_SELF_HEAL_INVALID: + gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid" + "self heal type in action"); + break; + } +out: + return; +} + +void +afr_set_local_for_unhealable (afr_local_t *local) +{ + afr_self_heal_t *sh = NULL; + + sh = &local->self_heal; + + local->unhealable = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); +} + +int +is_self_heal_failed (afr_self_heal_t *sh, afr_sh_fail_check_type type) +{ + afr_sh_status_for_all_type sh_status = sh->afr_all_sh_status; + afr_self_heal_type sh_type_in_action = AFR_SELF_HEAL_INVALID; + afr_self_heal_status status = AFR_SELF_HEAL_FAILED; + xlator_t *this = NULL; + int sh_failed = 0; + + this = THIS; + + if (!sh) { + gf_log_callingfn (this->name, GF_LOG_ERROR, "Null self heal " + "structure"); + sh_failed = 1; + goto out; + } + + if (type == AFR_CHECK_ALL) { + if ((sh_status.gfid_or_missing_entry_self_heal == AFR_SELF_HEAL_FAILED) + || (sh_status.metadata_self_heal == AFR_SELF_HEAL_FAILED) + || (sh_status.data_self_heal == AFR_SELF_HEAL_FAILED) + || (sh_status.entry_self_heal == AFR_SELF_HEAL_FAILED)) + sh_failed = 1; + } else if (type == AFR_CHECK_SPECIFIC) { + sh_type_in_action = sh->sh_type_in_action; + switch (sh_type_in_action) { + case AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY: + status = sh_status.gfid_or_missing_entry_self_heal; + break; + case AFR_SELF_HEAL_METADATA: 
+ status = sh_status.metadata_self_heal; + break; + case AFR_SELF_HEAL_ENTRY: + status = sh_status.entry_self_heal; + break; + case AFR_SELF_HEAL_DATA: + status = sh_status.data_self_heal; + break; + case AFR_SELF_HEAL_INVALID: + status = AFR_SELF_HEAL_NOT_ATTEMPTED; + break; + } + if (status == AFR_SELF_HEAL_FAILED) + sh_failed = 1; + + } + +out: + return sh_failed; +} + +char * +get_sh_completion_status (afr_self_heal_status status) +{ + + char *not_attempted = " is not attempted"; + char *failed = " failed"; + char *started = " is started"; + char *sync_begin = " is successfully completed"; + char *result = " has unknown status"; + + switch (status) + { + case AFR_SELF_HEAL_NOT_ATTEMPTED: + result = not_attempted; + break; + case AFR_SELF_HEAL_FAILED: + result = failed; + break; + case AFR_SELF_HEAL_STARTED: + result = started; + break; + case AFR_SELF_HEAL_SYNC_BEGIN: + result = sync_begin; + break; + } + + return result; + +} + +void +afr_log_self_heal_completion_status (afr_local_t *local, gf_loglevel_t loglvl) +{ + + char sh_log[4096] = {0}; + afr_self_heal_t *sh = &local->self_heal; + afr_sh_status_for_all_type all_status = sh->afr_all_sh_status; + xlator_t *this = NULL; + size_t off = 0; + int data_sh = 0; + int metadata_sh = 0; + int print_log = 0; + + this = THIS; + + ADD_FMT_STRING (sh_log, off, "gfid or missing entry", + all_status.gfid_or_missing_entry_self_heal, print_log); + ADD_FMT_STRING_SYNC (sh_log, off, "metadata", + all_status.metadata_self_heal, print_log); + if (sh->background) { + ADD_FMT_STRING_SYNC (sh_log, off, "backgroung data", + all_status.data_self_heal, print_log); + } else { + ADD_FMT_STRING_SYNC (sh_log, off, "foreground data", + all_status.data_self_heal, print_log); + } + ADD_FMT_STRING_SYNC (sh_log, off, "entry", all_status.entry_self_heal, + print_log); + + if (AFR_SELF_HEAL_SYNC_BEGIN == all_status.data_self_heal && + strcmp (sh->data_sh_info, "") && sh->data_sh_info ) + data_sh = 1; + if (AFR_SELF_HEAL_SYNC_BEGIN == 
all_status.metadata_self_heal && + strcmp (sh->metadata_sh_info, "") && sh->metadata_sh_info) + metadata_sh = 1; + + if (!print_log) + return; + + gf_log (this->name, loglvl, "%s %s %s on %s", sh_log, + ((data_sh == 1) ? sh->data_sh_info : ""), + ((metadata_sh == 1) ? sh->metadata_sh_info : ""), + local->loc.path); +} diff --git a/xlators/cluster/afr/src/afr-self-heal-common.h b/xlators/cluster/afr/src/afr-self-heal-common.h index bc0dcd78c..473264776 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.h +++ b/xlators/cluster/afr/src/afr-self-heal-common.h @@ -1,33 +1,18 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ #ifndef __AFR_SELF_HEAL_COMMON_H__ #define __AFR_SELF_HEAL_COMMON_H__ #define FILE_HAS_HOLES(buf) (((buf)->ia_size) > ((buf)->ia_blocks * 512)) - -typedef enum { - AFR_SELF_HEAL_ENTRY, - AFR_SELF_HEAL_METADATA, - AFR_SELF_HEAL_DATA, - AFR_SELF_HEAL_INVALID = -1, -} afr_self_heal_type; +#define AFR_SH_MIN_PARTICIPANTS 2 typedef enum { AFR_LOOKUP_FAIL_CONFLICTS = 1, @@ -38,16 +23,18 @@ int afr_sh_select_source (int sources[], int child_count); int -afr_sh_sink_count (int sources[], int child_count); - -int afr_sh_source_count (int sources[], int child_count); void afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this); +void +afr_sh_print_split_brain_log (int32_t *pending_matrix[], xlator_t *this, + const char *loc); + int afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix, + unsigned char *ignorant_subvols, dict_t *xattr[], afr_transaction_type type, size_t child_count); @@ -57,18 +44,15 @@ afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr, int child_count, afr_transaction_type type); int -afr_mark_sources (int32_t *sources, int32_t **pending_matrix, struct iatt *bufs, - int32_t child_count, afr_self_heal_type type, - int32_t *valid_children, const char *xlator_name); +afr_mark_sources (xlator_t *this, int32_t *sources, int32_t **pending_matrix, + struct iatt *bufs, afr_self_heal_type type, + int32_t *success_children, int32_t *subvol_status); int -afr_sh_delta_to_xattr (afr_private_t *priv, +afr_sh_delta_to_xattr (xlator_t *this, int32_t *delta_matrix[], dict_t *xattr[], int child_count, afr_transaction_type type); -int -afr_sh_is_matrix_zero (int32_t *pending_matrix[], int child_count); - void afr_self_heal_type_str_get (afr_self_heal_t *self_heal_p, char *str, size_t size); @@ -77,9 +61,10 @@ afr_self_heal_type afr_self_heal_type_for_transaction (afr_transaction_type type); int -afr_build_sources (xlator_t *xlator, dict_t **xattr, struct iatt *bufs, +afr_build_sources (xlator_t *this, dict_t **xattr, struct iatt 
*bufs, int32_t **pending_matrix, int32_t *sources, - int32_t *success_children, afr_transaction_type type); + int32_t *success_children, afr_transaction_type type, + int32_t *subvol_status, gf_boolean_t ignore_ignorant); void afr_sh_common_reset (afr_self_heal_t *sh, unsigned int child_count); @@ -94,25 +79,25 @@ afr_sh_common_lookup_resp_handler (call_frame_t *frame, void *cookie, int afr_sh_common_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, afr_lookup_done_cbk_t lookup_cbk, uuid_t uuid, - int32_t flags); + int32_t flags, dict_t *xdata); int afr_sh_entry_expunge_remove (call_frame_t *expunge_frame, xlator_t *this, - int active_src, struct iatt *buf); + int active_src, struct iatt *buf, + struct iatt *parentbuf); int afr_sh_entrylk (call_frame_t *frame, xlator_t *this, loc_t *loc, char *base_name, afr_lock_cbk_t lock_cbk); int afr_sh_entry_impunge_create (call_frame_t *impunge_frame, xlator_t *this, - int child_index, struct iatt *buf, - struct iatt *postparent); + int child_index); int -afr_sh_data_unlock (call_frame_t *frame, xlator_t *this, +afr_sh_data_unlock (call_frame_t *frame, xlator_t *this, char *dom, afr_lock_cbk_t lock_cbk); afr_local_t * -afr_local_copy (afr_local_t *l, xlator_t *this); +afr_self_heal_local_init (afr_local_t *l, xlator_t *this); int afr_sh_data_lock (call_frame_t *frame, xlator_t *this, - off_t start, off_t len, + off_t start, off_t len, gf_boolean_t block, char *dom, afr_lock_cbk_t success_handler, afr_lock_cbk_t failure_handler); void @@ -122,11 +107,38 @@ afr_sh_mark_source_sinks (call_frame_t *frame, xlator_t *this); typedef int (*afr_fxattrop_cbk_t) (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *xattr); + dict_t *xattr, dict_t *xdata); int afr_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name); int afr_impunge_frame_create (call_frame_t *frame, xlator_t *this, - int active_source, int ret_child, mode_t entry_mode, - call_frame_t **impunge_frame); 
+ int active_source, call_frame_t **impunge_frame); +void +afr_sh_reset (call_frame_t *frame, xlator_t *this); + +void +afr_children_intersection_get (int32_t *set1, int32_t *set2, + int *intersection, unsigned int child_count); +int +afr_get_no_xattr_dir_read_child (xlator_t *this, int32_t *success_children, + struct iatt *bufs); +int +afr_sh_erase_pending (call_frame_t *frame, xlator_t *this, + afr_transaction_type type, afr_fxattrop_cbk_t cbk, + int (*finish)(call_frame_t *frame, xlator_t *this)); + +void +afr_set_local_for_unhealable (afr_local_t *local); + +int +is_self_heal_failed (afr_self_heal_t *sh, afr_sh_fail_check_type type); + +void +afr_set_self_heal_status (afr_self_heal_t *sh, afr_self_heal_status status); + +void +afr_log_self_heal_completion_status (afr_local_t *local, gf_loglevel_t logl); + +char* +afr_get_pending_matrix_str (int32_t *pending_matrix[], xlator_t *this); #endif /* __AFR_SELF_HEAL_COMMON_H__ */ diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index 97d578c9b..9de26ee56 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. 
+ This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #include <libgen.h> @@ -49,6 +40,14 @@ #include "afr-self-heal-common.h" #include "afr-self-heal-algorithm.h" +int +afr_sh_data_fail (call_frame_t *frame, xlator_t *this); + +static inline gf_boolean_t +afr_sh_data_proceed (unsigned int success_count) +{ + return (success_count >= AFR_SH_MIN_PARTICIPANTS); +} extern int sh_loop_finish (call_frame_t *loop_frame, xlator_t *this); @@ -63,15 +62,6 @@ int afr_sh_data_finish (call_frame_t *frame, xlator_t *this); int -afr_sh_data_fxattrop (call_frame_t *frame, xlator_t *this, - afr_fxattrop_cbk_t fxattrop_cbk); - -int -afr_post_sh_data_fxattrop_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *xattr); - -int afr_sh_data_done (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; @@ -88,7 +78,7 @@ afr_sh_data_done (call_frame_t *frame, xlator_t *this) int afr_sh_data_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t *local = NULL; afr_private_t *priv = NULL; @@ -101,7 +91,7 @@ afr_sh_data_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, LOCK (&frame->lock); { if (op_ret == -1) { - gf_log (this->name, GF_LOG_INFO, + gf_log (this->name, GF_LOG_ERROR, "flush failed on %s on subvolume %s: %s", local->loc.path, priv->children[child_index]->name, strerror (op_errno)); @@ -131,6 +121,11 @@ afr_sh_data_close (call_frame_t *frame, xlator_t *this) sh = &local->self_heal; priv = this->private; + if (!sh->healing_fd) { + //This happens when file is non-reg + afr_sh_data_done (frame, this); + return 0; + } call_count = afr_set_elem_count_get (sh->success, priv->child_count); 
local->call_count = call_count; @@ -151,7 +146,7 @@ afr_sh_data_close (call_frame_t *frame, xlator_t *this) (void *) (long) i, priv->children[i], priv->children[i]->fops->flush, - sh->healing_fd); + sh->healing_fd, NULL); if (!--call_count) break; @@ -161,9 +156,28 @@ afr_sh_data_close (call_frame_t *frame, xlator_t *this) } int +afr_sh_dom_unlock (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + if (sh->sh_dom_lock_held) + afr_sh_data_unlock (frame, this, priv->sh_domain, + afr_sh_data_close); + else + afr_sh_data_close (frame, this); + return 0; +} + +int afr_sh_data_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *statpre, - struct iatt *statpost) + struct iatt *statpost, dict_t *xdata) { afr_local_t *local = NULL; @@ -195,29 +209,20 @@ afr_sh_data_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } int -afr_sh_data_setattr (call_frame_t *frame, xlator_t *this) +afr_sh_data_setattr (call_frame_t *frame, xlator_t *this, struct iatt* stbuf) { afr_local_t *local = NULL; afr_private_t *priv = NULL; afr_self_heal_t *sh = NULL; int i = 0; int call_count = 0; - int source = 0; int32_t valid = 0; - struct iatt stbuf = {0,}; local = frame->local; sh = &local->self_heal; priv = this->private; - source = sh->source; - - valid |= (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME); - - stbuf.ia_atime = sh->buf[source].ia_atime; - stbuf.ia_atime_nsec = sh->buf[source].ia_atime_nsec; - stbuf.ia_mtime = sh->buf[source].ia_mtime; - stbuf.ia_mtime_nsec = sh->buf[source].ia_mtime_nsec; + valid = (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME); call_count = afr_set_elem_count_get (sh->success, priv->child_count); @@ -237,7 +242,7 @@ afr_sh_data_setattr (call_frame_t *frame, xlator_t *this) (void *) (long) i, priv->children[i], priv->children[i]->fops->setattr, - 
&local->loc, &stbuf, valid); + &local->loc, stbuf, valid, NULL); if (!--call_count) break; @@ -249,7 +254,7 @@ afr_sh_data_setattr (call_frame_t *frame, xlator_t *this) int afr_sh_data_setattr_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iatt *buf) + struct iatt *buf, dict_t *xdata) { afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; @@ -259,9 +264,14 @@ afr_sh_data_setattr_fstat_cbk (call_frame_t *frame, void *cookie, sh = &local->self_heal; GF_ASSERT (sh->source == child_index); - if (op_ret != -1) + if (op_ret != -1) { sh->buf[child_index] = *buf; - afr_sh_data_setattr (frame, this); + afr_sh_data_setattr (frame, this, buf); + } else { + gf_log (this->name, GF_LOG_ERROR, "%s: Failed to set " + "time-stamps after self-heal", local->loc.path); + afr_sh_data_fail (frame, this); + } return 0; } @@ -286,37 +296,51 @@ afr_sh_set_timestamps (call_frame_t *frame, xlator_t *this) (void *) (long) sh->source, priv->children[sh->source], priv->children[sh->source]->fops->fstat, - sh->healing_fd); + sh->healing_fd, NULL); return 0; } //Fun fact, lock_cbk is being used for both lock & unlock int -afr_sh_data_unlock (call_frame_t *frame, xlator_t *this, +afr_sh_data_unlock (call_frame_t *frame, xlator_t *this, char *dom, afr_lock_cbk_t lock_cbk) { afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int ret = 0; local = frame->local; int_lock = &local->internal_lock; sh = &local->self_heal; + priv = this->private; - GF_ASSERT (sh->data_lock_held); - - sh->data_lock_held = _gf_false; + if (strcmp (dom, this->name) == 0) { + sh->data_lock_held = _gf_false; + } else if (strcmp (dom, priv->sh_domain) == 0) { + sh->sh_dom_lock_held = _gf_false; + } else { + ret = -1; + goto out; + } int_lock->lock_cbk = lock_cbk; + int_lock->domain = dom; afr_unlock (frame, this); +out: + if (ret) { + int_lock->lock_op_ret = -1; + int_lock->lock_cbk (frame, this); 
+ } return 0; } int afr_sh_data_finish (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; local = frame->local; sh = &local->self_heal; @@ -325,9 +349,9 @@ afr_sh_data_finish (call_frame_t *frame, xlator_t *this) "finishing data selfheal of %s", local->loc.path); if (sh->data_lock_held) - afr_sh_data_unlock (frame, this, afr_sh_data_close); + afr_sh_data_unlock (frame, this, this->name, afr_sh_dom_unlock); else - afr_sh_data_close (frame, this); + afr_sh_dom_unlock (frame, this); return 0; } @@ -344,40 +368,49 @@ afr_sh_data_fail (call_frame_t *frame, xlator_t *this) gf_log (this->name, GF_LOG_DEBUG, "finishing failed data selfheal of %s", local->loc.path); - sh->op_failed = 1; - if (sh->data_lock_held) - afr_sh_data_unlock (frame, this, afr_sh_data_close); - else - afr_sh_data_close (frame, this); + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + afr_sh_data_finish (frame, this); return 0; } int afr_sh_data_erase_pending_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno, dict_t *xattr) + int32_t op_errno, dict_t *xattr, dict_t *xdata) { int call_count = 0; afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int32_t child_index = (long) cookie; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + if (op_ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "Erasing of pending change " + "log failed on %s for subvol %s, reason: %s", + local->loc.path, priv->children[child_index]->name, + strerror (op_errno)); + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + } call_count = afr_frame_return (frame); if (call_count == 0) { - local = frame->local; - sh = &local->self_heal; - if (!IA_ISREG (sh->type)) { - afr_sh_data_finish (frame, this); + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { + if (sh->old_loop_frame) + sh_loop_finish (sh->old_loop_frame, 
this); + sh->old_loop_frame = NULL; + afr_sh_data_fail (frame, this); goto out; } - if (NULL == sh->old_loop_frame) { - GF_ASSERT (sh->data_lock_held); - afr_sh_data_fxattrop (frame, this, - afr_post_sh_data_fxattrop_cbk); + if (!IA_ISREG (sh->type)) { + afr_sh_data_finish (frame, this); goto out; } - - afr_sh_data_lock (frame, this, 0, 0, + GF_ASSERT (sh->old_loop_frame); + afr_sh_data_lock (frame, this, 0, 0, _gf_true, this->name, afr_post_sh_big_lock_success, afr_post_sh_big_lock_failure); } @@ -388,74 +421,95 @@ out: int afr_sh_data_erase_pending (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int i = 0; - dict_t **erase_xattr = NULL; + afr_sh_erase_pending (frame, this, AFR_DATA_TRANSACTION, + afr_sh_data_erase_pending_cbk, + afr_sh_data_finish); + return 0; +} + +int +afr_sh_data_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_self_heal_t *sh = NULL; + int call_count = 0; + int child_index = (long) cookie; local = frame->local; - sh = &local->self_heal; priv = this->private; + sh = &local->self_heal; + + if (op_ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "%s: Failed to fsync on " + "%s - %s", local->loc.path, + priv->children[child_index]->name, strerror (op_errno)); + LOCK (&frame->lock); + { + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + } + UNLOCK (&frame->lock); + if (sh->old_loop_frame) + sh_loop_finish (sh->old_loop_frame, this); + sh->old_loop_frame = NULL; + } - afr_sh_pending_to_delta (priv, sh->xattr, sh->delta_matrix, sh->success, - priv->child_count, AFR_DATA_TRANSACTION); - gf_log (this->name, GF_LOG_DEBUG, "Delta matrix for: %"PRIu64, - frame->root->lk_owner); - afr_sh_print_pending_matrix (sh->delta_matrix, this); + call_count = afr_frame_return (frame); + if 
(call_count == 0) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) + afr_sh_data_fail (frame, this); + else + afr_sh_data_erase_pending (frame, this); + } + return 0; +} - erase_xattr = GF_CALLOC (sizeof (*erase_xattr), priv->child_count, - gf_afr_mt_dict_t); +/* + * Before erasing xattrs, make sure the data is written to disk + */ +int +afr_sh_data_fsync (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_self_heal_t *sh = NULL; + int i = 0; + int call_count = 0; - for (i = 0; i < priv->child_count; i++) { - if (sh->xattr[i]) { - call_count++; + local = frame->local; + priv = this->private; + sh = &local->self_heal; - erase_xattr[i] = get_new_dict(); - dict_ref (erase_xattr[i]); - } + call_count = sh->active_sinks; + if (call_count == 0) { + afr_sh_data_erase_pending (frame, this); + return 0; } - afr_sh_delta_to_xattr (priv, sh->delta_matrix, erase_xattr, - priv->child_count, AFR_DATA_TRANSACTION); - - GF_ASSERT (call_count); local->call_count = call_count; for (i = 0; i < priv->child_count; i++) { - if (!erase_xattr[i]) + if (!sh->success[i] || sh->sources[i]) continue; - gf_log (this->name, GF_LOG_DEBUG, - "erasing pending flags from %s on %s", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_sh_data_erase_pending_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fxattrop, - sh->healing_fd, - GF_XATTROP_ADD_ARRAY, erase_xattr[i]); - if (!--call_count) - break; + STACK_WIND_COOKIE (frame, afr_sh_data_fsync_cbk, + (void *) (long) i, priv->children[i], + priv->children[i]->fops->fsync, + sh->healing_fd, 1, NULL); } - for (i = 0; i < priv->child_count; i++) { - if (erase_xattr[i]) { - dict_unref (erase_xattr[i]); - } - } - GF_FREE (erase_xattr); - return 0; } - static struct afr_sh_algorithm * sh_algo_from_name (xlator_t *this, char *name) { int i = 0; + if (name == NULL) + goto out; + while (afr_self_heal_algorithms[i].name) { if (!strcmp (name, 
afr_self_heal_algorithms[i].name)) { return &afr_self_heal_algorithms[i]; @@ -464,17 +518,22 @@ sh_algo_from_name (xlator_t *this, char *name) i++; } +out: return NULL; } static int -sh_zero_byte_files_exist (afr_self_heal_t *sh, int child_count) +sh_zero_byte_files_exist (afr_local_t *local, int child_count) { - int i; - int ret = 0; + int i = 0; + int ret = 0; + afr_self_heal_t *sh = NULL; + sh = &local->self_heal; for (i = 0; i < child_count; i++) { + if (!local->child_up[i] || sh->child_errno[i]) + continue; if (sh->buf[i].ia_size == 0) { ret = 1; break; @@ -501,8 +560,7 @@ afr_sh_data_pick_algo (call_frame_t *frame, xlator_t *this) if (algo == NULL) { /* option not set, so fall back on heuristics */ - if ((local->enoent_count != 0) - || sh_zero_byte_files_exist (sh, priv->child_count) + if (sh_zero_byte_files_exist (local, priv->child_count) || (sh->file_size <= (priv->data_self_heal_window_size * this->ctx->page_size))) { @@ -540,11 +598,12 @@ afr_sh_data_sync_prepare (call_frame_t *frame, xlator_t *this) local = frame->local; sh = &local->self_heal; - sh->algo_completion_cbk = afr_sh_data_erase_pending; + sh->algo_completion_cbk = afr_sh_data_fsync; sh->algo_abort_cbk = afr_sh_data_fail; sh_algo = afr_sh_data_pick_algo (frame, this); + sh->algo = sh_algo; sh_algo->fn (frame, this); return 0; @@ -553,38 +612,46 @@ afr_sh_data_sync_prepare (call_frame_t *frame, xlator_t *this) int afr_sh_data_trim_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) + struct iatt *postbuf, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; int call_count = 0; int child_index = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; - priv = this->private; + priv = this->private; local = frame->local; + sh = &local->self_heal; child_index = (long) cookie; LOCK (&frame->lock); { - if (op_ret == -1) - gf_log (this->name, GF_LOG_INFO, + 
if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, "ftruncate of %s on subvolume %s failed (%s)", local->loc.path, priv->children[child_index]->name, strerror (op_errno)); - else + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + } else { gf_log (this->name, GF_LOG_DEBUG, "ftruncate of %s on subvolume %s completed", local->loc.path, priv->children[child_index]->name); + } } UNLOCK (&frame->lock); call_count = afr_frame_return (frame); - if (call_count == 0) - afr_sh_data_sync_prepare (frame, this); + if (call_count == 0) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) + afr_sh_data_fail (frame, this); + else + afr_sh_data_sync_prepare (frame, this); + } return 0; } @@ -618,7 +685,8 @@ afr_sh_data_trim_sinks (call_frame_t *frame, xlator_t *this) (void *) (long) i, priv->children[i], priv->children[i]->fops->ftruncate, - sh->healing_fd, sh->file_size); + sh->healing_fd, sh->file_size, + NULL); if (!--call_count) break; @@ -632,6 +700,7 @@ afr_sh_inode_set_read_ctx (afr_self_heal_t *sh, xlator_t *this) { afr_private_t *priv = NULL; int ret = 0; + int i = 0; priv = this->private; sh->source = afr_sh_select_source (sh->sources, priv->child_count); @@ -640,6 +709,15 @@ afr_sh_inode_set_read_ctx (afr_self_heal_t *sh, xlator_t *this) goto out; } + /* detect changes not visible through pending flags -- JIC */ + for (i = 0; i < priv->child_count; i++) { + if (i == sh->source || sh->child_errno[i]) + continue; + + if (SIZE_DIFFERS (&sh->buf[i], &sh->buf[sh->source])) + sh->sources[i] = 0; + } + afr_reset_children (sh->fresh_children, priv->child_count); afr_get_fresh_children (sh->success_children, sh->sources, sh->fresh_children, priv->child_count); @@ -649,72 +727,211 @@ out: return ret; } -int -afr_sh_data_fix (call_frame_t *frame, xlator_t *this) +char* +afr_get_sizes_str (afr_local_t *local, struct iatt *bufs, xlator_t *this) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int nsources = 0; - int source = 0; - 
int i = 0; - int ret = 0; + afr_private_t *priv = NULL; + int i = 0; + char num[1024] = {0}; + size_t len = 0; + char *sizes_str = NULL; + size_t off = 0; + char *fmt_str = "%llu bytes on %s, "; + char *child_down = " %s,"; + char *child_unknown = " %s,"; + int down_child_present = 0; + int down_count = 0; + int unknown_count = 0; + int unknown_child_present = 0; + char *down_subvol_1 = " down subvolume is "; + char *unknown_subvol_1 = " unknown subvolume is "; + char *down_subvol_2 = " down subvolumes are "; + char *unknown_subvol_2 = " unknown subvolumes are "; - local = frame->local; - sh = &local->self_heal; priv = this->private; - gf_log (this->name, GF_LOG_DEBUG, "Pending matrix for: %"PRIu64, - frame->root->lk_owner); - nsources = afr_build_sources (this, sh->xattr, sh->buf, sh->pending_matrix, - sh->sources, sh->success_children, - AFR_DATA_TRANSACTION); - if (nsources == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "No self-heal needed for %s", - local->loc.path); + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i] == 1) { + len += snprintf (num, sizeof (num), fmt_str, + (unsigned long long) bufs[i].ia_size, + priv->children[i]->name); + } else if (local->child_up[i] == 0) { + len += snprintf (num, sizeof (num), child_down, + priv->children[i]->name); + if (!down_child_present) + down_child_present = 1; + down_count ++; + } else if (local->child_up[i] == -1) { + len += snprintf (num, sizeof (num), child_unknown, + priv->children[i]->name); + if (!unknown_child_present) + unknown_child_present = 1; + unknown_count++; + } - afr_sh_data_finish (frame, this); - return 0; } - if ((nsources == -1) - && (priv->favorite_child != -1) - && (sh->child_errno[priv->favorite_child] == 0)) { + if (down_child_present) { + if (down_count > 1) + len += snprintf (num, sizeof (num), "%s", + down_subvol_2); + else + len += snprintf (num, sizeof (num), "%s", + down_subvol_1); + } + if (unknown_child_present) { + if (unknown_count > 1) + len += snprintf (num, 
sizeof (num), "%s", + unknown_subvol_2); + else + len += snprintf (num, sizeof (num), "%s", + unknown_subvol_1); + } - gf_log (this->name, GF_LOG_DEBUG, - "Picking favorite child %s as authentic source to " - "resolve conflicting data of %s", - priv->children[priv->favorite_child]->name, - local->loc.path); + len++;//for '\0' - sh->sources[priv->favorite_child] = 1; + sizes_str = GF_CALLOC (len, sizeof (char), gf_common_mt_char); - nsources = afr_sh_source_count (sh->sources, - priv->child_count); + if (!sizes_str) + return NULL; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i] == 1) { + off += snprintf (sizes_str + off, len - off, fmt_str, + (unsigned long long) bufs[i].ia_size, + priv->children[i]->name); + } } - if (nsources == -1) { - gf_log (this->name, GF_LOG_ERROR, - "Unable to self-heal contents of '%s' (possible " - "split-brain). Please delete the file from all but " - "the preferred subvolume.", local->loc.path); + if (down_child_present) { + if (down_count > 1) { + off += snprintf (sizes_str + off, len - off, "%s", + down_subvol_2); + } else { + off += snprintf (sizes_str + off, len - off, "%s", + down_subvol_1); + } + } - local->govinda_gOvinda = 1; + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i] == 0) { + off += snprintf (sizes_str + off, len - off, child_down, + priv->children[i]->name); + } + } - afr_sh_data_fail (frame, this); - return 0; + if (unknown_child_present) { + if (unknown_count > 1) { + off += snprintf (sizes_str + off, len - off, "%s", + unknown_subvol_2); + } else { + off += snprintf (sizes_str + off, len - off, "%s", + unknown_subvol_1); + } } - ret = afr_sh_inode_set_read_ctx (sh, this); - if (ret) { - gf_log (this->name, GF_LOG_DEBUG, - "No active sources found."); + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i] == -1) { + off += snprintf (sizes_str + off, len - off, + child_unknown, + priv->children[i]->name); - afr_sh_data_fail (frame, this); - return 0; + } } + 
return sizes_str; +} + +char* +afr_get_sinks_str (xlator_t *this, afr_local_t *local, afr_self_heal_t *sh) +{ + afr_private_t *priv = NULL; + int i = 0; + char num[1024] = {0}; + size_t len = 0; + char *sinks_str = NULL; + char *temp_str = " to sinks "; + char *str_format = " %s,"; + char off = 0; + + priv = this->private; + + len += snprintf (num, sizeof (num), "%s", temp_str); + for (i = 0; i < priv->child_count; i++) { + if ((sh->sources[i] == 0) && (local->child_up[i] == 1)) { + len += snprintf (num, sizeof (num), str_format, + priv->children[i]->name); + } + } + + len ++; + + sinks_str = GF_CALLOC (len, sizeof (char), gf_common_mt_char); + + if (!sinks_str) + return NULL; + + off += snprintf (sinks_str + off, len - off, "%s", temp_str); + + for (i = 0; i < priv->child_count; i++) { + if ((sh->sources[i] == 0) && (local->child_up[i] == 1)) { + off += snprintf (sinks_str + off, len - off, + str_format, + priv->children[i]->name); + } + } + + return sinks_str; + +} + + +void +afr_set_data_sh_info_str (afr_local_t *local, afr_self_heal_t *sh, xlator_t *this) +{ + char *pending_matrix_str = NULL; + char *sizes_str = NULL; + char *sinks_str = NULL; + afr_private_t *priv = NULL; + + priv = this->private; + + pending_matrix_str = afr_get_pending_matrix_str (sh->pending_matrix, + this); + if (!pending_matrix_str) + pending_matrix_str = ""; + + sizes_str = afr_get_sizes_str (local, sh->buf, this); + if (!sizes_str) + sizes_str = ""; + + sinks_str = afr_get_sinks_str (this, local, sh); + if (!sinks_str) + sinks_str = ""; + + gf_asprintf (&sh->data_sh_info, " data self heal from %s %s with " + "%s data %s", priv->children[sh->source]->name, sinks_str, + sizes_str, pending_matrix_str); + + if (pending_matrix_str && strcmp (pending_matrix_str, "")) + GF_FREE (pending_matrix_str); + + if (sizes_str && strcmp (sizes_str, "")) + GF_FREE (sizes_str); +} + +void +afr_sh_data_fix (call_frame_t *frame, xlator_t *this) +{ + int source = 0; + afr_local_t *local = NULL; + 
afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + source = sh->source; sh->block_size = this->ctx->page_size; sh->file_size = sh->buf[source].ia_size; @@ -722,17 +939,9 @@ afr_sh_data_fix (call_frame_t *frame, xlator_t *this) if (FILE_HAS_HOLES (&sh->buf[source])) sh->file_has_holes = 1; - /* detect changes not visible through pending flags -- JIC */ - for (i = 0; i < priv->child_count; i++) { - if (i == source || sh->child_errno[i]) - continue; - - if (SIZE_DIFFERS (&sh->buf[i], &sh->buf[source])) - sh->sources[i] = 0; - } - - if (sh->background && sh->unwind) { - sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno); + if (sh->background && sh->unwind && !sh->unwound) { + sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno, + is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)); sh->unwound = _gf_true; } @@ -742,70 +951,123 @@ afr_sh_data_fix (call_frame_t *frame, xlator_t *this) "no active sinks for performing self-heal on file %s", local->loc.path); afr_sh_data_finish (frame, this); - return 0; + return; } gf_log (this->name, GF_LOG_DEBUG, "self-healing file %s from subvolume %s to %d other", local->loc.path, priv->children[sh->source]->name, sh->active_sinks); - afr_sh_data_trim_sinks (frame, this); - return 0; + sh->actual_sh_started = _gf_true; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_SYNC_BEGIN); + afr_sh_data_trim_sinks (frame, this); } -static void -afr_destroy_pending_matrix (int32_t **pending_matrix, int32_t child_count) +int +afr_sh_data_fxattrop_fstat_done (call_frame_t *frame, xlator_t *this) { + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int nsources = 0; + int ret = 0; + int *old_sources = NULL; + int tstamp_source = 0; int i = 0; - GF_ASSERT (child_count > 0); - if (pending_matrix) { - for (i = 0; i < child_count; i++) { - if (pending_matrix[i]) - GF_FREE (pending_matrix[i]); - } - GF_FREE (pending_matrix); 
+ + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + gf_log (this->name, GF_LOG_DEBUG, "Pending matrix for: %s", + lkowner_utoa (&frame->root->lk_owner)); + if (sh->sync_done) { + //store sources before sync so that mtime can be set using the + //iatt buf from one of them. + old_sources = alloca (priv->child_count*sizeof (*old_sources)); + memcpy (old_sources, sh->sources, + priv->child_count * sizeof (*old_sources)); } -} -static int32_t** -afr_create_pending_matrix (int32_t child_count) -{ - gf_boolean_t cleanup = _gf_false; - int32_t **pending_matrix = NULL; - int i = 0; + nsources = afr_build_sources (this, sh->xattr, sh->buf, sh->pending_matrix, + sh->sources, sh->success_children, + AFR_DATA_TRANSACTION, NULL, _gf_true); + if ((nsources == -1) + && (priv->favorite_child != -1) + && (sh->child_errno[priv->favorite_child] == 0)) { - GF_ASSERT (child_count > 0); + gf_log (this->name, GF_LOG_DEBUG, + "Picking favorite child %s as authentic source to " + "resolve conflicting data of %s", + priv->children[priv->favorite_child]->name, + local->loc.path); - pending_matrix = GF_CALLOC (sizeof (*pending_matrix), child_count, - gf_afr_mt_int32_t); - if (NULL == pending_matrix) - goto out; - for (i = 0; i < child_count; i++) { - pending_matrix[i] = GF_CALLOC (sizeof (**pending_matrix), - child_count, - gf_afr_mt_int32_t); - if (NULL == pending_matrix[i]) { - cleanup = _gf_true; - goto out; - } + sh->sources[priv->favorite_child] = 1; + + nsources = afr_sh_source_count (sh->sources, + priv->child_count); } -out: - if (_gf_true == cleanup) { - afr_destroy_pending_matrix (pending_matrix, child_count); - pending_matrix = NULL; + + if (nsources == -1) { + afr_sh_print_split_brain_log (sh->pending_matrix, this, + local->loc.path); + afr_set_split_brain (this, sh->inode, DONT_KNOW, SPB); + + afr_sh_data_fail (frame, this); + return 0; + } + + afr_set_split_brain (this, sh->inode, DONT_KNOW, NO_SPB); + + ret = afr_sh_inode_set_read_ctx (sh, this); + if 
(ret) { + gf_log (this->name, GF_LOG_DEBUG, + "No active sources found."); + + afr_sh_data_fail (frame, this); + return 0; } - return pending_matrix; + + if (sh->sync_done) { + /* Perform setattr from one of the old_sources if possible + * Because only they have the correct mtime, the new sources + * (i.e. old sinks) have mtime from last writev in sync. + */ + tstamp_source = sh->source; + for (i = 0; i < priv->child_count; i++) { + if (old_sources[i] && sh->sources[i]) + tstamp_source = i; + } + afr_sh_data_setattr (frame, this, &sh->buf[tstamp_source]); + } else { + afr_set_data_sh_info_str (local, sh, this); + if (nsources == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "No self-heal needed for %s", + local->loc.path); + + afr_sh_data_finish (frame, this); + return 0; + } + + if (sh->do_data_self_heal && + afr_data_self_heal_enabled (priv->data_self_heal)) + afr_sh_data_fix (frame, this); + else + afr_sh_data_finish (frame, this); + } + return 0; } int afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local, dict_t **xattr, - afr_transaction_type txn_type) + afr_transaction_type txn_type, + uuid_t gfid) { afr_private_t *priv = NULL; int read_child = -1; - int ret = -1; int32_t **pending_matrix = NULL; int32_t *sources = NULL; int32_t *success_children = NULL; @@ -813,26 +1075,41 @@ afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local, int32_t nsources = 0; int32_t prev_read_child = -1; int32_t config_read_child = -1; + int32_t subvol_status = 0; priv = this->private; bufs = local->cont.lookup.bufs; success_children = local->cont.lookup.success_children; - pending_matrix = afr_create_pending_matrix (priv->child_count); - if (NULL == pending_matrix) - goto out; - - sources = GF_CALLOC (sizeof (*sources), priv->child_count, - gf_afr_mt_int32_t); - if (NULL == sources) - goto out; + pending_matrix = local->cont.lookup.pending_matrix; + sources = local->cont.lookup.sources; + memset (sources, 0, sizeof (*sources) * 
priv->child_count); nsources = afr_build_sources (this, xattr, bufs, pending_matrix, - sources, success_children, txn_type); - if (nsources < 0) { - ret = -1; - goto out; + sources, success_children, txn_type, + &subvol_status, _gf_false); + if (subvol_status & SPLIT_BRAIN) { + gf_log (this->name, GF_LOG_DEBUG, "%s: Possible split-brain", + local->loc.path); + switch (txn_type) { + case AFR_DATA_TRANSACTION: + local->cont.lookup.possible_spb = _gf_true; + nsources = 1; + sources[success_children[0]] = 1; + break; + case AFR_ENTRY_TRANSACTION: + read_child = afr_get_no_xattr_dir_read_child (this, + success_children, + bufs); + sources[read_child] = 1; + nsources = 1; + break; + default: + break; + } } + if (nsources < 0) + goto out; prev_read_child = local->read_child_index; config_read_child = priv->read_child; @@ -840,44 +1117,18 @@ afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local, priv->child_count, prev_read_child, config_read_child, - sources); - ret = 0; - local->cont.lookup.sources = sources; + sources, + priv->hash_mode, gfid); out: - afr_destroy_pending_matrix (pending_matrix, priv->child_count); - if (-1 == ret) { - if (sources) - GF_FREE (sources); - } - gf_log (this->name, GF_LOG_DEBUG, "returning read_child: %d", read_child); + gf_log (this->name, GF_LOG_DEBUG, "returning read_child: %d", + read_child); return read_child; } int -afr_sh_data_special_file_fix (call_frame_t *frame, xlator_t *this) -{ - afr_private_t *priv = NULL; - afr_self_heal_t *sh = NULL; - afr_local_t *local = NULL; - int i = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - for (i = 0; i < priv->child_count ; i++) - if (1 == local->child_up[i]) - sh->success[i] = 1; - - afr_sh_data_erase_pending (frame, this); - - return 0; -} - -int afr_sh_data_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iatt *buf) + struct iatt *buf, dict_t *xdata) { afr_private_t *priv = NULL; 
afr_local_t *local = NULL; @@ -900,6 +1151,12 @@ afr_sh_data_fstat_cbk (call_frame_t *frame, void *cookie, sh->buf[child_index] = *buf; sh->success_children[sh->success_count] = child_index; sh->success_count++; + } else { + gf_log (this->name, GF_LOG_ERROR, "%s: fstat failed " + "on %s, reason %s", local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + sh->child_errno[child_index] = op_errno; } } UNLOCK (&frame->lock); @@ -910,13 +1167,17 @@ afr_sh_data_fstat_cbk (call_frame_t *frame, void *cookie, /* Previous versions of glusterfs might have set * the pending data xattrs which need to be erased */ - if (IA_ISREG (buf->ia_type)) - afr_sh_data_fix (frame, this); - else - afr_sh_data_special_file_fix (frame, this); - + if (!afr_sh_data_proceed (sh->success_count)) { + gf_log (this->name, GF_LOG_ERROR, "inspecting metadata " + "succeeded on < %d children, aborting " + "self-heal for %s", AFR_SH_MIN_PARTICIPANTS, + local->loc.path); + afr_sh_data_fail (frame, this); + goto out; + } + afr_sh_data_fxattrop_fstat_done (frame, this); } - +out: return 0; } @@ -927,33 +1188,41 @@ afr_sh_data_fstat (call_frame_t *frame, xlator_t *this) afr_self_heal_t *sh = NULL; afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = 0; - int i = 0; + int call_count = 0; + int i = 0; + int child = 0; + int32_t *fstat_children = NULL; priv = this->private; local = frame->local; sh = &local->self_heal; - call_count = afr_up_children_count (local->child_up, - priv->child_count); - + fstat_children = memdup (sh->success_children, + sizeof (*fstat_children) * priv->child_count); + if (!fstat_children) { + afr_sh_data_fail (frame, this); + goto out; + } + call_count = sh->success_count; local->call_count = call_count; + memset (sh->buf, 0, sizeof (*sh->buf) * priv->child_count); afr_reset_children (sh->success_children, priv->child_count); sh->success_count = 0; for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE 
(frame, afr_sh_data_fstat_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fstat, - sh->healing_fd); - - if (!--call_count) - break; - } + child = fstat_children[i]; + if (child == -1) + break; + STACK_WIND_COOKIE (frame, afr_sh_data_fstat_cbk, + (void *) (long) child, + priv->children[child], + priv->children[child]->fops->fstat, + sh->healing_fd, NULL); + --call_count; } - + GF_ASSERT (!call_count); +out: + GF_FREE (fstat_children); return 0; } @@ -982,73 +1251,60 @@ afr_sh_common_fxattrop_resp_handler (call_frame_t *frame, void *cookie, sh->xattr[child_index] = dict_ref (xattr); sh->success_children[sh->success_count] = child_index; sh->success_count++; + } else { + gf_log (this->name, GF_LOG_ERROR, "fxattrop of %s " + "failed on %s, reason %s", local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + sh->child_errno[child_index] = op_errno; } } UNLOCK (&frame->lock); } int -afr_post_sh_data_fxattrop_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *xattr) +afr_sh_data_fxattrop_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *xattr, dict_t *xdata) { int call_count = -1; - int ret = 0; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - afr_sh_common_fxattrop_resp_handler (frame, cookie, this, op_ret, - op_errno, xattr); + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; local = frame->local; - sh = &local->self_heal; - call_count = afr_frame_return (frame); - if (call_count == 0) { - (void) afr_build_sources (this, sh->xattr, NULL, - sh->pending_matrix, - sh->sources, sh->success_children, - AFR_DATA_TRANSACTION); - ret = afr_sh_inode_set_read_ctx (sh, this); - if (ret) - afr_sh_data_fail (frame, this); - else - afr_sh_set_timestamps (frame, this); - } - - return 0; -} - -int -afr_sh_data_fxattrop_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - 
dict_t *xattr) -{ - int call_count = -1; + sh = &local->self_heal; afr_sh_common_fxattrop_resp_handler (frame, cookie, this, op_ret, op_errno, xattr); call_count = afr_frame_return (frame); if (call_count == 0) { + if (!afr_sh_data_proceed (sh->success_count)) { + gf_log (this->name, GF_LOG_ERROR, "%s, inspecting " + "change log succeeded on < %d children", + local->loc.path, AFR_SH_MIN_PARTICIPANTS); + afr_sh_data_fail (frame, this); + goto out; + } afr_sh_data_fstat (frame, this); } - +out: return 0; } int -afr_sh_data_fxattrop (call_frame_t *frame, xlator_t *this, - afr_fxattrop_cbk_t fxattrop_cbk) +afr_sh_data_fxattrop (call_frame_t *frame, xlator_t *this) { afr_self_heal_t *sh = NULL; afr_local_t *local = NULL; afr_private_t *priv = NULL; - dict_t *xattr_req = NULL; + dict_t **xattr_req; int32_t *zero_pending = NULL; int call_count = 0; int i = 0; int ret = 0; + int j; priv = this->private; local = frame->local; @@ -1059,42 +1315,53 @@ afr_sh_data_fxattrop (call_frame_t *frame, xlator_t *this, local->call_count = call_count; - xattr_req = dict_new(); - if (!xattr_req) { - ret = -1; - goto out; - } - - for (i = 0; i < priv->child_count; i++) { - zero_pending = GF_CALLOC (3, sizeof (*zero_pending), - gf_afr_mt_int32_t); - if (!zero_pending) { - ret = -1; - goto out; - } - ret = dict_set_dynptr (xattr_req, priv->pending_key[i], - zero_pending, - 3 * sizeof (*zero_pending)); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Unable to set dict value"); - goto out; - } else { - zero_pending = NULL; - } - } + xattr_req = GF_CALLOC(priv->child_count, sizeof(struct dict_t *), + gf_afr_mt_dict_t); + if (!xattr_req) + goto out; + + for (i = 0; i < priv->child_count; i++) { + xattr_req[i] = dict_new(); + if (!xattr_req[i]) { + ret = -1; + goto out; + } + } + + for (i = 0; i < priv->child_count; i++) { + for (j = 0; j < priv->child_count; j++) { + zero_pending = GF_CALLOC (3, sizeof (*zero_pending), + gf_afr_mt_int32_t); + if (!zero_pending) { + ret = -1; + goto 
out; + } + ret = dict_set_dynptr (xattr_req[i], priv->pending_key[j], + zero_pending, + 3 * sizeof (*zero_pending)); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "Unable to set dict value"); + goto out; + } else { + zero_pending = NULL; + } + } + } afr_reset_xattr (sh->xattr, priv->child_count); afr_reset_children (sh->success_children, priv->child_count); + memset (sh->child_errno, 0, + sizeof (*sh->child_errno) * priv->child_count); sh->success_count = 0; for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, fxattrop_cbk, + STACK_WIND_COOKIE (frame, afr_sh_data_fxattrop_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->fxattrop, sh->healing_fd, GF_XATTROP_ADD_ARRAY, - xattr_req); + xattr_req[i], NULL); if (!--call_count) break; @@ -1102,14 +1369,16 @@ afr_sh_data_fxattrop (call_frame_t *frame, xlator_t *this, } out: - if (xattr_req) - dict_unref (xattr_req); + if (xattr_req) { + for (i = 0; i < priv->child_count; i++) + if (xattr_req[i]) + dict_unref(xattr_req[i]); + GF_FREE(xattr_req); + } if (ret) { - if (zero_pending) - GF_FREE (zero_pending); - sh->op_failed = 1; - afr_sh_data_done (frame, this); + GF_FREE (zero_pending); + afr_sh_data_fail (frame, this); } return 0; @@ -1125,7 +1394,23 @@ afr_sh_data_big_lock_success (call_frame_t *frame, xlator_t *this) sh = &local->self_heal; sh->data_lock_held = _gf_true; - afr_sh_data_fxattrop (frame, this, afr_sh_data_fxattrop_cbk); + afr_sh_data_fxattrop (frame, this); + return 0; +} + +int +afr_sh_dom_lock_success (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + + local = frame->local; + sh = &local->self_heal; + + sh->sh_dom_lock_held = _gf_true; + afr_sh_data_lock (frame, this, 0, 0, _gf_true, this->name, + afr_sh_data_big_lock_success, + afr_sh_data_fail); return 0; } @@ -1142,14 +1427,16 @@ afr_sh_data_post_blocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) if (int_lock->lock_op_ret 
< 0) { gf_log (this->name, GF_LOG_ERROR, "Blocking data inodelks " - "failed for %s. by %"PRIu64, - local->loc.path, frame->root->lk_owner); + "failed for %s. by %s", + local->loc.path, lkowner_utoa (&frame->root->lk_owner)); + sh->data_lock_failure_handler (frame, this); } else { gf_log (this->name, GF_LOG_DEBUG, "Blocking data inodelks " - "done for %s by %"PRIu64". Proceding to self-heal", - local->loc.path, frame->root->lk_owner); + "done for %s by %s. Proceding to self-heal", + local->loc.path, lkowner_utoa (&frame->root->lk_owner)); + sh->data_lock_success_handler (frame, this); } @@ -1169,15 +1456,21 @@ afr_sh_data_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) if (int_lock->lock_op_ret < 0) { gf_log (this->name, GF_LOG_DEBUG, "Non Blocking data inodelks " - "failed for %s. by %"PRIu64, - local->loc.path, frame->root->lk_owner); - int_lock->lock_cbk = afr_sh_data_post_blocking_inodelk_cbk; - afr_blocking_lock (frame, this); + "failed for %s. by %s", + local->loc.path, lkowner_utoa (&frame->root->lk_owner)); + + if (!sh->data_lock_block) { + sh->data_lock_failure_handler(frame, this); + } else { + int_lock->lock_cbk = + afr_sh_data_post_blocking_inodelk_cbk; + afr_blocking_lock (frame, this); + } } else { gf_log (this->name, GF_LOG_DEBUG, "Non Blocking data inodelks " - "done for %s by %"PRIu64". Proceeding to self-heal", - local->loc.path, frame->root->lk_owner); + "done for %s by %s. 
Proceeding to self-heal", + local->loc.path, lkowner_utoa (&frame->root->lk_owner)); sh->data_lock_success_handler (frame, this); } @@ -1185,9 +1478,11 @@ afr_sh_data_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) } int -afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this, off_t start, off_t len) +afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this, char *dom, + off_t start, off_t len) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; local = frame->local; @@ -1198,11 +1493,14 @@ afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this, off_t start, off_t le afr_set_lock_number (frame, this); - int_lock->lk_flock.l_start = start; - int_lock->lk_flock.l_len = len; - int_lock->lk_flock.l_type = F_WRLCK; int_lock->lock_cbk = afr_sh_data_post_nonblocking_inodelk_cbk; + int_lock->domain = dom; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + inodelk->flock.l_start = start; + inodelk->flock.l_len = len; + inodelk->flock.l_type = F_WRLCK; + afr_nonblocking_inodelk (frame, this); return 0; @@ -1221,7 +1519,8 @@ afr_post_sh_big_lock_success (call_frame_t *frame, xlator_t *this) sh_loop_finish (sh->old_loop_frame, this); sh->old_loop_frame = NULL; sh->data_lock_held = _gf_true; - afr_sh_data_fxattrop (frame, this, afr_post_sh_data_fxattrop_cbk); + sh->sync_done = _gf_true; + afr_sh_data_fxattrop (frame, this); return 0; } @@ -1244,8 +1543,8 @@ afr_post_sh_big_lock_failure (call_frame_t *frame, xlator_t *this) int afr_sh_data_lock (call_frame_t *frame, xlator_t *this, - off_t start, off_t len, - afr_lock_cbk_t success_handler, + off_t start, off_t len, gf_boolean_t block, + char *dom, afr_lock_cbk_t success_handler, afr_lock_cbk_t failure_handler) { afr_local_t * local = NULL; @@ -1256,12 +1555,13 @@ afr_sh_data_lock (call_frame_t *frame, xlator_t *this, sh->data_lock_success_handler = success_handler; sh->data_lock_failure_handler = failure_handler; - return afr_sh_data_lock_rec 
(frame, this, start, len); + sh->data_lock_block = block; + return afr_sh_data_lock_rec (frame, this, dom, start, len); } int afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd) + int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) { afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; @@ -1287,20 +1587,20 @@ afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->loc.path, priv->children[child_index]->name, strerror (op_errno)); - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + } else { + gf_log (this->name, GF_LOG_TRACE, + "open of %s succeeded on child %s", + local->loc.path, + priv->children[child_index]->name); } - - gf_log (this->name, GF_LOG_TRACE, - "open of %s succeeded on child %s", - local->loc.path, - priv->children[child_index]->name); } UNLOCK (&frame->lock); call_count = afr_frame_return (frame); if (call_count == 0) { - if (sh->op_failed) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { afr_sh_data_fail (frame, this); return 0; } @@ -1309,9 +1609,8 @@ afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, "fd for %s opened, commencing sync", local->loc.path); - afr_sh_data_lock (frame, this, 0, 0, - afr_sh_data_big_lock_success, - afr_sh_data_fail); + afr_sh_data_lock (frame, this, 0, 0, _gf_true, priv->sh_domain, + afr_sh_dom_lock_success, afr_sh_data_fail); } return 0; @@ -1348,7 +1647,7 @@ afr_sh_data_open (call_frame_t *frame, xlator_t *this) priv->children[i], priv->children[i]->fops->open, &local->loc, - O_RDWR|O_LARGEFILE, fd, 0); + O_RDWR|O_LARGEFILE, fd, NULL); if (!--call_count) break; @@ -1357,20 +1656,93 @@ afr_sh_data_open (call_frame_t *frame, xlator_t *this) return 0; } +void +afr_sh_non_reg_fix (call_frame_t *frame, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_private_t *priv = NULL; + afr_self_heal_t *sh = NULL; + afr_local_t *local = NULL; + int i = 0; + + if 
(op_ret < 0) { + afr_sh_data_fail (frame, this); + return; + } + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + for (i = 0; i < priv->child_count ; i++) { + if (1 == local->child_up[i]) + sh->success[i] = 1; + } + + afr_sh_erase_pending (frame, this, AFR_DATA_TRANSACTION, + afr_sh_data_erase_pending_cbk, + afr_sh_data_finish); +} int -afr_self_heal_data (call_frame_t *frame, xlator_t *this) +afr_sh_non_reg_lock_success (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; - afr_private_t *priv = this->private; local = frame->local; sh = &local->self_heal; + sh->data_lock_held = _gf_true; + afr_sh_common_lookup (frame, this, &local->loc, + afr_sh_non_reg_fix, NULL, + AFR_LOOKUP_FAIL_CONFLICTS | + AFR_LOOKUP_FAIL_MISSING_GFIDS, + NULL); + return 0; +} +gf_boolean_t +afr_can_start_data_self_heal (afr_self_heal_t *sh, afr_private_t *priv) +{ + if (sh->force_confirm_spb) + return _gf_true; if (sh->do_data_self_heal && - afr_data_self_heal_enabled (priv->data_self_heal)) { - afr_sh_data_open (frame, this); + afr_data_self_heal_enabled (priv->data_self_heal)) + return _gf_true; + return _gf_false; +} + +int +afr_self_heal_data (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = this->private; + int ret = -1; + + local = frame->local; + sh = &local->self_heal; + + sh->sh_type_in_action = AFR_SELF_HEAL_DATA; + + if (afr_can_start_data_self_heal (sh, priv)) { + afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED); + ret = afr_inodelk_init (&local->internal_lock.inodelk[1], + priv->sh_domain, priv->child_count); + if (ret < 0) { + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + afr_sh_data_done (frame, this); + return 0; + } + + if (IA_ISREG (sh->type)) { + afr_sh_data_open (frame, this); + } else { + afr_sh_data_lock (frame, this, 0, 0, _gf_true, + this->name, + afr_sh_non_reg_lock_success, + afr_sh_data_fail); + } } else { 
gf_log (this->name, GF_LOG_TRACE, "not doing data self heal on %s", diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c index 12cba1f49..53491a1d7 100644 --- a/xlators/cluster/afr/src/afr-self-heal-entry.c +++ b/xlators/cluster/afr/src/afr-self-heal-entry.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ #include <libgen.h> @@ -59,6 +50,9 @@ } while (0); int +afr_sh_entry_impunge_create_file (call_frame_t *impunge_frame, xlator_t *this, + int child_index); +int afr_sh_entry_done (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; @@ -67,10 +61,6 @@ afr_sh_entry_done (call_frame_t *frame, xlator_t *this) local = frame->local; sh = &local->self_heal; - if (sh->healing_fd) - fd_unref (sh->healing_fd); - sh->healing_fd = NULL; - sh->completion_cbk (frame, this); return 0; @@ -112,7 +102,7 @@ afr_sh_entry_finish (call_frame_t *frame, xlator_t *this) int afr_sh_entry_erase_pending_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno, dict_t *xattr) + int32_t op_errno, dict_t *xattr, dict_t *xdata) { long i = 0; int call_count = 0; @@ -167,66 +157,20 @@ afr_sh_entry_erase_pending (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int i = 0; - dict_t **erase_xattr = NULL; - int need_unwind = 0; local = frame->local; sh = &local->self_heal; - priv = this->private; - - afr_sh_pending_to_delta (priv, sh->xattr, sh->delta_matrix, sh->success, - priv->child_count, AFR_ENTRY_TRANSACTION); - erase_xattr = GF_CALLOC (sizeof (*erase_xattr), priv->child_count, - gf_afr_mt_dict_t); - - for (i = 0; i < priv->child_count; i++) { - if (sh->xattr[i]) { - call_count++; - - erase_xattr[i] = get_new_dict(); - dict_ref (erase_xattr[i]); - } - } - - if (call_count == 0) - need_unwind = 1; - - afr_sh_delta_to_xattr (priv, sh->delta_matrix, erase_xattr, - priv->child_count, AFR_ENTRY_TRANSACTION); - - local->call_count = call_count; - for (i = 0; i < priv->child_count; i++) { - if (!erase_xattr[i]) - continue; - - gf_log (this->name, GF_LOG_TRACE, - "erasing pending flags from %s on %s", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_sh_entry_erase_pending_cbk, - (void *) (long) i, - priv->children[i], - 
priv->children[i]->fops->xattrop, - &local->loc, - GF_XATTROP_ADD_ARRAY, erase_xattr[i]); - if (!--call_count) - break; - } - - for (i = 0; i < priv->child_count; i++) { - if (erase_xattr[i]) { - dict_unref (erase_xattr[i]); - } + if (sh->entries_skipped) { + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + goto out; } - GF_FREE (erase_xattr); - - if (need_unwind) - afr_sh_entry_finish (frame, this); - + afr_sh_erase_pending (frame, this, AFR_ENTRY_TRANSACTION, + afr_sh_entry_erase_pending_cbk, + afr_sh_entry_finish); + return 0; +out: + afr_sh_entry_finish (frame, this); return 0; } @@ -311,8 +255,7 @@ int afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this); int -afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this, - int active_src); +afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this); int afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this); @@ -340,7 +283,8 @@ int afr_sh_entry_expunge_parent_setattr_cbk (call_frame_t *expunge_frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop) + struct iatt *preop, struct iatt *postop, + dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *expunge_local = NULL; @@ -376,7 +320,7 @@ afr_sh_entry_expunge_remove_cbk (call_frame_t *expunge_frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *expunge_local = NULL; @@ -404,7 +348,6 @@ afr_sh_entry_expunge_remove_cbk (call_frame_t *expunge_frame, void *cookie, } valid = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; - afr_build_parent_loc (&expunge_sh->parent_loc, &expunge_local->loc); STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_parent_setattr_cbk, (void *) (long) active_src, @@ -412,7 +355,7 @@ afr_sh_entry_expunge_remove_cbk (call_frame_t *expunge_frame, void *cookie, priv->children[active_src]->fops->setattr, 
&expunge_sh->parent_loc, &expunge_sh->parentbuf, - valid); + valid, NULL); return 0; } @@ -436,7 +379,7 @@ afr_sh_entry_expunge_unlink (call_frame_t *expunge_frame, xlator_t *this, (void *) (long) active_src, priv->children[active_src], priv->children[active_src]->fops->unlink, - &expunge_local->loc); + &expunge_local->loc, 0, NULL); return 0; } @@ -461,7 +404,7 @@ afr_sh_entry_expunge_rmdir (call_frame_t *expunge_frame, xlator_t *this, (void *) (long) active_src, priv->children[active_src], priv->children[active_src]->fops->rmdir, - &expunge_local->loc, 1); + &expunge_local->loc, 1, NULL); return 0; } @@ -469,7 +412,8 @@ afr_sh_entry_expunge_rmdir (call_frame_t *expunge_frame, xlator_t *this, int afr_sh_entry_expunge_remove (call_frame_t *expunge_frame, xlator_t *this, - int active_src, struct iatt *buf) + int active_src, struct iatt *buf, + struct iatt *parentbuf) { afr_private_t *priv = NULL; afr_local_t *expunge_local = NULL; @@ -478,6 +422,7 @@ afr_sh_entry_expunge_remove (call_frame_t *expunge_frame, xlator_t *this, int type = 0; afr_self_heal_t *sh = NULL; afr_local_t *local = NULL; + loc_t *loc = NULL; priv = this->private; expunge_local = expunge_frame->local; @@ -485,8 +430,11 @@ afr_sh_entry_expunge_remove (call_frame_t *expunge_frame, xlator_t *this, frame = expunge_sh->sh_frame; local = frame->local; sh = &local->self_heal; + loc = &expunge_local->loc; type = buf->ia_type; + if (loc->parent && uuid_is_null (loc->parent->gfid)) + uuid_copy (loc->pargfid, parentbuf->ia_gfid); switch (type) { case IA_IFSOCK: @@ -550,7 +498,8 @@ afr_sh_entry_expunge_lookup_cbk (call_frame_t *expunge_frame, void *cookie, goto out; } - afr_sh_entry_expunge_remove (expunge_frame, this, active_src, buf); + afr_sh_entry_expunge_remove (expunge_frame, this, active_src, buf, + postparent); return 0; out: @@ -579,7 +528,7 @@ afr_sh_entry_expunge_purge (call_frame_t *expunge_frame, xlator_t *this, (void *) (long) active_src, priv->children[active_src], 
priv->children[active_src]->fops->lookup, - &expunge_local->loc, 0); + &expunge_local->loc, NULL); return 0; } @@ -631,7 +580,8 @@ afr_sh_entry_expunge_entry_cbk (call_frame_t *expunge_frame, void *cookie, if (need_expunge) { gf_log (this->name, GF_LOG_INFO, - "missing entry %s on %s", + "Entry %s is missing on %s and deleting from " + "replica's other bricks", expunge_local->loc.path, priv->children[source]->name); @@ -663,6 +613,19 @@ out: return 0; } +static gf_boolean_t +can_skip_entry_self_heal (char *name, loc_t *parent_loc) +{ + if (strcmp (name, ".") == 0) { + return _gf_true; + } else if (strcmp (name, "..") == 0) { + return _gf_true; + } else if (loc_is_root (parent_loc) && + (strcmp (name, GF_REPLICATE_TRASH_DIR) == 0)) { + return _gf_true; + } + return _gf_false; +} int afr_sh_entry_expunge_entry (call_frame_t *frame, xlator_t *this, @@ -690,21 +653,13 @@ afr_sh_entry_expunge_entry (call_frame_t *frame, xlator_t *this, sh->expunge_done = afr_sh_entry_expunge_entry_done; name = entry->d_name; - - if ((strcmp (name, ".") == 0) - || (strcmp (name, "..") == 0) - || ((strcmp (local->loc.path, "/") == 0) - && (strcmp (name, GF_REPLICATE_TRASH_DIR) == 0))) { - - gf_log (this->name, GF_LOG_TRACE, - "skipping inspection of %s under %s", - name, local->loc.path); + if (can_skip_entry_self_heal (name, &local->loc)) { op_ret = 0; goto out; } gf_log (this->name, GF_LOG_TRACE, - "inspecting existance of %s under %s", + "inspecting existence of %s under %s", name, local->loc.path); expunge_frame = copy_frame (frame); @@ -713,15 +668,17 @@ afr_sh_entry_expunge_entry (call_frame_t *frame, xlator_t *this, goto out; } - ALLOC_OR_GOTO (expunge_local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (expunge_local, out); expunge_frame->local = expunge_local; expunge_sh = &expunge_local->self_heal; expunge_sh->sh_frame = frame; expunge_sh->active_source = active_src; expunge_sh->entrybuf = entry->d_stat; + loc_copy (&expunge_sh->parent_loc, &local->loc); - ret = 
afr_build_child_loc (this, &expunge_local->loc, &local->loc, name); + ret = afr_build_child_loc (this, &expunge_local->loc, &local->loc, + name); if (ret != 0) { op_errno = EINVAL; goto out; @@ -736,7 +693,7 @@ afr_sh_entry_expunge_entry (call_frame_t *frame, xlator_t *this, (void *) (long) source, priv->children[source], priv->children[source]->fops->lookup, - &expunge_local->loc, 0); + &expunge_local->loc, NULL); ret = 0; out: @@ -751,7 +708,7 @@ int afr_sh_entry_expunge_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - gf_dirent_t *entries) + gf_dirent_t *entries, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; @@ -819,7 +776,7 @@ afr_sh_entry_expunge_subvol (call_frame_t *frame, xlator_t *this, STACK_WIND (frame, afr_sh_entry_expunge_readdir_cbk, priv->children[active_src], priv->children[active_src]->fops->readdirp, - sh->healing_fd, sh->block_size, sh->offset); + sh->healing_fd, sh->block_size, sh->offset, NULL); return 0; } @@ -849,7 +806,7 @@ afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this) active_src = next_active_sink (frame, this, sh->active_source); sh->active_source = active_src; - if (sh->op_failed) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { goto out; } @@ -874,15 +831,19 @@ out: int afr_sh_entry_impunge_entry_done (call_frame_t *frame, xlator_t *this, - int active_src, int32_t op_ret, - int32_t op_errno) + int32_t op_ret, int32_t op_errno) { int call_count = 0; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + local = frame->local; + sh = &local->self_heal; + if (op_ret < 0) + sh->entries_skipped = _gf_true; call_count = afr_frame_return (frame); - if (call_count == 0) - afr_sh_entry_impunge_subvol (frame, this, active_src); + afr_sh_entry_impunge_subvol (frame, this); return 0; } @@ -896,22 +857,20 @@ afr_sh_entry_call_impunge_done (call_frame_t *impunge_frame, xlator_t *this, afr_self_heal_t *sh = NULL; afr_self_heal_t *impunge_sh = NULL; 
call_frame_t *frame = NULL; - int32_t impunge_ret_child = 0; AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh, frame, local, sh); - impunge_ret_child = impunge_sh->impunge_ret_child; AFR_STACK_DESTROY (impunge_frame); - sh->impunge_done (frame, this, impunge_ret_child, op_ret, - op_errno); + sh->impunge_done (frame, this, op_ret, op_errno); } int afr_sh_entry_impunge_setattr_cbk (call_frame_t *impunge_frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop) + struct iatt *preop, struct iatt *postop, + dict_t *xdata) { int call_count = 0; afr_private_t *priv = NULL; @@ -923,7 +882,7 @@ afr_sh_entry_impunge_setattr_cbk (call_frame_t *impunge_frame, void *cookie, child_index = (long) cookie; if (op_ret == 0) { - gf_log (this->name, GF_LOG_TRACE, + gf_log (this->name, GF_LOG_DEBUG, "setattr done for %s on %s", impunge_local->loc.path, priv->children[child_index]->name); @@ -935,31 +894,114 @@ afr_sh_entry_impunge_setattr_cbk (call_frame_t *impunge_frame, void *cookie, strerror (op_errno)); } - LOCK (&impunge_frame->lock); - { - call_count = --impunge_local->call_count; + call_count = afr_frame_return (impunge_frame); + if (call_count == 0) { + afr_sh_entry_call_impunge_done (impunge_frame, this, + 0, op_errno); } - UNLOCK (&impunge_frame->lock); - if (call_count == 0) - afr_sh_entry_call_impunge_done (impunge_frame, this, - op_ret, op_errno); + return 0; +} + +int +afr_sh_entry_impunge_parent_setattr_cbk (call_frame_t *setattr_frame, + void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preop, struct iatt *postop, + dict_t *xdata) +{ + int call_count = 0; + afr_local_t *setattr_local = NULL; + + setattr_local = setattr_frame->local; + if (op_ret != 0) { + gf_log (this->name, GF_LOG_INFO, + "setattr on parent directory (%s) failed: %s", + setattr_local->loc.path, strerror (op_errno)); + } + call_count = afr_frame_return (setattr_frame); + if (call_count == 0) + 
AFR_STACK_DESTROY (setattr_frame); return 0; } +int +afr_sh_entry_impunge_setattr (call_frame_t *impunge_frame, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_local_t *setattr_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + call_frame_t *setattr_frame = NULL; + int32_t valid = 0; + int32_t op_errno = 0; + int child_index = 0; + int call_count = 0; + int i = 0; + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + + gf_log (this->name, GF_LOG_DEBUG, + "setting ownership of %s on %s to %d/%d", + impunge_local->loc.path, + priv->children[child_index]->name, + impunge_sh->entrybuf.ia_uid, + impunge_sh->entrybuf.ia_gid); + + setattr_frame = copy_frame (impunge_frame); + if (!setattr_frame) { + op_errno = ENOMEM; + goto out; + } + AFR_LOCAL_ALLOC_OR_GOTO (setattr_frame->local, out); + setattr_local = setattr_frame->local; + call_count = afr_errno_count (NULL, impunge_sh->child_errno, + priv->child_count, 0); + loc_copy (&setattr_local->loc, &impunge_sh->parent_loc); + impunge_local->call_count = call_count; + setattr_local->call_count = call_count; + for (i = 0; i < priv->child_count; i++) { + if (impunge_sh->child_errno[i]) + continue; + valid = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; + STACK_WIND_COOKIE (setattr_frame, + afr_sh_entry_impunge_parent_setattr_cbk, + (void *) (long) i, priv->children[i], + priv->children[i]->fops->setattr, + &setattr_local->loc, + &impunge_sh->parentbuf, valid, NULL); + + valid = GF_SET_ATTR_UID | GF_SET_ATTR_GID | + GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; + STACK_WIND_COOKIE (impunge_frame, + afr_sh_entry_impunge_setattr_cbk, + (void *) (long) i, priv->children[i], + priv->children[i]->fops->setattr, + &impunge_local->loc, + &impunge_sh->entrybuf, valid, NULL); + call_count--; + } + GF_ASSERT (!call_count); + return 0; +out: + if (setattr_frame) + AFR_STACK_DESTROY (setattr_frame); + afr_sh_entry_call_impunge_done (impunge_frame, this, 
0, op_errno); + return 0; +} int afr_sh_entry_impunge_xattrop_cbk (call_frame_t *impunge_frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *xattr) + dict_t *xattr, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *impunge_local = NULL; int child_index = 0; - struct iatt stbuf = {0}; - int32_t valid = 0; priv = this->private; impunge_local = impunge_frame->local; @@ -972,55 +1014,59 @@ afr_sh_entry_impunge_xattrop_cbk (call_frame_t *impunge_frame, void *cookie, impunge_local->loc.path, priv->children[child_index]->name, strerror (op_errno)); + goto out; } - gf_log (this->name, GF_LOG_TRACE, - "setting ownership of %s on %s to %d/%d", - impunge_local->loc.path, - priv->children[child_index]->name, - impunge_local->cont.lookup.buf.ia_uid, - impunge_local->cont.lookup.buf.ia_gid); - - stbuf.ia_atime = impunge_local->cont.lookup.buf.ia_atime; - stbuf.ia_atime_nsec = impunge_local->cont.lookup.buf.ia_atime_nsec; - stbuf.ia_mtime = impunge_local->cont.lookup.buf.ia_mtime; - stbuf.ia_mtime_nsec = impunge_local->cont.lookup.buf.ia_mtime_nsec; - - stbuf.ia_uid = impunge_local->cont.lookup.buf.ia_uid; - stbuf.ia_gid = impunge_local->cont.lookup.buf.ia_gid; - - valid = GF_SET_ATTR_UID | GF_SET_ATTR_GID | - GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; - - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_setattr_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->setattr, - &impunge_local->loc, - &stbuf, valid); + afr_sh_entry_impunge_setattr (impunge_frame, this); + return 0; +out: + afr_sh_entry_call_impunge_done (impunge_frame, this, + -1, op_errno); return 0; } - int -afr_sh_entry_impunge_parent_setattr_cbk (call_frame_t *setattr_frame, - void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop) +afr_sh_entry_impunge_perform_xattrop (call_frame_t *impunge_frame, + xlator_t *this) { - loc_t *parent_loc = cookie; + int active_src = 0; + 
dict_t *xattr = NULL; + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + int32_t op_errno = 0; - if (op_ret != 0) { - gf_log (this->name, GF_LOG_INFO, - "setattr on parent directory (%s) failed: %s", - parent_loc->path, strerror (op_errno)); + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + active_src = impunge_sh->active_source; + + afr_prepare_new_entry_pending_matrix (impunge_local->pending, + afr_is_errno_unset, + impunge_sh->child_errno, + &impunge_sh->entrybuf, + priv->child_count); + xattr = dict_new (); + if (!xattr) { + op_errno = ENOMEM; + goto out; } - loc_wipe (parent_loc); + afr_set_pending_dict (priv, xattr, impunge_local->pending, active_src, + LOCAL_LAST); - GF_FREE (parent_loc); + STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_xattrop_cbk, + (void *) (long) active_src, + priv->children[active_src], + priv->children[active_src]->fops->xattrop, + &impunge_local->loc, GF_XATTROP_ADD_ARRAY, xattr, NULL); - AFR_STACK_DESTROY (setattr_frame); + if (xattr) + dict_unref (xattr); + return 0; +out: + afr_sh_entry_call_impunge_done (impunge_frame, this, + -1, op_errno); return 0; } @@ -1030,124 +1076,165 @@ afr_sh_entry_impunge_newfile_cbk (call_frame_t *impunge_frame, void *cookie, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *stbuf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { int call_count = 0; afr_private_t *priv = NULL; afr_local_t *impunge_local = NULL; afr_self_heal_t *impunge_sh = NULL; - int active_src = 0; int child_index = 0; - int32_t *pending_array = NULL; - dict_t *xattr = NULL; - int ret = 0; - int idx = 0; - call_frame_t *setattr_frame = NULL; - int32_t valid = 0; - loc_t *parent_loc = NULL; - struct iatt parentbuf = {0,}; priv = this->private; impunge_local = impunge_frame->local; impunge_sh = &impunge_local->self_heal; - active_src = 
impunge_sh->active_source; child_index = (long) cookie; if (op_ret == -1) { - ret = -1; + impunge_sh->child_errno[child_index] = op_errno; gf_log (this->name, GF_LOG_ERROR, "creation of %s on %s failed (%s)", impunge_local->loc.path, priv->children[child_index]->name, strerror (op_errno)); - goto out; + } else { + impunge_sh->child_errno[child_index] = 0; } - inode->ia_type = stbuf->ia_type; - - xattr = dict_new (); - if (!xattr) { - ret = -1; - goto out; + call_count = afr_frame_return (impunge_frame); + if (call_count == 0) { + if (!afr_errno_count (NULL, impunge_sh->child_errno, + priv->child_count, 0)) { + // new_file creation failed every where + afr_sh_entry_call_impunge_done (impunge_frame, this, + -1, op_errno); + goto out; + } + afr_sh_entry_impunge_perform_xattrop (impunge_frame, this); } +out: + return 0; +} - pending_array = (int32_t*) GF_CALLOC (3, sizeof (*pending_array), - gf_afr_mt_int32_t); +int +afr_sh_entry_impunge_hardlink_cbk (call_frame_t *impunge_frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + int call_count = 0; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; - if (!pending_array) { - ret = -1; - goto out; - } + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; - /* Pending data xattrs shouldn't be set for special files - */ - idx = afr_index_for_transaction_type (AFR_METADATA_TRANSACTION); - pending_array[idx] = hton32 (1); - if (IA_ISDIR (stbuf->ia_type)) - idx = afr_index_for_transaction_type (AFR_ENTRY_TRANSACTION); - else if (IA_ISREG (stbuf->ia_type)) - idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION); - else - goto cont; - pending_array[idx] = hton32 (1); - -cont: - ret = dict_set_dynptr (xattr, priv->pending_key[child_index], - pending_array, - 3 * sizeof (*pending_array)); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - 
"Unable to set dict value."); - } else { - pending_array = NULL; + if (IA_IFLNK == impunge_sh->entrybuf.ia_type) { + //For symlinks impunge is attempted un-conditionally + //So the file can already exist. + if ((op_ret < 0) && (op_errno == EEXIST)) + op_ret = 0; } - valid = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; - parentbuf = impunge_sh->parentbuf; - setattr_frame = copy_frame (impunge_frame); - parent_loc = GF_CALLOC (1, sizeof (*parent_loc), - gf_afr_mt_loc_t); - if (!parent_loc) { - ret = -1; - goto out; - } - afr_build_parent_loc (parent_loc, &impunge_local->loc); + call_count = afr_frame_return (impunge_frame); + if (call_count == 0) + afr_sh_entry_call_impunge_done (impunge_frame, this, + op_ret, op_errno); - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_xattrop_cbk, - (void *) (long) child_index, - priv->children[active_src], - priv->children[active_src]->fops->xattrop, - &impunge_local->loc, GF_XATTROP_ADD_ARRAY, xattr); + return 0; +} - STACK_WIND_COOKIE (setattr_frame, afr_sh_entry_impunge_parent_setattr_cbk, - (void *) (long) parent_loc, - priv->children[child_index], - priv->children[child_index]->fops->setattr, - parent_loc, &parentbuf, valid); +int +afr_sh_entry_impunge_hardlink (call_frame_t *impunge_frame, xlator_t *this, + int child_index) +{ + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + loc_t *loc = NULL; + struct iatt *buf = NULL; + loc_t oldloc = {0}; -out: - if (xattr) - dict_unref (xattr); + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + loc = &impunge_local->loc; + buf = &impunge_sh->entrybuf; - if (ret) { - if (pending_array) - GF_FREE (pending_array); + oldloc.inode = inode_ref (loc->inode); + uuid_copy (oldloc.gfid, buf->ia_gfid); + gf_log (this->name, GF_LOG_DEBUG, "linking missing file %s on %s", + loc->path, priv->children[child_index]->name); - LOCK (&impunge_frame->lock); - { - call_count = 
--impunge_local->call_count; - } - UNLOCK (&impunge_frame->lock); + STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_hardlink_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->link, + &oldloc, loc, NULL); + loc_wipe (&oldloc); - if (call_count == 0) - afr_sh_entry_call_impunge_done (impunge_frame, this, - -1, op_errno); - } + return 0; +} +int +afr_sh_nameless_lookup_cbk (call_frame_t *impunge_frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xattr, + struct iatt *postparent) +{ + if (op_ret < 0) { + afr_sh_entry_impunge_create_file (impunge_frame, this, + (long)cookie); + } else { + afr_sh_entry_impunge_hardlink (impunge_frame, this, + (long)cookie); + } return 0; } +int +afr_sh_entry_impunge_check_hardlink (call_frame_t *impunge_frame, + xlator_t *this, + int child_index, struct iatt *stbuf) +{ + afr_private_t *priv = NULL; + call_frame_t *frame = NULL; + afr_local_t *impunge_local = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *impunge_sh = NULL; + afr_self_heal_t *sh = NULL; + loc_t *loc = NULL; + dict_t *xattr_req = NULL; + loc_t oldloc = {0}; + int ret = -1; + + priv = this->private; + AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh, + frame, local, sh); + loc = &impunge_local->loc; + + xattr_req = dict_new (); + if (!xattr_req) + goto out; + oldloc.inode = inode_ref (loc->inode); + uuid_copy (oldloc.gfid, stbuf->ia_gfid); + + STACK_WIND_COOKIE (impunge_frame, afr_sh_nameless_lookup_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->lookup, + &oldloc, xattr_req); + ret = 0; +out: + if (xattr_req) + dict_unref (xattr_req); + loc_wipe (&oldloc); + if (ret) + sh->impunge_done (frame, this, -1, ENOMEM); + return 0; +} int afr_sh_entry_impunge_mknod (call_frame_t *impunge_frame, xlator_t *this, @@ -1176,6 +1263,35 @@ afr_sh_entry_impunge_mknod (call_frame_t 
*impunge_frame, xlator_t *this, gf_log (this->name, GF_LOG_INFO, "%s: gfid set failed", impunge_local->loc.path); + /* + * Reason for adding GLUSTERFS_INTERNAL_FOP_KEY : + * + * Problem: + * While a brick is down in a replica pair, lets say the user creates + * one file(file-A) and a hard link to that file(h-file-A). After the + * brick comes back up, entry self-heal is attempted on parent dir of + * these two files. As part of readdir in self-heal it reads both the + * entries file-A and h-file-A for both of them it does name less lookup + * to check if there are any hardlinks already present in the + * destination brick. It finds that there are no hard links already + * present for files file-A, h-file-A. Self-heal does mknods for both + * file-A and h-file-A. This leads to file-A and h-file-A not being + * hardlinks anymore. + * + * Fix: (More like shrinking of race-window, the race itself is still + * present in posix-mknod). + * If mknod comes with the presence of GLUSTERFS_INTERNAL_FOP_KEY then + * posix_mknod checks if there are already any gfid-links and does + * link() instead of mknod. There still can be a race where two + * posix_mknods same gfid see that + * gfid-link file is not present and proceeds with mknods and result in + * two different files with same gfid. 
+ */ + ret = dict_set_str (dict, GLUSTERFS_INTERNAL_FOP_KEY, "yes"); + if (ret) + gf_log (this->name, GF_LOG_INFO, "%s: %s set failed", + impunge_local->loc.path, GLUSTERFS_INTERNAL_FOP_KEY); + STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk, (void *) (long) child_index, priv->children[child_index], @@ -1183,7 +1299,7 @@ afr_sh_entry_impunge_mknod (call_frame_t *impunge_frame, xlator_t *this, &impunge_local->loc, st_mode_from_ia (stbuf->ia_prot, stbuf->ia_type), makedev (ia_major (stbuf->ia_rdev), - ia_minor (stbuf->ia_rdev)), dict); + ia_minor (stbuf->ia_rdev)), 0, dict); if (dict) dict_unref (dict); @@ -1230,7 +1346,7 @@ afr_sh_entry_impunge_mkdir (call_frame_t *impunge_frame, xlator_t *this, priv->children[child_index]->fops->mkdir, &impunge_local->loc, st_mode_from_ia (stbuf->ia_prot, stbuf->ia_type), - dict); + 0, dict); if (dict) dict_unref (dict); @@ -1252,7 +1368,7 @@ afr_sh_entry_impunge_symlink (call_frame_t *impunge_frame, xlator_t *this, priv = this->private; impunge_local = impunge_frame->local; - buf = &impunge_local->cont.symlink.buf; + buf = &impunge_local->cont.dir_fop.buf; dict = dict_new (); if (!dict) { @@ -1277,7 +1393,7 @@ afr_sh_entry_impunge_symlink (call_frame_t *impunge_frame, xlator_t *this, (void *) (long) child_index, priv->children[child_index], priv->children[child_index]->fops->symlink, - linkname, &impunge_local->loc, dict); + linkname, &impunge_local->loc, 0, dict); if (dict) dict_unref (dict); @@ -1291,7 +1407,7 @@ afr_sh_entry_impunge_symlink_unlink_cbk (call_frame_t *impunge_frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *impunge_local = NULL; @@ -1352,7 +1468,7 @@ afr_sh_entry_impunge_symlink_unlink (call_frame_t *impunge_frame, xlator_t *this (void *) (long) child_index, priv->children[child_index], priv->children[child_index]->fops->unlink, - 
&impunge_local->loc); + &impunge_local->loc, 0, NULL); return 0; } @@ -1362,7 +1478,7 @@ int afr_sh_entry_impunge_readlink_sink_cbk (call_frame_t *impunge_frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - const char *linkname, struct iatt *sbuf) + const char *linkname, struct iatt *sbuf, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *impunge_local = NULL; @@ -1444,7 +1560,7 @@ afr_sh_entry_impunge_readlink_sink (call_frame_t *impunge_frame, xlator_t *this, (void *) (long) child_index, priv->children[child_index], priv->children[child_index]->fops->readlink, - &impunge_local->loc, 4096); + &impunge_local->loc, 4096, NULL); return 0; } @@ -1454,7 +1570,7 @@ int afr_sh_entry_impunge_readlink_cbk (call_frame_t *impunge_frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - const char *linkname, struct iatt *sbuf) + const char *linkname, struct iatt *sbuf, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *impunge_local = NULL; @@ -1512,36 +1628,84 @@ afr_sh_entry_impunge_readlink (call_frame_t *impunge_frame, xlator_t *this, impunge_local = impunge_frame->local; impunge_sh = &impunge_local->self_heal; active_src = impunge_sh->active_source; - impunge_local->cont.symlink.buf = *stbuf; + impunge_local->cont.dir_fop.buf = *stbuf; STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_readlink_cbk, (void *) (long) child_index, priv->children[active_src], priv->children[active_src]->fops->readlink, - &impunge_local->loc, 4096); + &impunge_local->loc, 4096, NULL); return 0; } int afr_sh_entry_impunge_create (call_frame_t *impunge_frame, xlator_t *this, - int child_index, struct iatt *buf, - struct iatt *postparent) + int child_index) { - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; + call_frame_t *frame = NULL; + afr_local_t *impunge_local = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *impunge_sh = NULL; + afr_self_heal_t *sh = NULL; afr_private_t *priv = NULL; ia_type_t type = 
IA_INVAL; - int ret = 0; int active_src = 0; + struct iatt *buf = NULL; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - impunge_sh->parentbuf = *postparent; + AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh, + frame, local, sh); active_src = impunge_sh->active_source; - impunge_local->cont.lookup.buf = *buf; - afr_update_loc_gfids (&impunge_local->loc, buf, postparent); + afr_update_loc_gfids (&impunge_local->loc, &impunge_sh->entrybuf, + &impunge_sh->parentbuf); + buf = &impunge_sh->entrybuf; + type = buf->ia_type; + + switch (type) { + case IA_IFSOCK: + case IA_IFREG: + case IA_IFBLK: + case IA_IFCHR: + case IA_IFIFO: + case IA_IFLNK: + afr_sh_entry_impunge_check_hardlink (impunge_frame, this, + child_index, buf); + break; + case IA_IFDIR: + afr_sh_entry_impunge_mkdir (impunge_frame, this, + child_index, buf); + break; + default: + gf_log (this->name, GF_LOG_ERROR, + "%s has unknown file type on %s: 0%o", + impunge_local->loc.path, + priv->children[active_src]->name, type); + sh->impunge_done (frame, this, -1, EINVAL); + break; + } + + return 0; +} + +int +afr_sh_entry_impunge_create_file (call_frame_t *impunge_frame, xlator_t *this, + int child_index) +{ + call_frame_t *frame = NULL; + afr_local_t *impunge_local = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *impunge_sh = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + ia_type_t type = IA_INVAL; + int active_src = 0; + struct iatt *buf = NULL; + + AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh, + frame, local, sh); + active_src = impunge_sh->active_source; + buf = &impunge_sh->entrybuf; type = buf->ia_type; switch (type) { @@ -1557,41 +1721,30 @@ afr_sh_entry_impunge_create (call_frame_t *impunge_frame, xlator_t *this, afr_sh_entry_impunge_readlink (impunge_frame, this, child_index, buf); break; - case IA_IFDIR: - afr_sh_entry_impunge_mkdir (impunge_frame, this, - child_index, buf); - break; default: gf_log 
(this->name, GF_LOG_ERROR, "%s has unknown file type on %s: 0%o", impunge_local->loc.path, priv->children[active_src]->name, type); - ret = -1; + sh->impunge_done (frame, this, -1, EINVAL); break; } - return ret; + return 0; } gf_boolean_t -afr_sh_need_recreate (afr_self_heal_t *impunge_sh, int *sources, - unsigned int child, unsigned int child_count) +afr_sh_need_recreate (afr_self_heal_t *impunge_sh, unsigned int child, + unsigned int child_count) { - int32_t *success_children = NULL; gf_boolean_t recreate = _gf_false; - GF_ASSERT (impunge_sh->impunging_entry_mode); GF_ASSERT (impunge_sh->child_errno); - GF_ASSERT (sources); - success_children = impunge_sh->success_children; - if (sources[child] || (child == impunge_sh->active_source)) { - GF_ASSERT (afr_is_child_present (success_children, - child_count, child)); + if (child == impunge_sh->active_source) goto out; - } - if (IA_ISLNK (impunge_sh->impunging_entry_mode)) { + if (IA_IFLNK == impunge_sh->entrybuf.ia_type) { recreate = _gf_true; goto out; } @@ -1610,7 +1763,7 @@ afr_sh_recreate_count (afr_self_heal_t *impunge_sh, int *sources, int i = 0; for (i = 0; i < child_count; i++) { - if (afr_sh_need_recreate (impunge_sh, sources, i, child_count)) + if (afr_sh_need_recreate (impunge_sh, i, child_count)) count++; } @@ -1627,8 +1780,6 @@ afr_sh_entry_call_impunge_recreate (call_frame_t *impunge_frame, call_frame_t *frame = NULL; afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; - struct iatt *buf = NULL; - struct iatt *postparent = NULL; unsigned int recreate_count = 0; int i = 0; int active_src = 0; @@ -1636,24 +1787,34 @@ afr_sh_entry_call_impunge_recreate (call_frame_t *impunge_frame, priv = this->private; AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh, frame, local, sh); - active_src = impunge_sh->active_source; - buf = &impunge_sh->buf[active_src]; - postparent = &impunge_sh->parentbufs[active_src]; - + active_src = impunge_sh->active_source; + impunge_sh->entrybuf = 
impunge_sh->buf[active_src]; + impunge_sh->parentbuf = impunge_sh->parentbufs[active_src]; recreate_count = afr_sh_recreate_count (impunge_sh, sh->sources, priv->child_count); - GF_ASSERT (recreate_count); + if (!recreate_count) { + afr_sh_entry_call_impunge_done (impunge_frame, this, 0, 0); + goto out; + } impunge_local->call_count = recreate_count; for (i = 0; i < priv->child_count; i++) { - if (afr_sh_need_recreate (impunge_sh, sh->sources, i, - priv->child_count)) { - (void)afr_sh_entry_impunge_create (impunge_frame, this, - i, buf, - postparent); - recreate_count--; + if (!impunge_local->child_up[i]) { + impunge_sh->child_errno[i] = ENOTCONN; + continue; } + if (!afr_sh_need_recreate (impunge_sh, i, priv->child_count)) { + impunge_sh->child_errno[i] = EEXIST; + continue; + } + } + for (i = 0; i < priv->child_count; i++) { + if (!afr_sh_need_recreate (impunge_sh, i, priv->child_count)) + continue; + (void)afr_sh_entry_impunge_create (impunge_frame, this, i); + recreate_count--; } GF_ASSERT (!recreate_count); +out: return 0; } @@ -1667,7 +1828,6 @@ afr_sh_entry_common_lookup_done (call_frame_t *impunge_frame, xlator_t *this, call_frame_t *frame = NULL; afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; - unsigned int recreate_count = 0; unsigned int gfid_miss_count = 0; unsigned int children_up_count = 0; uuid_t gfid = {0}; @@ -1706,20 +1866,19 @@ afr_sh_entry_common_lookup_done (call_frame_t *impunge_frame, xlator_t *this, afr_update_gfid_from_iatts (gfid, impunge_sh->buf, impunge_sh->success_children, priv->child_count); - if (uuid_is_null (gfid)) - uuid_generate (gfid); + if (uuid_is_null (gfid)) { + sh->entries_skipped = _gf_true; + gf_log (this->name, GF_LOG_INFO, "%s: Skipping entry " + "self-heal because of gfid absence", + impunge_local->loc.path); + goto done; + } afr_sh_common_lookup (impunge_frame, this, &impunge_local->loc, afr_sh_entry_common_lookup_done, gfid, AFR_LOOKUP_FAIL_CONFLICTS | - AFR_LOOKUP_FAIL_MISSING_GFIDS); + 
AFR_LOOKUP_FAIL_MISSING_GFIDS, + NULL); } else { - recreate_count = afr_sh_recreate_count (impunge_sh, sh->sources, - priv->child_count); - if (!recreate_count) { - op_ret = 0; - op_errno = 0; - goto done; - } afr_sh_entry_call_impunge_recreate (impunge_frame, this); } return; @@ -1735,13 +1894,13 @@ afr_sh_entry_impunge_entry (call_frame_t *frame, xlator_t *this, { afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; + afr_self_heal_t *impunge_sh = NULL; int ret = -1; call_frame_t *impunge_frame = NULL; afr_local_t *impunge_local = NULL; int active_src = 0; int op_errno = 0; int op_ret = -1; - mode_t entry_mode = 0; local = frame->local; sh = &local->self_heal; @@ -1749,34 +1908,27 @@ afr_sh_entry_impunge_entry (call_frame_t *frame, xlator_t *this, active_src = sh->active_source; sh->impunge_done = afr_sh_entry_impunge_entry_done; - if ((strcmp (entry->d_name, ".") == 0) - || (strcmp (entry->d_name, "..") == 0) - || ((strcmp (local->loc.path, "/") == 0) - && (strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR) == 0))) { - - gf_log (this->name, GF_LOG_TRACE, - "skipping inspection of %s under %s", - entry->d_name, local->loc.path); + if (can_skip_entry_self_heal (entry->d_name, &local->loc)) { op_ret = 0; goto out; } gf_log (this->name, GF_LOG_TRACE, - "inspecting existance of %s under %s", + "inspecting existence of %s under %s", entry->d_name, local->loc.path); - entry_mode = st_mode_from_ia (entry->d_stat.ia_prot, - entry->d_stat.ia_type); - ret = afr_impunge_frame_create (frame, this, active_src, active_src, - entry_mode, &impunge_frame); + ret = afr_impunge_frame_create (frame, this, active_src, + &impunge_frame); if (ret) { op_errno = -ret; goto out; } impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; ret = afr_build_child_loc (this, &impunge_local->loc, &local->loc, entry->d_name); + loc_copy (&impunge_sh->parent_loc, &local->loc); if (ret != 0) { op_errno = ENOMEM; goto out; @@ -1784,14 +1936,14 @@ afr_sh_entry_impunge_entry 
(call_frame_t *frame, xlator_t *this, afr_sh_common_lookup (impunge_frame, this, &impunge_local->loc, afr_sh_entry_common_lookup_done, NULL, - AFR_LOOKUP_FAIL_CONFLICTS); + AFR_LOOKUP_FAIL_CONFLICTS, NULL); op_ret = 0; out: if (ret) { if (impunge_frame) AFR_STACK_DESTROY (impunge_frame); - sh->impunge_done (frame, this, active_src, op_ret, op_errno); + sh->impunge_done (frame, this, op_ret, op_errno); } return 0; @@ -1802,7 +1954,7 @@ int afr_sh_entry_impunge_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - gf_dirent_t *entries) + gf_dirent_t *entries, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; @@ -1825,6 +1977,7 @@ afr_sh_entry_impunge_readdir_cbk (call_frame_t *frame, void *cookie, local->loc.path, priv->children[active_src]->name, strerror (op_errno)); + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); } else { gf_log (this->name, GF_LOG_TRACE, "readdir of %s on subvolume %s complete", @@ -1841,7 +1994,7 @@ afr_sh_entry_impunge_readdir_cbk (call_frame_t *frame, void *cookie, entry_count++; } - gf_log (this->name, GF_LOG_TRACE, + gf_log (this->name, GF_LOG_DEBUG, "readdir'ed %d entries from %s", entry_count, priv->children[active_src]->name); @@ -1857,21 +2010,24 @@ afr_sh_entry_impunge_readdir_cbk (call_frame_t *frame, void *cookie, int -afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this, - int active_src) +afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this) { afr_private_t *priv = NULL; afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; + int32_t active_src = 0; priv = this->private; local = frame->local; sh = &local->self_heal; + active_src = sh->active_source; + gf_log (this->name, GF_LOG_DEBUG, "%s: readdir from offset %zd", + local->loc.path, sh->offset); STACK_WIND (frame, afr_sh_entry_impunge_readdir_cbk, priv->children[active_src], priv->children[active_src]->fops->readdirp, - sh->healing_fd, sh->block_size, sh->offset); + sh->healing_fd, 
sh->block_size, sh->offset, NULL); return 0; } @@ -1894,7 +2050,7 @@ afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this) active_src = next_active_source (frame, this, sh->active_source); sh->active_source = active_src; - if (sh->op_failed) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { afr_sh_entry_finish (frame, this); return 0; } @@ -1909,7 +2065,7 @@ afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this) "impunging entries of %s on %s to other sinks", local->loc.path, priv->children[active_src]->name); - afr_sh_entry_impunge_subvol (frame, this, active_src); + afr_sh_entry_impunge_subvol (frame, this); return 0; } @@ -1917,7 +2073,7 @@ afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this) int afr_sh_entry_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd) + int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) { afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; @@ -1943,7 +2099,7 @@ afr_sh_entry_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->loc.path, priv->children[child_index]->name, strerror (op_errno)); - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); } } UNLOCK (&frame->lock); @@ -1951,7 +2107,7 @@ afr_sh_entry_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, call_count = afr_frame_return (frame); if (call_count == 0) { - if (sh->op_failed) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { afr_sh_entry_finish (frame, this); return 0; } @@ -1989,7 +2145,7 @@ afr_sh_entry_open (call_frame_t *frame, xlator_t *this) source = local->self_heal.source; sources = local->self_heal.sources; - sh->block_size = 65536; //131072 + sh->block_size = priv->sh_readdir_size; sh->offset = 0; call_count = sh->active_sinks; @@ -2011,7 +2167,7 @@ afr_sh_entry_open (call_frame_t *frame, xlator_t *this) (void *) (long) source, priv->children[source], priv->children[source]->fops->opendir, - &local->loc, 
fd); + &local->loc, fd, NULL); call_count--; } @@ -2028,7 +2184,7 @@ afr_sh_entry_open (call_frame_t *frame, xlator_t *this) (void *) (long) i, priv->children[i], priv->children[i]->fops->opendir, - &local->loc, fd); + &local->loc, fd, NULL); if (!--call_count) break; @@ -2083,6 +2239,8 @@ afr_sh_entry_sync_prepare (call_frame_t *frame, xlator_t *this) "merging all entries as a conservative decision", local->loc.path); + sh->actual_sh_started = _gf_true; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_SYNC_BEGIN); afr_sh_entry_open (frame, this); return 0; @@ -2097,15 +2255,15 @@ afr_sh_entry_fix (call_frame_t *frame, xlator_t *this, afr_self_heal_t *sh = NULL; afr_private_t *priv = NULL; int source = 0; - - int nsources = 0; + int nsources = 0; + int32_t subvol_status = 0; local = frame->local; sh = &local->self_heal; priv = this->private; if (op_ret < 0) { - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_set_error (sh, op_errno); afr_sh_entry_finish (frame, this); goto out; @@ -2119,23 +2277,31 @@ afr_sh_entry_fix (call_frame_t *frame, xlator_t *this, nsources = afr_build_sources (this, sh->xattr, sh->buf, sh->pending_matrix, sh->sources, sh->success_children, - AFR_ENTRY_TRANSACTION); - if (nsources == 0) { + AFR_ENTRY_TRANSACTION, &subvol_status, + _gf_true); + if ((subvol_status & ALL_FOOLS) || + (subvol_status & SPLIT_BRAIN)) { + gf_log (this->name, GF_LOG_INFO, "%s: Performing conservative " + "merge", local->loc.path); + source = -1; + memset (sh->sources, 0, + sizeof (*sh->sources) * priv->child_count); + } else if (nsources == 0) { gf_log (this->name, GF_LOG_TRACE, "No self-heal needed for %s", local->loc.path); afr_sh_entry_finish (frame, this); return; + } else { + source = afr_sh_select_source (sh->sources, priv->child_count); } - source = afr_sh_select_source (sh->sources, priv->child_count); - sh->source = source; afr_reset_children (sh->fresh_children, priv->child_count); afr_get_fresh_children (sh->success_children, 
sh->sources, - sh->fresh_children, priv->child_count); + sh->fresh_children, priv->child_count); if (sh->source >= 0) afr_inode_set_read_ctx (this, sh->inode, sh->source, sh->fresh_children); @@ -2160,7 +2326,7 @@ afr_sh_post_nonblocking_entry_cbk (call_frame_t *frame, xlator_t *this) if (int_lock->lock_op_ret < 0) { gf_log (this->name, GF_LOG_ERROR, "Non Blocking entrylks " "failed for %s.", local->loc.path); - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_entry_done (frame, this); } else { @@ -2169,7 +2335,8 @@ afr_sh_post_nonblocking_entry_cbk (call_frame_t *frame, xlator_t *this) afr_sh_common_lookup (frame, this, &local->loc, afr_sh_entry_fix, NULL, AFR_LOOKUP_FAIL_CONFLICTS | - AFR_LOOKUP_FAIL_MISSING_GFIDS); + AFR_LOOKUP_FAIL_MISSING_GFIDS, + NULL); } return 0; @@ -2178,14 +2345,18 @@ afr_sh_post_nonblocking_entry_cbk (call_frame_t *frame, xlator_t *this) int afr_self_heal_entry (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; + afr_local_t *local = NULL; afr_private_t *priv = NULL; - + afr_self_heal_t *sh = NULL; priv = this->private; local = frame->local; + sh = &local->self_heal; + + sh->sh_type_in_action = AFR_SELF_HEAL_ENTRY; if (local->self_heal.do_entry_self_heal && priv->entry_self_heal) { + afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED); afr_sh_entrylk (frame, this, &local->loc, NULL, afr_sh_post_nonblocking_entry_cbk); } else { diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c index c67748b2f..fd5da6cfd 100644 --- a/xlators/cluster/afr/src/afr-self-heal-metadata.c +++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. 
- GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #include <libgen.h> @@ -54,31 +45,17 @@ afr_sh_metadata_done (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; local = frame->local; sh = &local->self_heal; - priv = this->private; - -// memset (sh->child_errno, 0, sizeof (int) * priv->child_count); - memset (sh->buf, 0, sizeof (struct iatt) * priv->child_count); - memset (sh->success, 0, sizeof (*sh->success) * priv->child_count); - afr_reset_xattr (sh->xattr, priv->child_count); - if (local->govinda_gOvinda) { - gf_log (this->name, GF_LOG_INFO, - "split-brain detected, aborting selfheal of %s", + afr_sh_reset (frame, this); + if (IA_ISDIR (sh->type)) { + gf_log (this->name, GF_LOG_DEBUG, + "proceeding to entry check on %s", local->loc.path); - sh->op_failed = 1; - sh->completion_cbk (frame, this); + afr_self_heal_entry (frame, this); } else { - if (IA_ISDIR (sh->type)) { - gf_log (this->name, GF_LOG_DEBUG, - "proceeding to entry check on %s", - local->loc.path); - afr_self_heal_entry (frame, this); - return 0; - } gf_log (this->name, GF_LOG_DEBUG, "proceeding to 
data check on %s", local->loc.path); @@ -88,21 +65,6 @@ afr_sh_metadata_done (call_frame_t *frame, xlator_t *this) return 0; } - -int -afr_sh_metadata_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - int call_count = 0; - - call_count = afr_frame_return (frame); - - if (call_count == 0) - afr_sh_metadata_done (frame, this); - - return 0; -} - int afr_sh_inode_unlock (call_frame_t *frame, xlator_t *this) { @@ -126,11 +88,24 @@ afr_sh_metadata_finish (call_frame_t *frame, xlator_t *this) return 0; } +int +afr_sh_metadata_fail (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + + local = frame->local; + sh = &local->self_heal; + + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + afr_sh_metadata_finish (frame, this); + return 0; +} int afr_sh_metadata_erase_pending_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno, dict_t *xattr) + int32_t op_errno, dict_t *xattr, dict_t *xdata) { afr_local_t *local = NULL; int call_count = 0; @@ -162,85 +137,19 @@ afr_sh_metadata_erase_pending_cbk (call_frame_t *frame, void *cookie, return 0; } - int afr_sh_metadata_erase_pending (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int i = 0; - dict_t **erase_xattr = NULL; - - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - afr_sh_pending_to_delta (priv, sh->xattr, sh->delta_matrix, - sh->success, priv->child_count, - AFR_METADATA_TRANSACTION); - - erase_xattr = GF_CALLOC (sizeof (*erase_xattr), priv->child_count, - gf_afr_mt_dict_t); - if (!erase_xattr) - return -ENOMEM; - - for (i = 0; i < priv->child_count; i++) { - if (sh->xattr[i]) { - call_count++; - - erase_xattr[i] = get_new_dict(); - dict_ref (erase_xattr[i]); - } - } - - afr_sh_delta_to_xattr (priv, sh->delta_matrix, erase_xattr, - 
priv->child_count, AFR_METADATA_TRANSACTION); - - local->call_count = call_count; - - if (call_count == 0) { - gf_log (this->name, GF_LOG_INFO, - "metadata of %s not healed on any subvolume", - local->loc.path); - - afr_sh_metadata_finish (frame, this); - } - - for (i = 0; i < priv->child_count; i++) { - if (!erase_xattr[i]) - continue; - - gf_log (this->name, GF_LOG_TRACE, - "erasing pending flags from %s on %s", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_sh_metadata_erase_pending_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->loc, - GF_XATTROP_ADD_ARRAY, erase_xattr[i]); - if (!--call_count) - break; - } - - for (i = 0; i < priv->child_count; i++) { - if (erase_xattr[i]) { - dict_unref (erase_xattr[i]); - } - } - GF_FREE (erase_xattr); - - return 0; + afr_sh_erase_pending (frame, this, AFR_METADATA_TRANSACTION, + afr_sh_metadata_erase_pending_cbk, + afr_sh_metadata_finish); + return 0; } int afr_sh_metadata_sync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; @@ -271,8 +180,13 @@ afr_sh_metadata_sync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, call_count = afr_frame_return (frame); - if (call_count == 0) + if (call_count == 0) { + if (local->xattr_req) { + dict_unref (local->xattr_req); + local->xattr_req = NULL; + } afr_sh_metadata_erase_pending (frame, this); + } return 0; } @@ -281,9 +195,9 @@ afr_sh_metadata_sync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int afr_sh_metadata_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop) + struct iatt *preop, struct iatt *postop, dict_t *xdata) { - afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno); + afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno, 
xdata); return 0; } @@ -291,13 +205,93 @@ afr_sh_metadata_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int afr_sh_metadata_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno, xdata); + + return 0; +} + +int +afr_sh_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + dict_t *xdata) { - afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno); + int i = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + priv = this->private; + local = frame->local; + if (op_ret < 0) { + afr_sh_metadata_sync_cbk (frame, cookie, + this, -1, op_errno, xdata); + goto out; + } + + i = (long) cookie; + + STACK_WIND_COOKIE (frame, afr_sh_metadata_xattr_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->setxattr, + &local->loc, local->xattr_req, 0, NULL); + + out: return 0; } +inline void +afr_prune_special_keys (dict_t *xattr_dict) +{ + dict_del (xattr_dict, GF_SELINUX_XATTR_KEY); +} + +inline void +afr_prune_pending_keys (dict_t *xattr_dict, afr_private_t *priv) +{ + int i = 0; + + for (; i < priv->child_count; i++) { + dict_del (xattr_dict, priv->pending_key[i]); + } +} + +int +afr_sh_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xattr, + dict_t *xdata) +{ + int i = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + if (op_ret < 0) { + afr_sh_metadata_sync_cbk (frame, cookie, + this, -1, op_errno, xdata); + goto out; + } + + afr_prune_pending_keys (xattr, priv); + + afr_prune_special_keys (xattr); + + i = (long) cookie; + + /* send removexattr in bulk via xdata */ + STACK_WIND_COOKIE (frame, afr_sh_removexattr_cbk, + cookie, + priv->children[i], + 
priv->children[i]->fops->removexattr, + &local->loc, "", xattr); + + out: + return 0; +} int afr_sh_metadata_sync (call_frame_t *frame, xlator_t *this, dict_t *xattr) @@ -323,9 +317,10 @@ afr_sh_metadata_sync (call_frame_t *frame, xlator_t *this, dict_t *xattr) /* * 2 calls per sink - setattr, setxattr */ - if (xattr) + if (xattr) { call_count = active_sinks * 2; - else + local->xattr_req = dict_ref (xattr); + } else call_count = active_sinks; local->call_count = call_count; @@ -361,18 +356,18 @@ afr_sh_metadata_sync (call_frame_t *frame, xlator_t *this, dict_t *xattr) (void *) (long) i, priv->children[i], priv->children[i]->fops->setattr, - &local->loc, &stbuf, valid); + &local->loc, &stbuf, valid, NULL); call_count--; if (!xattr) continue; - STACK_WIND_COOKIE (frame, afr_sh_metadata_xattr_cbk, + STACK_WIND_COOKIE (frame, afr_sh_getxattr_cbk, (void *) (long) i, priv->children[i], - priv->children[i]->fops->setxattr, - &local->loc, xattr, 0); + priv->children[i]->fops->getxattr, + &local->loc, NULL, NULL); call_count--; } @@ -381,17 +376,15 @@ afr_sh_metadata_sync (call_frame_t *frame, xlator_t *this, dict_t *xattr) int -afr_sh_metadata_getxattr_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xattr) +afr_sh_metadata_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xattr, + dict_t *xdata) { afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; afr_private_t *priv = NULL; int source = 0; - int i; - local = frame->local; sh = &local->self_heal; priv = this->private; @@ -406,16 +399,147 @@ afr_sh_metadata_getxattr_cbk (call_frame_t *frame, void *cookie, afr_sh_metadata_sync (frame, this, NULL); } else { - for (i = 0; i < priv->child_count; i++) { - dict_del (xattr, priv->pending_key[i]); - } - + afr_prune_pending_keys (xattr, priv); afr_sh_metadata_sync (frame, this, xattr); } return 0; } +static void +afr_set_metadata_sh_info_str (afr_local_t *local, 
afr_self_heal_t *sh, + xlator_t *this) +{ + afr_private_t *priv = NULL; + int i = 0; + char num[1024] = {0}; + size_t len = 0; + char *string = NULL; + size_t off = 0; + char *source_child = " from source %s to"; + char *format = " %s, "; + char *string_msg = " metadata self heal"; + char *pending_matrix_str = NULL; + int down_child_present = 0; + int unknown_child_present = 0; + char *down_subvol_1 = " down subvolume is "; + char *unknown_subvol_1 = " unknown subvolume is"; + char *down_subvol_2 = " down subvolumes are "; + char *unknown_subvol_2 = " unknown subvolumes are "; + int down_count = 0; + int unknown_count = 0; + + priv = this->private; + + pending_matrix_str = afr_get_pending_matrix_str (sh->pending_matrix, + this); + + if (!pending_matrix_str) + pending_matrix_str = ""; + + len += snprintf (num, sizeof (num), "%s", string_msg); + + for (i = 0; i < priv->child_count; i++) { + if ((sh->source == i) && (local->child_up[i] == 1)) { + len += snprintf (num, sizeof (num), source_child, + priv->children[i]->name); + } else if ((local->child_up[i] == 1) && (sh->sources[i] == 0)) { + len += snprintf (num, sizeof (num), format, + priv->children[i]->name); + } else if (local->child_up[i] == 0) { + len += snprintf (num, sizeof (num), format, + priv->children[i]->name); + if (!down_child_present) + down_child_present = 1; + down_count++; + } else if (local->child_up[i] == -1) { + len += snprintf (num, sizeof (num), format, + priv->children[i]->name); + if (!unknown_child_present) + unknown_child_present = 1; + unknown_count++; + } + } + + if (down_child_present) { + if (down_count > 1) { + len += snprintf (num, sizeof (num), "%s", + down_subvol_2); + } else { + len += snprintf (num, sizeof (num), "%s", + down_subvol_1); + } + } + if (unknown_child_present) { + if (unknown_count > 1) { + len += snprintf (num, sizeof (num), "%s", + unknown_subvol_2); + } else { + len += snprintf (num, sizeof (num), "%s", + unknown_subvol_1); + } + } + + len ++; + + string = GF_CALLOC 
(len, sizeof (char), gf_common_mt_char); + if (!string) + return; + + off += snprintf (string + off, len - off, "%s", string_msg); + for (i=0; i < priv->child_count; i++) { + if ((sh->source == i) && (local->child_up[i] == 1)) + off += snprintf (string + off, len - off, source_child, + priv->children[i]->name); + } + + for (i = 0; i < priv->child_count; i++) { + if ((local->child_up[i] == 1)&& (sh->sources[i] == 0)) + off += snprintf (string + off, len - off, format, + priv->children[i]->name); + } + + if (down_child_present) { + if (down_count > 1) { + off += snprintf (string + off, len - off, "%s", + down_subvol_2); + } else { + off += snprintf (string + off, len - off, "%s", + down_subvol_1); + } + } + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i] == 0) + off += snprintf (string + off, len - off, format, + priv->children[i]->name); + } + + if (unknown_child_present) { + if (unknown_count > 1) { + off += snprintf (string + off, len - off, "%s", + unknown_subvol_2); + } else { + off += snprintf (string + off, len - off, "%s", + unknown_subvol_1); + } + } + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i] == -1) + off += snprintf (string + off, len - off, format, + priv->children[i]->name); + } + + gf_asprintf (&sh->metadata_sh_info, "%s metadata %s,", string, + pending_matrix_str); + + if (pending_matrix_str && strcmp (pending_matrix_str, "")) + GF_FREE (pending_matrix_str); + + if (string && strcmp (string, "")) + GF_FREE (string); +} int afr_sh_metadata_sync_prepare (call_frame_t *frame, xlator_t *this) @@ -445,10 +569,13 @@ afr_sh_metadata_sync_prepare (call_frame_t *frame, xlator_t *this) local->loc.path, priv->children[source]->name, sh->active_sinks); + sh->actual_sh_started = _gf_true; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_SYNC_BEGIN); + afr_set_metadata_sh_info_str (local, sh, this); STACK_WIND (frame, afr_sh_metadata_getxattr_cbk, priv->children[source], priv->children[source]->fops->getxattr, - 
&local->loc, NULL); + &local->loc, NULL, NULL); return 0; } @@ -470,7 +597,7 @@ afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this, priv = this->private; if (op_ret < 0) { - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_set_error (sh, op_errno); afr_sh_metadata_finish (frame, this); goto out; @@ -478,16 +605,7 @@ afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this, nsources = afr_build_sources (this, sh->xattr, sh->buf, sh->pending_matrix, sh->sources, sh->success_children, - AFR_METADATA_TRANSACTION); - if (nsources == 0) { - gf_log (this->name, GF_LOG_TRACE, - "No self-heal needed for %s", - local->loc.path); - - afr_sh_metadata_finish (frame, this); - goto out; - } - + AFR_METADATA_TRANSACTION, NULL, _gf_false); if ((nsources == -1) && (priv->favorite_child != -1) && (sh->child_errno[priv->favorite_child] == 0)) { @@ -504,12 +622,18 @@ afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this, } if (nsources == -1) { - gf_log (this->name, GF_LOG_ERROR, - "Unable to self-heal permissions/ownership of '%s' " - "(possible split-brain). 
Please fix the file on " - "all backend volumes", local->loc.path); + afr_sh_print_split_brain_log (sh->pending_matrix, this, + local->loc.path); + afr_set_split_brain (this, sh->inode, SPB, DONT_KNOW); + afr_sh_metadata_fail (frame, this); + goto out; + } - local->govinda_gOvinda = 1; + afr_set_split_brain (this, sh->inode, NO_SPB, DONT_KNOW); + if (nsources == 0) { + gf_log (this->name, GF_LOG_TRACE, + "No self-heal needed for %s", + local->loc.path); afr_sh_metadata_finish (frame, this); goto out; @@ -548,7 +672,10 @@ afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this, sh->fresh_children); } - afr_sh_metadata_sync_prepare (frame, this); + if (sh->do_metadata_self_heal && priv->metadata_self_heal) + afr_sh_metadata_sync_prepare (frame, this); + else + afr_sh_metadata_finish (frame, this); out: return; } @@ -564,9 +691,9 @@ afr_sh_metadata_post_nonblocking_inodelk_cbk (call_frame_t *frame, int_lock = &local->internal_lock; if (int_lock->lock_op_ret < 0) { - gf_log (this->name, GF_LOG_ERROR, "Non Blocking metadata " + gf_log (this->name, GF_LOG_DEBUG, "Non Blocking metadata " "inodelks failed for %s.", local->loc.path); - gf_log (this->name, GF_LOG_ERROR, "Metadata self-heal " + gf_log (this->name, GF_LOG_DEBUG, "Metadata self-heal " "failed for %s.", local->loc.path); afr_sh_metadata_done (frame, this); } else { @@ -577,7 +704,8 @@ afr_sh_metadata_post_nonblocking_inodelk_cbk (call_frame_t *frame, afr_sh_common_lookup (frame, this, &local->loc, afr_sh_metadata_fix, NULL, AFR_LOOKUP_FAIL_CONFLICTS | - AFR_LOOKUP_FAIL_MISSING_GFIDS); + AFR_LOOKUP_FAIL_MISSING_GFIDS, + NULL); } return 0; @@ -587,19 +715,22 @@ int afr_sh_metadata_lock (call_frame_t *frame, xlator_t *this) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; local = frame->local; int_lock = &local->internal_lock; + int_lock->domain = this->name; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); int_lock->transaction_lk_type = AFR_SELFHEAL_LK; 
int_lock->selfheal_lk_type = AFR_METADATA_SELF_HEAL_LK; afr_set_lock_number (frame, this); - int_lock->lk_flock.l_start = 0; - int_lock->lk_flock.l_len = 0; - int_lock->lk_flock.l_type = F_WRLCK; + inodelk->flock.l_start = LLONG_MAX - 1; + inodelk->flock.l_len = 0; + inodelk->flock.l_type = F_WRLCK; int_lock->lock_cbk = afr_sh_metadata_post_nonblocking_inodelk_cbk; afr_nonblocking_inodelk (frame, this); @@ -607,17 +738,29 @@ afr_sh_metadata_lock (call_frame_t *frame, xlator_t *this) return 0; } +gf_boolean_t +afr_can_start_metadata_self_heal (afr_self_heal_t *sh, afr_private_t *priv) +{ + if (sh->force_confirm_spb) + return _gf_true; + if (sh->do_metadata_self_heal && priv->metadata_self_heal) + return _gf_true; + return _gf_false; +} int afr_self_heal_metadata (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_private_t *priv = this->private; - + afr_self_heal_t *sh = &local->self_heal; local = frame->local; + sh = &local->self_heal; + sh->sh_type_in_action = AFR_SELF_HEAL_METADATA; - if (local->self_heal.do_metadata_self_heal && priv->metadata_self_heal) { + if (afr_can_start_metadata_self_heal (sh, priv)) { + afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED); afr_sh_metadata_lock (frame, this); } else { afr_sh_metadata_done (frame, this); diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h index f40c06faa..7c9bc8111 100644 --- a/xlators/cluster/afr/src/afr-self-heal.h +++ b/xlators/cluster/afr/src/afr-self-heal.h @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. 
- - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef __AFR_SELF_HEAL_H__ @@ -30,13 +21,6 @@ #define SIZE_GREATER(buf1,buf2) ((buf1)->ia_size > (buf2)->ia_size) int -afr_sh_has_metadata_pending (dict_t *xattr, xlator_t *this); -int -afr_sh_has_entry_pending (dict_t *xattr, xlator_t *this); -int -afr_sh_has_data_pending (dict_t *xattr, xlator_t *this); - -int afr_self_heal_entry (call_frame_t *frame, xlator_t *this); int @@ -54,5 +38,6 @@ afr_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode); int afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local, dict_t **xattr, - afr_transaction_type txn_type); + afr_transaction_type txn_type, + uuid_t gfid); #endif /* __AFR_SELF_HEAL_H__ */ diff --git a/xlators/cluster/afr/src/afr-self-heald.c b/xlators/cluster/afr/src/afr-self-heald.c index afddf62c2..1b48a1bca 100644 --- a/xlators/cluster/afr/src/afr-self-heald.c +++ b/xlators/cluster/afr/src/afr-self-heald.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. 
- - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef _CONFIG_H @@ -25,9 +16,1076 @@ #include "syncop.h" #include "afr-self-heald.h" #include "afr-self-heal-common.h" +#include "protocol-common.h" +#include "event-history.h" + +typedef enum { + STOP_CRAWL_ON_SINGLE_SUBVOL = 1 +} afr_crawl_flags_t; + +typedef enum { + HEAL = 1, + INFO, + STATISTICS_TO_BE_HEALED, +} shd_crawl_op; + +typedef struct shd_dump { + dict_t *dict; + xlator_t *this; + int child; +} shd_dump_t; + +typedef struct shd_event_ { + int child; + char *path; +} shd_event_t; + +typedef struct shd_pos_ { + int child; + xlator_t *this; + afr_child_pos_t pos; +} shd_pos_t; + +typedef int +(*afr_crawl_done_cbk_t) (int ret, call_frame_t *sync_frame, void *crawl_data); + +void +afr_start_crawl (xlator_t *this, int idx, afr_crawl_type_t crawl, + process_entry_cbk_t process_entry, void *op_data, + gf_boolean_t exclusive, int crawl_flags, + afr_crawl_done_cbk_t crawl_done); + +static int +_crawl_directory (fd_t *fd, loc_t *loc, afr_crawl_data_t *crawl_data); + +/* For calling 
straight through (e.g. already in a synctask). */ +int +afr_find_child_position (xlator_t *this, int child, afr_child_pos_t *pos); + +/* For deferring through a new synctask. */ +int +afr_syncop_find_child_position (void *data); + +static int +_loc_assign_gfid_path (loc_t *loc) +{ + int ret = -1; + char gfid_path[64] = {0}; + + if (loc->inode && !uuid_is_null (loc->inode->gfid)) { + ret = inode_path (loc->inode, NULL, (char**)&loc->path); + } else if (!uuid_is_null (loc->gfid)) { + snprintf (gfid_path, sizeof (gfid_path), "<gfid:%s>", + uuid_utoa (loc->gfid)); + loc->path = gf_strdup (gfid_path); + if (loc->path) + ret = 0; + } + return ret; +} + +void +_destroy_crawl_event_data (void *data) +{ + shd_crawl_event_t *crawl_event = NULL; + + if (!data) + goto out; + + crawl_event = (shd_crawl_event_t *)data; + GF_FREE (crawl_event->start_time_str); + GF_FREE (crawl_event->end_time_str); + +out: + return; +} + +void +_destroy_shd_event_data (void *data) +{ + shd_event_t *event = NULL; + if (!data) + goto out; + event = (shd_event_t*)data; + GF_FREE (event->path); +out: + return; +} +void +shd_cleanup_event (void *event) +{ + shd_event_t *shd_event = event; + + if (!shd_event) + goto out; + GF_FREE (shd_event->path); + GF_FREE (shd_event); +out: + return; +} + +int +afr_get_local_child (afr_self_heald_t *shd, unsigned int child_count) +{ + int i = 0; + int ret = -1; + for (i = 0; i < child_count; i++) { + if (shd->pos[i] == AFR_POS_LOCAL) { + ret = i; + break; + } + } + return ret; +} + +static int +_build_index_loc (xlator_t *this, loc_t *loc, char *name, loc_t *parent) +{ + int ret = 0; + + uuid_copy (loc->pargfid, parent->inode->gfid); + loc->path = ""; + loc->name = name; + loc->parent = inode_ref (parent->inode); + if (!loc->parent) { + loc->path = NULL; + loc_wipe (loc); + ret = -1; + } + return ret; +} + +int +_add_crawl_stats_to_dict (xlator_t *this, dict_t *output, int child, + shd_crawl_event_t *shd_event, struct timeval *tv) +{ + int ret = 0; + uint64_t count 
= 0; + char key[256] = {0}; + int xl_id = 0; + uint64_t healed_count = 0; + uint64_t split_brain_count = 0; + uint64_t heal_failed_count = 0; + char *start_time_str = NULL; + char *end_time_str = NULL; + char *crawl_type = NULL; + int progress = -1; + + healed_count = shd_event->healed_count; + split_brain_count = shd_event->split_brain_count; + heal_failed_count = shd_event->heal_failed_count; + start_time_str = shd_event->start_time_str; + end_time_str = shd_event->end_time_str; + crawl_type = shd_event->crawl_type; + + if (!start_time_str) { + ret = -1; + goto out; + } + + + ret = dict_get_int32 (output, this->name, &xl_id); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "xl does not have id"); + goto out; + } + + snprintf (key, sizeof (key), "statistics-%d-%d-count", xl_id, child); + ret = dict_get_uint64 (output, key, &count); + + snprintf (key, sizeof (key), "statistics_healed_cnt-%d-%d-%"PRIu64, + xl_id, child, count); + ret = dict_set_uint64(output, key, healed_count); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" + "healed_count to outout"); + goto out; + } + snprintf (key, sizeof (key), "statistics_sb_cnt-%d-%d-%"PRIu64, + xl_id, child, count); + ret = dict_set_uint64 (output, key, split_brain_count); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" + "split_brain_count to outout"); + goto out; + } + snprintf (key, sizeof (key), "statistics_crawl_type-%d-%d-%"PRIu64, + xl_id, child, count); + ret = dict_set_dynstr (output, key, gf_strdup (crawl_type)); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" + "crawl_type to output"); + goto out; + } + snprintf (key, sizeof (key), "statistics_heal_failed_cnt-%d-%d-%"PRIu64, + xl_id, child, count); + ret = dict_set_uint64 (output, key, heal_failed_count); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" + "healed_failed_count to outout"); + goto out; + } + snprintf (key, sizeof (key), 
"statistics_strt_time-%d-%d-%"PRIu64, + xl_id, child, count); + ret = dict_set_dynstr (output, key, gf_strdup(start_time_str)); + + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" + "crawl_start_time to outout"); + goto out; + } + + snprintf (key, sizeof (key), "statistics_end_time-%d-%d-%"PRIu64, + xl_id, child, count); + + if (!end_time_str) + end_time_str = "Could not determine the end time"; + ret = dict_set_dynstr (output, key, gf_strdup(end_time_str)); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" + "crawl_end_time to outout"); + goto out; + } + snprintf (key, sizeof (key), "statistics_inprogress-%d-%d-%"PRIu64, + xl_id, child, count); + + if (shd_event->crawl_inprogress == _gf_true) + progress = 1; + else + progress = 0; + + ret = dict_set_int32 (output, key, progress); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" + "inprogress to outout"); + goto out; + } + + snprintf (key, sizeof (key), "statistics-%d-%d-count",xl_id, child); + ret = dict_set_uint64 (output, key, count + 1); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not increment the " + "counter."); + goto out; + } +out: + return ret; +} + +int +_add_path_to_dict (xlator_t *this, dict_t *output, int child, char *path, + struct timeval *tv, gf_boolean_t dyn) +{ + //subkey not used for now + int ret = -1; + uint64_t count = 0; + char key[256] = {0}; + int xl_id = 0; + + ret = dict_get_int32 (output, this->name, &xl_id); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "xl does not have id"); + goto out; + } + + snprintf (key, sizeof (key), "%d-%d-count", xl_id, child); + ret = dict_get_uint64 (output, key, &count); + + snprintf (key, sizeof (key), "%d-%d-%"PRIu64, xl_id, child, count); + if (dyn) + ret = dict_set_dynstr (output, key, path); + else + ret = dict_set_str (output, key, path); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "%s: Could not add to output", + path); + goto out; + } + + if (!tv) 
+ goto inc_count; + snprintf (key, sizeof (key), "%d-%d-%"PRIu64"-time", xl_id, + child, count); + ret = dict_set_uint32 (output, key, tv->tv_sec); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "%s: Could not set time", + path); + goto out; + } + +inc_count: + snprintf (key, sizeof (key), "%d-%d-count", xl_id, child); + ret = dict_set_uint64 (output, key, count + 1); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not increment count"); + goto out; + } + ret = 0; +out: + return ret; +} + +int +_get_path_from_gfid_loc (xlator_t *this, xlator_t *readdir_xl, loc_t *child, + char **fpath, gf_boolean_t *missing) +{ + dict_t *xattr = NULL; + char *path = NULL; + int ret = -1; + + ret = syncop_getxattr (readdir_xl, child, &xattr, GFID_TO_PATH_KEY); + if (ret < 0) { + if ((errno == ENOENT) && missing) + *missing = _gf_true; + goto out; + } + ret = dict_get_str (xattr, GFID_TO_PATH_KEY, &path); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get path for " + "gfid %s", uuid_utoa (child->gfid)); + goto out; + } + path = gf_strdup (path); + if (!path) { + ret = -1; + goto out; + } + ret = 0; +out: + if (!ret) + *fpath = path; + if (xattr) + dict_unref (xattr); + return ret; +} + +int +_add_event_to_dict (circular_buffer_t *cb, void *data) +{ + int ret = 0; + shd_dump_t *dump_data = NULL; + shd_event_t *shd_event = NULL; + + dump_data = data; + shd_event = cb->data; + if (shd_event->child != dump_data->child) + goto out; + ret = _add_path_to_dict (dump_data->this, dump_data->dict, + dump_data->child, shd_event->path, &cb->tv, + _gf_false); +out: + return ret; +} + +int +_add_crawl_event_statistics_to_dict (circular_buffer_t *cb, void *data) +{ + int ret = 0; + shd_dump_t *dump_data = NULL; + shd_crawl_event_t *shd_event = NULL; + + dump_data = data; + shd_event = cb->data; + ret = _add_crawl_stats_to_dict (dump_data->this, dump_data->dict, + dump_data->child, shd_event, &cb->tv); + return ret; +} + +int +_add_eh_to_dict (xlator_t *this, eh_t *eh, dict_t 
*dict, int child) +{ + shd_dump_t dump_data = {0}; + + dump_data.this = this; + dump_data.dict = dict; + dump_data.child = child; + eh_dump (eh, &dump_data, _add_event_to_dict); + return 0; +} + + +int +_add_statistics_to_dict (xlator_t *this, dict_t *dict, int child) +{ + shd_dump_t dump_data = {0}; + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + + priv = this->private; + shd = &priv->shd; + + dump_data.this = this; + dump_data.dict = dict; + dump_data.child = child; + eh_dump (shd->statistics[child], &dump_data, + _add_crawl_event_statistics_to_dict); + return 0; + +} + +void +_remove_stale_index (xlator_t *this, xlator_t *readdir_xl, + loc_t *parent, char *fname) +{ + int ret = 0; + loc_t index_loc = {0}; + + ret = _build_index_loc (this, &index_loc, fname, parent); + if (ret) + goto out; + gf_log (this->name, GF_LOG_DEBUG, "Removing stale index " + "for %s on %s", index_loc.name, readdir_xl->name); + ret = syncop_unlink (readdir_xl, &index_loc); + if(ret && (errno != ENOENT)) { + gf_log(this->name, GF_LOG_ERROR, "%s: Failed to remove index " + "on %s - %s",index_loc.name, readdir_xl->name, + strerror (errno)); + } + index_loc.path = NULL; + loc_wipe (&index_loc); +out: + return; +} + +int +_count_hard_links_under_base_indices_dir (xlator_t *this, + afr_crawl_data_t *crawl_data, + gf_dirent_t *entry, loc_t *childloc, + loc_t *parentloc, struct iatt *iattr) +{ + xlator_t *readdir_xl = crawl_data->readdir_xl; + struct iatt parent = {0}; + int ret = 0; + dict_t *output = NULL; + int xl_id = 0; + char key[256] = {0}; + int child = -1; + uint64_t hardlinks = 0; + + output = crawl_data->op_data; + child = crawl_data->child; + + ret = syncop_lookup (readdir_xl, childloc, NULL, iattr, NULL, &parent); + if (ret) + goto out; + + ret = dict_get_int32 (output, this->name, &xl_id); + if (ret) + goto out; + + snprintf (key, sizeof (key), "%d-%d-hardlinks", xl_id, child); + ret = dict_get_uint64 (output, key, &hardlinks); + + /*Removing the count of base_entry 
under indices/base_indicies and + * entry under indices/xattrop */ + hardlinks = hardlinks + iattr->ia_nlink - 2; + ret = dict_set_uint64 (output, key, hardlinks); + if (ret) + goto out; + +out: + return ret; +} + +int +_add_summary_to_dict (xlator_t *this, afr_crawl_data_t *crawl_data, + gf_dirent_t *entry, + loc_t *childloc, loc_t *parentloc, struct iatt *iattr) +{ + dict_t *output = NULL; + xlator_t *readdir_xl = NULL; + int ret = -1; + char *path = NULL; + gf_boolean_t missing = _gf_false; + char gfid_str[64] = {0}; + + if (uuid_is_null (childloc->gfid)) + goto out; + + output = crawl_data->op_data; + readdir_xl = crawl_data->readdir_xl; + + ret = _get_path_from_gfid_loc (this, readdir_xl, childloc, &path, + &missing); + if (ret == 0) { + ret = _add_path_to_dict (this, output, crawl_data->child, path, + NULL, _gf_true); + } else if (missing) { + _remove_stale_index (this, readdir_xl, parentloc, + uuid_utoa_r (childloc->gfid, gfid_str)); + } + +out: + if (ret && path) + GF_FREE (path); + return ret; +} + +void +_crawl_post_sh_action (xlator_t *this, loc_t *parent, loc_t *child, + int32_t op_ret, int32_t op_errno, dict_t *xattr_rsp, + afr_crawl_data_t *crawl_data) +{ + int ret = 0; + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + eh_t *eh = NULL; + char *path = NULL; + char gfid_str[64] = {0}; + shd_event_t *event = NULL; + int32_t sh_failed = 0; + gf_boolean_t split_brain = 0; + int32_t actual_sh_done = 0; + shd_crawl_event_t **shd_crawl_event = NULL; + + priv = this->private; + shd = &priv->shd; + if (crawl_data->crawl == INDEX) { + if ((op_ret < 0) && (op_errno == ENOENT)) { + _remove_stale_index (this, crawl_data->readdir_xl, + parent, uuid_utoa_r (child->gfid, + gfid_str)); + goto out; + } + ret = _get_path_from_gfid_loc (this, crawl_data->readdir_xl, + child, &path, NULL); + if (ret) + goto out; + } else { + path = gf_strdup (child->path); + if (!path) { + ret = -1; + goto out; + } + } + + if (xattr_rsp) { + ret = dict_get_int32 (xattr_rsp, 
"sh-failed", &sh_failed); + ret = dict_get_int32 (xattr_rsp, "actual-sh-done", &actual_sh_done); + } + + shd_crawl_event = (shd_crawl_event_t**)(shd->crawl_events); + + split_brain = afr_is_split_brain (this, child->inode); + if ((op_ret < 0 && op_errno == EIO) || split_brain) { + eh = shd->split_brain; + shd_crawl_event[crawl_data->child]->split_brain_count += 1; + } else if ((op_ret < 0) || sh_failed) { + eh = shd->heal_failed; + shd_crawl_event[crawl_data->child]->heal_failed_count += 1; + } else if (actual_sh_done == 1) { + eh = shd->healed; + shd_crawl_event[crawl_data->child]->healed_count += 1; + } + ret = -1; + + if (eh != NULL) { + event = GF_CALLOC (1, sizeof (*event), gf_afr_mt_shd_event_t); + if (!event) + goto out; + event->child = crawl_data->child; + event->path = path; + + ret = eh_save_history (eh, event); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "%s:Failed to save " + "to event history, (%d, %s)", path, op_ret, + strerror (op_errno)); + + goto out; + } + } else { + gf_log (this->name, GF_LOG_DEBUG, "%s:Self heal already done ", + path); + + } + ret = 0; +out: + if (ret && path) + GF_FREE (path); + return; +} + +int +_link_inode_update_loc (xlator_t *this, loc_t *loc, struct iatt *iattr) +{ + inode_t *link_inode = NULL; + int ret = -1; + + link_inode = inode_link (loc->inode, NULL, NULL, iattr); + if (link_inode == NULL) { + gf_log (this->name, GF_LOG_ERROR, "inode link failed " + "on the inode (%s)", uuid_utoa (iattr->ia_gfid)); + goto out; + } + inode_unref (loc->inode); + loc->inode = link_inode; + ret = 0; +out: + return ret; +} + +int +_self_heal_entry (xlator_t *this, afr_crawl_data_t *crawl_data, gf_dirent_t *entry, + loc_t *child, loc_t *parent, struct iatt *iattr) +{ + struct iatt parentbuf = {0}; + int ret = 0; + dict_t *xattr_rsp = NULL; + dict_t *xattr_req = NULL; + + xattr_req = dict_new (); + if (!xattr_req) { + errno = ENOMEM; + ret = -1; + goto out; + } + + ret = dict_set_int32 (xattr_req, 
"allow-sh-for-running-transaction", 1); + + gf_log (this->name, GF_LOG_DEBUG, "lookup %s", child->path); + + ret = syncop_lookup (this, child, xattr_req, + iattr, &xattr_rsp, &parentbuf); + _crawl_post_sh_action (this, parent, child, ret, errno, xattr_rsp, + crawl_data); + if (xattr_rsp) + dict_unref (xattr_rsp); + if (ret == 0) + ret = _link_inode_update_loc (this, child, iattr); + +out: + if (xattr_req) + dict_unref(xattr_req); + return ret; +} + +static int +afr_crawl_done (int ret, call_frame_t *sync_frame, void *data) +{ + GF_FREE (data); + STACK_DESTROY (sync_frame->root); + return 0; +} + +void +_do_self_heal_on_subvol (xlator_t *this, int child, afr_crawl_type_t crawl) +{ + afr_start_crawl (this, child, crawl, _self_heal_entry, + NULL, _gf_true, STOP_CRAWL_ON_SINGLE_SUBVOL, + afr_crawl_done); +} + +gf_boolean_t +_crawl_proceed (xlator_t *this, int child, int crawl_flags, char **reason) +{ + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + gf_boolean_t proceed = _gf_false; + char *msg = NULL; + + priv = this->private; + shd = &priv->shd; + if (!shd->enabled) { + msg = "Self-heal daemon is not enabled"; + gf_log (this->name, GF_LOG_DEBUG, "%s", msg); + goto out; + } + if (!priv->child_up[child]) { + gf_log (this->name, GF_LOG_DEBUG, "Stopping crawl for %s , " + "subvol went down", priv->children[child]->name); + msg = "Brick is Not connected"; + goto out; + } + + if (crawl_flags & STOP_CRAWL_ON_SINGLE_SUBVOL) { + if (afr_up_children_count (priv->child_up, + priv->child_count) < 2) { + gf_log (this->name, GF_LOG_DEBUG, "Stopping crawl as " + "< 2 children are up"); + msg = "< 2 bricks in replica are running"; + goto out; + } + } + proceed = _gf_true; +out: + if (reason) + *reason = msg; + return proceed; +} + +int +_do_crawl_op_on_local_subvols (xlator_t *this, afr_crawl_type_t crawl, + shd_crawl_op op, dict_t *output) +{ + afr_private_t *priv = NULL; + char *status = NULL; + char *subkey = NULL; + char key[256] = {0}; + shd_pos_t pos_data = {0}; 
+ int op_ret = -1; + int xl_id = -1; + int i = 0; + int ret = 0; + int crawl_flags = 0; + + priv = this->private; + if (op == HEAL) + crawl_flags |= STOP_CRAWL_ON_SINGLE_SUBVOL; + + if (output) { + ret = dict_get_int32 (output, this->name, &xl_id); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Invalid input, " + "translator-id is not available"); + goto out; + } + } + pos_data.this = this; + subkey = "status"; + for (i = 0; i < priv->child_count; i++) { + if (_crawl_proceed (this, i, crawl_flags, &status)) { + pos_data.child = i; + /* + * We're already in a synctask in this case, so we + * don't need to defer through a second (and in fact + * that can cause deadlock). Just call straight + * through instead. + */ + ret = afr_find_child_position(pos_data.this, + pos_data.child, + &pos_data.pos); + if (ret) { + status = "Not able to find brick location"; + } else if (pos_data.pos == AFR_POS_REMOTE) { + status = "brick is remote"; + } else { + op_ret = 0; + if (op == HEAL) { + status = "Started self-heal"; + _do_self_heal_on_subvol (this, i, + crawl); + } else if (output && (op == INFO)) { + status = ""; + afr_start_crawl (this, i, INDEX, + _add_summary_to_dict, + output, _gf_false, 0, + NULL); + } else if (output && + (op == STATISTICS_TO_BE_HEALED)) { + status = ""; + afr_start_crawl (this, i, + INDEX_TO_BE_HEALED, + _count_hard_links_under_base_indices_dir, + output, _gf_false, + 0, NULL); + } + } + if (output) { + snprintf (key, sizeof (key), "%d-%d-%s", xl_id, + i, subkey); + ret = dict_set_str (output, key, status); + } + if (!op_ret && (crawl == FULL)) + break; + } + if (output) { + snprintf (key, sizeof (key), "%d-%d-%s", xl_id, i, + subkey); + ret = dict_set_str (output, key, status); + } + } +out: + return op_ret; +} + +int +_do_self_heal_on_local_subvols (xlator_t *this, afr_crawl_type_t crawl, + dict_t *output) +{ + return _do_crawl_op_on_local_subvols (this, crawl, HEAL, output); +} + +int +_get_index_summary_on_local_subvols (xlator_t *this, dict_t 
*output) +{ + return _do_crawl_op_on_local_subvols (this, INDEX, INFO, output); +} + +void +afr_fill_completed_crawl_statistics_to_dict (xlator_t *this, dict_t *dict) +{ + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + int i = 0; + priv = this->private; + shd= &priv->shd; + for (i = 0; i < priv->child_count; i++) { + if (shd->pos[i] != AFR_POS_LOCAL) + continue; + _add_statistics_to_dict (this, dict, i); + } + + return ; +} + +static void +reset_crawl_event (shd_crawl_event_t *crawl_event) +{ + crawl_event->healed_count = 0; + crawl_event->split_brain_count = 0; + crawl_event->heal_failed_count = 0; + GF_FREE (crawl_event->start_time_str); + crawl_event->start_time_str = NULL; + crawl_event->end_time_str = NULL; + crawl_event->crawl_type = NULL; + crawl_event->crawl_inprogress = _gf_false; + return; +} + +static void +afr_copy_crawl_event_struct (shd_crawl_event_t *src, shd_crawl_event_t *dst) +{ + dst->healed_count = src->healed_count; + dst->split_brain_count = src->split_brain_count; + dst->heal_failed_count = src->heal_failed_count; + dst->start_time_str = gf_strdup (src->start_time_str); + dst->end_time_str = "Crawl is already in progress"; + dst->crawl_type = src->crawl_type; + dst->crawl_inprogress = _gf_true; + return; +} + +static int +afr_fill_crawl_statistics_of_running_crawl(xlator_t *this, dict_t *dict) +{ + shd_crawl_event_t *evnt = NULL; + int ret = 0; + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + int i = 0; + priv = this->private; + shd = &priv->shd; + + evnt = GF_CALLOC (1, sizeof (shd_crawl_event_t), + gf_afr_mt_shd_crawl_event_t); + if (!evnt) { + ret = -1; + goto out; + } + LOCK (&priv->lock); + { + for (i = 0; i < priv->child_count; i++) { + if (shd->pos[i] != AFR_POS_LOCAL) + continue; + + reset_crawl_event (evnt); + + if (!shd->crawl_events[i]) { + continue; + } + + afr_copy_crawl_event_struct (shd->crawl_events[i], + evnt); + _add_crawl_stats_to_dict (this, dict, i, evnt, NULL); + + } + } + UNLOCK 
(&priv->lock); + reset_crawl_event (evnt); + GF_FREE (evnt); + +out: + return ret; +} static int -_crawl_directory (loc_t *loc, pid_t pid, uuid_t gfid); +_add_local_subvols_crawl_statistics_to_dict (xlator_t *this, dict_t *dict) +{ + int ret = 0; + afr_fill_completed_crawl_statistics_to_dict (this, dict); + ret = afr_fill_crawl_statistics_of_running_crawl (this, dict); + return ret; +} +int +_add_local_subvols_eh_to_dict (xlator_t *this, eh_t *eh, dict_t *dict) +{ + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + int i = 0; + + priv = this->private; + shd = &priv->shd; + + for (i = 0; i < priv->child_count; i++) { + if (shd->pos[i] != AFR_POS_LOCAL) + continue; + _add_eh_to_dict (this, eh, dict, i); + } + return 0; +} + +int +afr_xl_op (xlator_t *this, dict_t *input, dict_t *output) +{ + gf_xl_afr_op_t op = GF_AFR_OP_INVALID; + int ret = 0; + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + int xl_id = 0; + + priv = this->private; + shd = &priv->shd; + + ret = dict_get_int32 (input, "xl-op", (int32_t*)&op); + if (ret) + goto out; + ret = dict_get_int32 (input, this->name, &xl_id); + if (ret) + goto out; + ret = dict_set_int32 (output, this->name, xl_id); + if (ret) + goto out; + switch (op) { + case GF_AFR_OP_HEAL_INDEX: + ret = _do_self_heal_on_local_subvols (this, INDEX, output); + break; + case GF_AFR_OP_HEAL_FULL: + ret = _do_self_heal_on_local_subvols (this, FULL, output); + break; + case GF_AFR_OP_INDEX_SUMMARY: + (void)_get_index_summary_on_local_subvols (this, output); + ret = 0; + break; + case GF_AFR_OP_HEALED_FILES: + ret = _add_local_subvols_eh_to_dict (this, shd->healed, output); + break; + case GF_AFR_OP_HEAL_FAILED_FILES: + ret = _add_local_subvols_eh_to_dict (this, shd->heal_failed, + output); + break; + case GF_AFR_OP_SPLIT_BRAIN_FILES: + ret = _add_local_subvols_eh_to_dict (this, shd->split_brain, + output); + break; + case GF_AFR_OP_STATISTICS: + ret = _add_local_subvols_crawl_statistics_to_dict (this, output); + break; 
+ case GF_AFR_OP_STATISTICS_HEAL_COUNT: + case GF_AFR_OP_STATISTICS_HEAL_COUNT_PER_REPLICA: + ret = _do_crawl_op_on_local_subvols (this, INDEX_TO_BE_HEALED, + STATISTICS_TO_BE_HEALED, + output); + break; + default: + gf_log (this->name, GF_LOG_ERROR, "Unknown set op %d", op); + break; + } +out: + dict_del (output, this->name); + return ret; +} + +void +afr_poll_self_heal (void *data) +{ + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + struct timespec timeout = {0}; + xlator_t *this = NULL; + long child = (long)data; + gf_timer_t *old_timer = NULL; + gf_timer_t *new_timer = NULL; + shd_pos_t pos_data = {0}; + int ret = 0; + + this = THIS; + priv = this->private; + shd = &priv->shd; + + if (shd->pos[child] == AFR_POS_UNKNOWN) { + pos_data.this = this; + pos_data.child = child; + ret = synctask_new (this->ctx->env, + afr_syncop_find_child_position, + NULL, NULL, &pos_data); + if (!ret) + shd->pos[child] = pos_data.pos; + } + if (shd->enabled && (shd->pos[child] == AFR_POS_LOCAL)) + _do_self_heal_on_subvol (this, child, INDEX); + timeout.tv_sec = shd->timeout; + timeout.tv_nsec = 0; + //notify and previous timer should be synchronized. 
+ LOCK (&priv->lock); + { + old_timer = shd->timer[child]; + if (shd->pos[child] == AFR_POS_REMOTE) + goto unlock; + shd->timer[child] = gf_timer_call_after (this->ctx, timeout, + afr_poll_self_heal, + data); + new_timer = shd->timer[child]; + } +unlock: + UNLOCK (&priv->lock); + + if (old_timer) + gf_timer_call_cancel (this->ctx, old_timer); + if (!new_timer && (shd->pos[child] != AFR_POS_REMOTE)) { + gf_log (this->name, GF_LOG_WARNING, + "Could not create self-heal polling timer for %s", + priv->children[child]->name); + } + return; +} + +static int +afr_handle_child_up (int ret, call_frame_t *sync_frame, void *data) +{ + afr_self_heald_t *shd = NULL; + shd_pos_t *pos_data = data; + afr_private_t *priv = NULL; + + if (ret) + goto out; + + priv = pos_data->this->private; + shd = &priv->shd; + shd->pos[pos_data->child] = pos_data->pos; + if (pos_data->pos != AFR_POS_REMOTE) + afr_poll_self_heal ((void*)(long)pos_data->child); + _do_self_heal_on_local_subvols (THIS, INDEX, NULL); +out: + GF_FREE (data); + return 0; +} + +void +afr_proactive_self_heal (void *data) +{ + xlator_t *this = NULL; + long child = (long)data; + shd_pos_t *pos_data = NULL; + int ret = 0; + + this = THIS; + + //Position of brick could have changed and it could be local now. 
+ //Compute the position again + pos_data = GF_CALLOC (1, sizeof (*pos_data), gf_afr_mt_pos_data_t); + if (!pos_data) + goto out; + pos_data->this = this; + pos_data->child = child; + ret = synctask_new (this->ctx->env, afr_syncop_find_child_position, + afr_handle_child_up, NULL, pos_data); + if (ret) + goto out; +out: + return; +} + static int get_pathinfo_host (char *pathinfo, char *hostname, size_t size) { @@ -84,145 +1142,300 @@ out: return ret; } -inline void -afr_generate_gfid_on_empty (uuid_t gfid) +int +afr_crawl_build_start_loc (xlator_t *this, afr_crawl_data_t *crawl_data, + loc_t *dirloc) +{ + afr_private_t *priv = NULL; + dict_t *xattr = NULL; + void *index_gfid = NULL; + void *base_indices_holder_vgfid = NULL; + loc_t rootloc = {0}; + struct iatt iattr = {0}; + struct iatt parent = {0}; + int ret = 0; + xlator_t *readdir_xl = crawl_data->readdir_xl; + + priv = this->private; + if (crawl_data->crawl == FULL) { + afr_build_root_loc (this, dirloc); + } else if (crawl_data->crawl == INDEX) { + afr_build_root_loc (this, &rootloc); + ret = syncop_getxattr (readdir_xl, &rootloc, &xattr, + GF_XATTROP_INDEX_GFID); + if (ret < 0) + goto out; + ret = dict_get_ptr (xattr, GF_XATTROP_INDEX_GFID, &index_gfid); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "failed to get index " + "dir gfid on %s", readdir_xl->name); + goto out; + } + if (!index_gfid) { + gf_log (this->name, GF_LOG_ERROR, "index gfid empty " + "on %s", readdir_xl->name); + ret = -1; + goto out; + } + uuid_copy (dirloc->gfid, index_gfid); + dirloc->path = ""; + dirloc->inode = inode_new (priv->root_inode->table); + ret = syncop_lookup (readdir_xl, dirloc, NULL, + &iattr, NULL, &parent); + if (ret < 0) { + if (errno != ENOENT) { + gf_log (this->name, GF_LOG_ERROR, "lookup " + "failed on index dir on %s - (%s)", + readdir_xl->name, strerror (errno)); + } + goto out; + } + ret = _link_inode_update_loc (this, dirloc, &iattr); + if (ret) + goto out; + } else if (crawl_data->crawl == 
INDEX_TO_BE_HEALED) { + afr_build_root_loc (this, &rootloc); + ret = syncop_getxattr (readdir_xl, &rootloc, &xattr, + GF_BASE_INDICES_HOLDER_GFID); + if (ret < 0) + goto out; + ret = dict_get_ptr (xattr, GF_BASE_INDICES_HOLDER_GFID, + &base_indices_holder_vgfid); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "index gfid empty " + "on %s", readdir_xl->name); + ret = -1; + goto out; + } + if (!base_indices_holder_vgfid) { + gf_log (this->name, GF_LOG_ERROR, "Base indices holder" + "virtual gfid is null on %s", readdir_xl->name); + ret = -1; + goto out; + } + uuid_copy (dirloc->gfid, base_indices_holder_vgfid); + dirloc->path = ""; + dirloc->inode = inode_new (priv->root_inode->table); + ret = syncop_lookup (readdir_xl, dirloc, NULL, &iattr, NULL, + &parent); + if (ret < 0) { + if (errno != ENOENT) { + gf_log (this->name, GF_LOG_ERROR, "lookup " + "failed for base_indices_holder dir" + " on %s - (%s)", readdir_xl->name, + strerror (errno)); + + } else { + gf_log (this->name, GF_LOG_ERROR, "base_indices" + "_holder is not yet created."); + } + goto out; + } + ret = _link_inode_update_loc (this, dirloc, &iattr); + if (ret) + goto out; + } + ret = 0; +out: + if (xattr) + dict_unref (xattr); + loc_wipe (&rootloc); + return ret; +} + +int +afr_crawl_opendir (xlator_t *this, afr_crawl_data_t *crawl_data, fd_t **dirfd, + loc_t *dirloc) { - if (uuid_is_null (gfid)) - uuid_generate (gfid); + fd_t *fd = NULL; + int ret = 0; + + if (crawl_data->crawl == FULL) { + fd = fd_create (dirloc->inode, crawl_data->pid); + if (!fd) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to create fd for %s", dirloc->path); + ret = -1; + goto out; + } + + ret = syncop_opendir (crawl_data->readdir_xl, dirloc, fd); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "opendir failed on %s", dirloc->path); + goto out; + } + } else { + fd = fd_anonymous (dirloc->inode); + } + ret = 0; +out: + if (!ret) + *dirfd = fd; + return ret; } -inline void -afr_empty_gfid_on_set (uuid_t gfid, int 
lookup_status, struct iatt *iatt) +xlator_t* +afr_crawl_readdir_xl_get (xlator_t *this, afr_crawl_data_t *crawl_data) { - if (lookup_status || !uuid_compare (gfid, iatt->ia_gfid)) - uuid_clear (gfid); + afr_private_t *priv = this->private; + + if (crawl_data->crawl == FULL) { + return this; + } else { + return priv->children[crawl_data->child]; + } + return NULL; } -inline void -afr_fill_loc_info (loc_t *loc, struct iatt *iatt, struct iatt *parent) +int +afr_crawl_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, + gf_dirent_t *entry, afr_crawl_data_t *crawl_data) { - afr_update_loc_gfids (loc, iatt, parent); - uuid_copy (loc->inode->gfid, iatt->ia_gfid); + int ret = -1; + afr_private_t *priv = NULL; + + priv = this->private; + if (crawl_data->crawl == FULL) { + ret = afr_build_child_loc (this, child, parent, entry->d_name); + } else if (crawl_data->crawl == INDEX_TO_BE_HEALED) { + ret = _build_index_loc (this, child, entry->d_name, parent); + if (ret) + goto out; + child->inode = inode_new (priv->root_inode->table); + if (!child->inode) { + ret = -1; + goto out; + } + child->path = NULL; + } else { + child->inode = inode_new (priv->root_inode->table); + if (!child->inode) + goto out; + uuid_parse (entry->d_name, child->gfid); + ret = _loc_assign_gfid_path (child); + } +out: + return ret; } static int -_perform_self_heal (xlator_t *this, loc_t *parentloc, gf_dirent_t *entries, - uuid_t gfid, off_t *offset, pid_t pid) +_process_entries (xlator_t *this, loc_t *parentloc, gf_dirent_t *entries, + off_t *offset, afr_crawl_data_t *crawl_data) { gf_dirent_t *entry = NULL; gf_dirent_t *tmp = NULL; - struct iatt iatt = {0}; - struct iatt parent = {0};; int ret = 0; loc_t entry_loc = {0}; - dict_t *xattr_req = NULL; - - xattr_req = dict_new (); - if (!xattr_req) { - ret = -1; - goto out; - } + fd_t *fd = NULL; + struct iatt iattr = {0}; list_for_each_entry_safe (entry, tmp, &entries->list, list) { + if (!_crawl_proceed (this, crawl_data->child, + 
crawl_data->crawl_flags, NULL)) { + ret = -1; + goto out; + } *offset = entry->d_off; if (IS_ENTRY_CWD (entry->d_name) || IS_ENTRY_PARENT (entry->d_name)) continue; - - ret = dict_reset (xattr_req); - if (ret) - goto out; + if ((crawl_data->crawl == FULL) && + uuid_is_null (entry->d_stat.ia_gfid)) { + gf_log (this->name, GF_LOG_WARNING, "%s/%s: No " + "gfid present skipping", + parentloc->path, entry->d_name); + continue; + } loc_wipe (&entry_loc); - ret = afr_build_child_loc (this, &entry_loc, - parentloc, entry->d_name); + ret = afr_crawl_build_child_loc (this, &entry_loc, parentloc, + entry, crawl_data); if (ret) goto out; - afr_generate_gfid_on_empty (gfid); - ret = afr_set_dict_gfid (xattr_req, gfid); + ret = crawl_data->process_entry (this, crawl_data, entry, + &entry_loc, parentloc, &iattr); + + if (crawl_data->crawl == INDEX_TO_BE_HEALED && ret) { + goto out; + } else if (ret) { + continue; + } + + if ((crawl_data->crawl == INDEX) || + (crawl_data->crawl == INDEX_TO_BE_HEALED)) + continue; + + if (!IA_ISDIR (iattr.ia_type)) + continue; + fd = NULL; + ret = afr_crawl_opendir (this, crawl_data, &fd, &entry_loc); if (ret) - goto out; - gf_log (this->name, GF_LOG_DEBUG, "lookup %s", entry_loc.path); - - ret = syncop_lookup (this, &entry_loc, xattr_req, - &iatt, NULL, &parent); - afr_empty_gfid_on_set (gfid, ret, &iatt); - //Don't fail the crawl if lookup fails as it - //could be because of split-brain - if (ret || (!IA_ISDIR (iatt.ia_type))) continue; - afr_fill_loc_info (&entry_loc, &iatt, &parent); - ret = _crawl_directory (&entry_loc, pid, gfid); + ret = _crawl_directory (fd, &entry_loc, crawl_data); + if (fd) + fd_unref (fd); } ret = 0; out: - if (xattr_req) - dict_unref (xattr_req); - if (entry_loc.path) - loc_wipe (&entry_loc); + if ((crawl_data->crawl == INDEX_TO_BE_HEALED) && ret) { + gf_log (this->name, GF_LOG_ERROR,"Failed to get the hardlink " + "count"); + } + loc_wipe (&entry_loc); return ret; } static int -_crawl_directory (loc_t *loc, pid_t pid, 
uuid_t gfid) +_crawl_directory (fd_t *fd, loc_t *loc, afr_crawl_data_t *crawl_data) { xlator_t *this = NULL; - afr_private_t *priv = NULL; - fd_t *fd = NULL; off_t offset = 0; gf_dirent_t entries; - struct iatt iatt = {0}; - struct iatt parent = {0};; int ret = 0; gf_boolean_t free_entries = _gf_false; + xlator_t *readdir_xl = crawl_data->readdir_xl; INIT_LIST_HEAD (&entries.list); this = THIS; - priv = this->private; GF_ASSERT (loc->inode); - gf_log (this->name, GF_LOG_DEBUG, "crawling %s", loc->path); - fd = fd_create (loc->inode, pid); - if (!fd) { - gf_log (this->name, GF_LOG_ERROR, - "Failed to create fd for %s", loc->path); - goto out; - } - - if (!loc->parent) { - ret = syncop_lookup (this, loc, NULL, - &iatt, NULL, &parent); - } - - ret = syncop_opendir (this, loc, fd); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "opendir failed on %s", loc->path); - goto out; - } + if (crawl_data->crawl == FULL) + gf_log (this->name, GF_LOG_DEBUG, "crawling %s", loc->path); + else + gf_log (this->name, GF_LOG_DEBUG, "crawling INDEX %s", + uuid_utoa (loc->gfid)); - while (syncop_readdirp (this, fd, 131072, offset, &entries)) { + while (1) { + if (crawl_data->crawl == FULL) + ret = syncop_readdirp (readdir_xl, fd, 131072, offset, + NULL, &entries); + else + ret = syncop_readdir (readdir_xl, fd, 131072, offset, + &entries); + if (ret <= 0) + break; ret = 0; free_entries = _gf_true; - if (afr_up_children_count (priv->child_up, - priv->child_count) < 2) { - gf_log (this->name, GF_LOG_ERROR, "Stopping crawl as " - "< 2 children are up"); + + if (!_crawl_proceed (this, crawl_data->child, + crawl_data->crawl_flags, NULL)) { ret = -1; goto out; } - if (list_empty (&entries.list)) goto out; - ret = _perform_self_heal (this, loc, &entries, gfid, &offset, pid); + ret = _process_entries (this, loc, &entries, &offset, + crawl_data); + if ((ret < 0) && (crawl_data->crawl == INDEX_TO_BE_HEALED)) { + goto out; + } gf_dirent_free (&entries); free_entries = _gf_false; } - if (fd) 
- fd_unref (fd); ret = 0; out: if (free_entries) @@ -230,283 +1443,333 @@ out: return ret; } +static char* +position_str_get (afr_child_pos_t pos) +{ + switch (pos) { + case AFR_POS_UNKNOWN: + return "unknown"; + case AFR_POS_LOCAL: + return "local"; + case AFR_POS_REMOTE: + return "remote"; + } + return NULL; +} + int -afr_find_child_position (xlator_t *this, int child) +afr_find_child_position (xlator_t *this, int child, afr_child_pos_t *pos) { afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; dict_t *xattr_rsp = NULL; loc_t loc = {0}; int ret = 0; - gf_boolean_t local = _gf_false; - char *pathinfo = NULL; - afr_child_pos_t *pos = NULL; - inode_table_t *itable = NULL; + char *node_uuid = NULL; priv = this->private; - pos = &priv->shd.pos[child]; - - if (*pos != AFR_POS_UNKNOWN) { - goto out; - } - - //TODO: Hack to make the root_loc hack work - LOCK (&priv->lock); - { - if (!priv->root_inode) { - itable = inode_table_new (0, this); - if (!itable) - goto unlock; - priv->root_inode = inode_new (itable); - if (!priv->root_inode) - goto unlock; - } - } -unlock: - UNLOCK (&priv->lock); + shd = &priv->shd; - if (!priv->root_inode) { - ret = -1; - goto out; - } - afr_build_root_loc (priv->root_inode, &loc); + afr_build_root_loc (this, &loc); ret = syncop_getxattr (priv->children[child], &loc, &xattr_rsp, - GF_XATTR_PATHINFO_KEY); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "getxattr failed on child " - "%d", child); + GF_XATTR_NODE_UUID_KEY); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "getxattr failed on %s - " + "(%s)", priv->children[child]->name, strerror (errno)); goto out; } - ret = dict_get_str (xattr_rsp, GF_XATTR_PATHINFO_KEY, &pathinfo); + ret = dict_get_str (xattr_rsp, GF_XATTR_NODE_UUID_KEY, &node_uuid); if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Pathinfo key not found on " - "child %d", child); + gf_log (this->name, GF_LOG_ERROR, "node-uuid key not found on " + "child %s", priv->children[child]->name); goto out; } - ret = 
afr_local_pathinfo (pathinfo, &local); - if (ret) - goto out; - if (local) + if (!strcmp (node_uuid, shd->node_uuid)) *pos = AFR_POS_LOCAL; else *pos = AFR_POS_REMOTE; - gf_log (this->name, GF_LOG_INFO, "child %d is %d", child, *pos); + gf_log (this->name, GF_LOG_DEBUG, "child %s is %s", + priv->children[child]->name, position_str_get (*pos)); out: + if (ret) + *pos = AFR_POS_UNKNOWN; + loc_wipe (&loc); return ret; } -static int -afr_crawl_done (int ret, call_frame_t *sync_frame, void *data) +int +afr_syncop_find_child_position (void *data) { - GF_FREE (data); - STACK_DESTROY (sync_frame->root); - return 0; + shd_pos_t *pos_data = data; + int ret = 0; + + ret = afr_find_child_position (pos_data->this, pos_data->child, + &pos_data->pos); + return ret; } static int -afr_find_all_children_postions (xlator_t *this) +afr_dir_crawl (void *data) { - int ret = -1; - int i = 0; - gf_boolean_t succeeded = _gf_false; - afr_private_t *priv = NULL; + xlator_t *this = NULL; + int ret = -1; + xlator_t *readdir_xl = NULL; + fd_t *fd = NULL; + loc_t dirloc = {0}; + afr_crawl_data_t *crawl_data = data; - priv = this->private; - for (i = 0; i < priv->child_count; i++) { - if (priv->child_up[i] != 1) - continue; - ret = afr_find_child_position (this, i); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "Failed to determine if the " - "child %s is local.", - priv->children[i]->name); - continue; + this = THIS; + + if (!_crawl_proceed (this, crawl_data->child, crawl_data->crawl_flags, + NULL)) + goto out; + + readdir_xl = afr_crawl_readdir_xl_get (this, crawl_data); + if (!readdir_xl) + goto out; + crawl_data->readdir_xl = readdir_xl; + + ret = afr_crawl_build_start_loc (this, crawl_data, &dirloc); + if (ret) + goto out; + + ret = afr_crawl_opendir (this, crawl_data, &fd, &dirloc); + if (ret) { + if (crawl_data->crawl == INDEX_TO_BE_HEALED) { + gf_log (this->name, GF_LOG_ERROR, "Failed to open base_" + "indices_holder"); } - succeeded = _gf_true; + goto out; } - if (succeeded) - ret = 
0; + + ret = _crawl_directory (fd, &dirloc, crawl_data); + if (ret) + gf_log (this->name, GF_LOG_ERROR, "Crawl failed on %s", + readdir_xl->name); + else + gf_log (this->name, GF_LOG_DEBUG, "Crawl completed " + "on %s", readdir_xl->name); + if (crawl_data->crawl == INDEX) + dirloc.path = NULL; +out: + if (fd) + fd_unref (fd); + if ((crawl_data->crawl == INDEX) || + (crawl_data->crawl == INDEX_TO_BE_HEALED )) + dirloc.path = NULL; + loc_wipe (&dirloc); return ret; } -static gf_boolean_t -afr_local_child_exists (afr_child_pos_t *pos, unsigned int child_count) +char * +get_crawl_type_in_string (afr_crawl_type_t crawl) { - int i = 0; - gf_boolean_t local = _gf_false; - - for (i = 0; i < child_count; i++, pos++) { - if (*pos == AFR_POS_LOCAL) { - local = _gf_true; - break; - } + char *index = "INDEX"; + char *full = "FULL"; + char *crawl_type = NULL; + + if (crawl == INDEX){ + crawl_type = index; + } else if (crawl == FULL) { + crawl_type = full; } - return local; + + return crawl_type; } -int -afr_init_child_position (xlator_t *this, int child) +static int +afr_allocate_crawl_event (xlator_t *this, int child, afr_crawl_type_t crawl) { - int ret = 0; + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + int ret = 0; + shd_crawl_event_t *crawl_event = NULL; + time_t get_time = 0; - if (child == AFR_ALL_CHILDREN) { - ret = afr_find_all_children_postions (this); - } else { - ret = afr_find_child_position (this, child); + priv = this->private; + shd = &priv->shd; + + crawl_event = GF_CALLOC (sizeof (shd_crawl_event_t), 1, + gf_afr_mt_shd_crawl_event_t); + if (!crawl_event) { + ret = -1; + goto out; + } + + get_time = time(NULL); + if (get_time == ((time_t)-1)) { + ret = -1; + goto out; + } + + crawl_event->start_time_str = gf_strdup (ctime(&get_time)); + + crawl_event->crawl_type = get_crawl_type_in_string (crawl); + if (!crawl_event->crawl_type) { + ret = -1; + goto out; + } + LOCK (&priv->lock); + { + shd->crawl_events[child] = crawl_event; } + UNLOCK 
(&priv->lock); + ret = 0; +out: return ret; + } -int -afr_is_local_child (afr_self_heald_t *shd, int child, unsigned int child_count) +static int +afr_put_crawl_event_in_eh (xlator_t *this, int child) { - gf_boolean_t local = _gf_false; + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + int ret = 0; + time_t get_time = 0; + shd_crawl_event_t **crawl_event = NULL; - if (child == AFR_ALL_CHILDREN) - local = afr_local_child_exists (shd->pos, child_count); - else - local = (shd->pos[child] == AFR_POS_LOCAL); + priv = this->private; + shd = &priv->shd; - return local; + get_time = time(NULL); + if (get_time == ((time_t)-1)) { + ret = -1; + goto out; + } + crawl_event = (shd_crawl_event_t**)shd->crawl_events; + LOCK (&priv->lock); + { + crawl_event[child]->end_time_str = gf_strdup (ctime(&get_time)); + ret = eh_save_history (shd->statistics[child], + crawl_event[child]); + crawl_event[child] = NULL; + } + UNLOCK (&priv->lock); +out: + return ret; } static int -afr_crawl_directory (xlator_t *this, pid_t pid) +afr_dir_exclusive_crawl (void *data) { afr_private_t *priv = NULL; afr_self_heald_t *shd = NULL; - loc_t loc = {0}; gf_boolean_t crawl = _gf_false; - int ret = 0; - uuid_t gfid = {0}; + int ret = 0; + int child = -1; + xlator_t *this = NULL; + afr_crawl_data_t *crawl_data = data; + this = THIS; priv = this->private; shd = &priv->shd; - + child = crawl_data->child; LOCK (&priv->lock); { - if (shd->inprogress) { - shd->pending = _gf_true; + if (shd->inprogress[child]) { + if (shd->pending[child] != FULL) + shd->pending[child] = crawl_data->crawl; } else { - shd->inprogress = _gf_true; + shd->inprogress[child] = _gf_true; crawl = _gf_true; } } UNLOCK (&priv->lock); - if (!priv->root_inode) { - ret = -1; + if (!crawl) { + gf_log (this->name, GF_LOG_INFO, "Another crawl is in progress " + "for %s", priv->children[child]->name); goto out; } - if (!crawl) - goto out; - - afr_build_root_loc (priv->root_inode, &loc); - while (crawl) { - ret = _crawl_directory 
(&loc, pid, gfid); + do { + ret = afr_allocate_crawl_event (this, child, crawl_data->crawl); if (ret) - gf_log (this->name, GF_LOG_ERROR, "Crawl failed"); - else - gf_log (this->name, GF_LOG_INFO, "Crawl completed"); + goto out; + afr_dir_crawl (data); + + ret = afr_put_crawl_event_in_eh (this, child); + if (ret < 0) + goto out; + LOCK (&priv->lock); { - if (shd->pending) { - shd->pending = _gf_false; + if (shd->pending[child] != NONE) { + crawl_data->crawl = shd->pending[child]; + shd->pending[child] = NONE; } else { - shd->inprogress = _gf_false; + shd->inprogress[child] = _gf_false; crawl = _gf_false; } } UNLOCK (&priv->lock); - } -out: - return ret; -} - -static int -afr_crawl (void *data) -{ - xlator_t *this = NULL; - afr_private_t *priv = NULL; - afr_self_heald_t *shd = NULL; - int ret = -1; - afr_crawl_data_t *crawl_data = data; - - this = THIS; - priv = this->private; - shd = &priv->shd; - - ret = afr_init_child_position (this, crawl_data->child); - if (ret) - goto out; - - if (!afr_is_local_child (shd, crawl_data->child, priv->child_count)) - goto out; - - ret = afr_crawl_directory (this, crawl_data->pid); + } while (crawl); out: return ret; } void -afr_proactive_self_heal (xlator_t *this, int idx) +afr_start_crawl (xlator_t *this, int idx, afr_crawl_type_t crawl, + process_entry_cbk_t process_entry, void *op_data, + gf_boolean_t exclusive, int crawl_flags, + afr_crawl_done_cbk_t crawl_done) { afr_private_t *priv = NULL; - afr_self_heald_t *shd = NULL; call_frame_t *frame = NULL; afr_crawl_data_t *crawl_data = NULL; int ret = 0; + int (*crawler) (void*) = NULL; priv = this->private; - shd = &priv->shd; - if (!shd->enabled) - goto out; - - if ((idx != AFR_ALL_CHILDREN) && - (shd->pos[idx] == AFR_POS_REMOTE)) - goto out; frame = create_frame (this, this->ctx->pool); if (!frame) goto out; - afr_set_lk_owner (frame, this); + afr_set_lk_owner (frame, this, frame->root); afr_set_low_priority (frame); crawl_data = GF_CALLOC (1, sizeof (*crawl_data), - 
gf_afr_mt_afr_crawl_data_t); + gf_afr_mt_crawl_data_t); if (!crawl_data) goto out; + crawl_data->process_entry = process_entry; crawl_data->child = idx; crawl_data->pid = frame->root->pid; - gf_log (this->name, GF_LOG_INFO, "starting crawl for %d", idx); - ret = synctask_new (this->ctx->env, afr_crawl, - afr_crawl_done, frame, crawl_data); + crawl_data->crawl = crawl; + crawl_data->op_data = op_data; + crawl_data->crawl_flags = crawl_flags; + gf_log (this->name, GF_LOG_DEBUG, "starting crawl %d for %s", + crawl_data->crawl, priv->children[idx]->name); + + if (exclusive) + crawler = afr_dir_exclusive_crawl; + else + crawler = afr_dir_crawl; + ret = synctask_new (this->ctx->env, crawler, + crawl_done, frame, crawl_data); if (ret) - gf_log (this->name, GF_LOG_ERROR, "Could not create the " - "task for %d ret %d", idx, ret); + gf_log (this->name, GF_LOG_ERROR, "afr crawl failed for child" + " %d with ret %d", idx, ret); out: return; } -//TODO: This is a hack void -afr_build_root_loc (inode_t *inode, loc_t *loc) +afr_build_root_loc (xlator_t *this, loc_t *loc) { - loc->path = "/"; - loc->name = ""; - loc->inode = inode; - loc->ino = 1; - loc->inode->ino = 1; - loc->inode->ia_type = IA_IFDIR; - memset (loc->inode->gfid, 0, 16); - loc->inode->gfid[15] = 1; + afr_private_t *priv = NULL; + priv = this->private; + loc->path = gf_strdup ("/"); + loc->name = ""; + loc->inode = inode_ref (priv->root_inode); + uuid_copy (loc->gfid, loc->inode->gfid); } int @@ -522,4 +1785,3 @@ afr_set_root_gfid (dict_t *dict) return ret; } - diff --git a/xlators/cluster/afr/src/afr-self-heald.h b/xlators/cluster/afr/src/afr-self-heald.h index d5f9552c1..e0c083754 100644 --- a/xlators/cluster/afr/src/afr-self-heald.h +++ b/xlators/cluster/afr/src/afr-self-heald.h @@ -1,20 +1,11 @@ /* - Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. 
- - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef __AFR_SELF_HEALD_H__ @@ -27,23 +18,48 @@ #define AFR_ALL_CHILDREN -1 typedef struct afr_crawl_data_ { - int child; - pid_t pid; + int child; + pid_t pid; + afr_crawl_type_t crawl; + xlator_t *readdir_xl; + void *op_data; + int crawl_flags; + int (*process_entry) (xlator_t *this, struct afr_crawl_data_ *crawl_data, + gf_dirent_t *entry, loc_t *child, loc_t *parent, + struct iatt *iattr); } afr_crawl_data_t; -void afr_proactive_self_heal (xlator_t *this, int idx); +typedef struct crawl_event_stats_ { + uint64_t healed_count; + uint64_t split_brain_count; + uint64_t heal_failed_count; + char *start_time_str; + char *end_time_str; + char *crawl_type; + gf_boolean_t crawl_inprogress; +} shd_crawl_event_t; -void afr_build_root_loc (inode_t *inode, loc_t *loc); +void _destroy_crawl_event_data (void *data); +void _destroy_shd_event_data (void *data); -int afr_set_root_gfid (dict_t *dict); +typedef int (*process_entry_cbk_t) (xlator_t *this, afr_crawl_data_t *crawl_data, + gf_dirent_t *entry, 
loc_t *child, loc_t *parent, + struct iatt *iattr); -inline void -afr_generate_gfid_on_empty (uuid_t gfid); +void afr_build_root_loc (xlator_t *this, loc_t *loc); -inline void -afr_empty_gfid_on_set (uuid_t gfid, int lookup_status, struct iatt *iatt); +int afr_set_root_gfid (dict_t *dict); -inline void -afr_fill_loc_info (loc_t *loc, struct iatt *iatt, struct iatt *parent); +void +afr_proactive_self_heal (void *data); +int +afr_xl_op (xlator_t *this, dict_t *input, dict_t *output); + +/* + * In addition to its self-heal use, this is used to find a local default + * read_child. + */ +int +afr_local_pathinfo (char *pathinfo, gf_boolean_t *local); #endif /* __AFR_SELF_HEALD_H__ */ diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c index d3960dcff..20306e469 100644 --- a/xlators/cluster/afr/src/afr-transaction.c +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -1,25 +1,17 @@ /* - Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ #include "dict.h" #include "byte-order.h" #include "common-utils.h" +#include "timer.h" #include "afr.h" #include "afr-transaction.h" @@ -32,48 +24,75 @@ of RENAME */ #define LOCKED_LOWER 0x2 /* for lower_path of RENAME */ - afr_fd_ctx_t * -afr_fd_ctx_get (fd_t *fd, xlator_t *this) +__afr_fd_ctx_get (fd_t *fd, xlator_t *this) { uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; int ret = 0; + afr_fd_ctx_t *fd_ctx = NULL; + int i = 0; + afr_private_t *priv = NULL; - ret = fd_ctx_get (fd, this, &ctx); + priv = this->private; - if (ret < 0) - goto out; + ret = __fd_ctx_get (fd, this, &ctx); - fd_ctx = (afr_fd_ctx_t *)(long) ctx; + if (ret < 0 && fd_is_anonymous (fd)) { + ret = __afr_fd_ctx_set (this, fd); + if (ret < 0) + goto out; + ret = __fd_ctx_get (fd, this, &ctx); + if (ret < 0) + goto out; + + fd_ctx = (afr_fd_ctx_t *)(long) ctx; + for (i = 0; i < priv->child_count; i++) + fd_ctx->opened_on[i] = AFR_FD_OPENED; + } + + fd_ctx = (afr_fd_ctx_t *)(long) ctx; out: return fd_ctx; } +afr_fd_ctx_t * +afr_fd_ctx_get (fd_t *fd, xlator_t *this) +{ + afr_fd_ctx_t *fd_ctx = NULL; + + LOCK(&fd->lock); + { + fd_ctx = __afr_fd_ctx_get (fd, this); + } + UNLOCK(&fd->lock); + + return fd_ctx; +} + + static void -afr_pid_save (call_frame_t *frame) +afr_save_lk_owner (call_frame_t *frame) { afr_local_t * local = NULL; local = frame->local; - local->saved_pid = frame->root->pid; + local->saved_lk_owner = frame->root->lk_owner; } static void -afr_pid_restore (call_frame_t *frame) +afr_restore_lk_owner (call_frame_t *frame) { afr_local_t * local = NULL; local = frame->local; - frame->root->pid = local->saved_pid; + frame->root->lk_owner = local->saved_lk_owner; } - static void __mark_all_pending (int32_t *pending[], int child_count, afr_transaction_type type) @@ -126,51 +145,23 @@ out: return; } - static void -__mark_pre_op_undone_on_fd (call_frame_t *frame, xlator_t *this, int child_index) -{ - afr_local_t *local = NULL; - afr_fd_ctx_t *fd_ctx = NULL; - - local = frame->local; - - 
if (!local->fd) - return; - - fd_ctx = afr_fd_ctx_get (local->fd, this); - - if (!fd_ctx) - goto out; - - LOCK (&local->fd->lock); - { - if (local->transaction.type == AFR_DATA_TRANSACTION) - fd_ctx->pre_op_done[child_index]--; - } - UNLOCK (&local->fd->lock); -out: - return; -} - - -static void -__mark_down_children (int32_t *pending[], int child_count, - unsigned char *child_up, afr_transaction_type type) +__mark_non_participant_children (int32_t *pending[], int child_count, + unsigned char *participants, + afr_transaction_type type) { int i = 0; int j = 0; + j = afr_index_for_transaction_type (type); for (i = 0; i < child_count; i++) { - j = afr_index_for_transaction_type (type); - - if (!child_up[i]) + if (!participants[i]) pending[i][j] = 0; } } -static void +void __mark_all_success (int32_t *pending[], int child_count, afr_transaction_type type) { @@ -183,6 +174,54 @@ __mark_all_success (int32_t *pending[], int child_count, } } +void +_set_all_child_errno (int *child_errno, unsigned int child_count) +{ + int i = 0; + + for (i = 0; i < child_count; i++) + if (child_errno[i] == 0) + child_errno[i] = ENOTCONN; +} + +void +afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + fd_t *fd = NULL; + + local = frame->local; + priv = this->private; + fd = local->fd; + + __mark_all_success (local->pending, priv->child_count, + local->transaction.type); + + _set_all_child_errno (local->child_errno, priv->child_count); + + /* Perform fops with the lk-owner from top xlator. + * Eg: lk-owner of posix-lk and flush should be same, + * flush cant clear the posix-lks without that lk-owner. + */ + afr_save_lk_owner (frame); + frame->root->lk_owner = + local->transaction.main_frame->root->lk_owner; + + + /* The wake up needs to happen independent of + what type of fop arrives here. If it was + a write, then it has already inherited the + lock and changelog. 
If it was not a write, + then the presumption of the optimization (of + optimizing for successive write operations) + fails. + */ + if (fd) + afr_delayed_changelog_wake_up (this, fd); + local->transaction.fop (frame, this); +} + static int __changelog_enabled (afr_private_t *priv, afr_transaction_type type) @@ -246,64 +285,42 @@ __fop_changelog_needed (call_frame_t *frame, xlator_t *this) return op_ret; } - -static int -afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending) +int +afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending, + int child, afr_xattrop_type_t op) { int i = 0; int ret = 0; + if (op == LOCAL_FIRST) { + ret = dict_set_static_bin (xattr, priv->pending_key[child], + pending[child], + AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); + if (ret) + goto out; + } for (i = 0; i < priv->child_count; i++) { + if (i == child) + continue; ret = dict_set_static_bin (xattr, priv->pending_key[i], - pending[i], 3 * sizeof (int32_t)); + pending[i], + AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); /* 3 = data+metadata+entry */ if (ret < 0) goto out; } - -out: - return ret; -} - - -static int -afr_set_piggyback_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending, - afr_transaction_type type) -{ - int i = 0; - int ret = 0; - int *arr = NULL; - int index = 0; - size_t pending_xattr_size = 3 * sizeof (int32_t); - /* 3 = data+metadata+entry */ - - index = afr_index_for_transaction_type (type); - - for (i = 0; i < priv->child_count; i++) { - arr = GF_CALLOC (1, pending_xattr_size, - gf_afr_mt_char); - if (!arr) { - ret = -1; - goto out; - } - - memcpy (arr, pending[i], pending_xattr_size); - - arr[index] = hton32 (ntoh32(arr[index]) + 1); - - ret = dict_set_bin (xattr, priv->pending_key[i], - arr, pending_xattr_size); - - if (ret < 0) + if (op == LOCAL_LAST) { + ret = dict_set_static_bin (xattr, priv->pending_key[child], + pending[child], + AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); + if (ret) goto out; } - out: return ret; } - int 
afr_lock_server_count (afr_private_t *priv, afr_transaction_type type) { @@ -331,7 +348,8 @@ afr_lock_server_count (afr_private_t *priv, afr_transaction_type type) int32_t afr_changelog_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xattr) + int32_t op_ret, int32_t op_errno, dict_t *xattr, + dict_t *xdata) { afr_internal_lock_t *int_lock = NULL; afr_private_t *priv = NULL; @@ -349,6 +367,11 @@ afr_changelog_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, UNLOCK (&frame->lock); if (call_count == 0) { + if (local->transaction.resume_stub) { + call_resume (local->transaction.resume_stub); + local->transaction.resume_stub = NULL; + } + if (afr_lock_server_count (priv, local->transaction.type) == 0) { local->transaction.done (frame, this); } else { @@ -382,63 +405,74 @@ afr_transaction_rm_stale_children (call_frame_t *frame, xlator_t *this, local = frame->local; pending = local->pending; - stale_children = afr_children_create (priv->child_count); - if (!stale_children) + if (local->op_ret < 0) goto out; - fresh_children = local->fresh_children; read_child = afr_inode_get_read_ctx (this, inode, fresh_children); - - GF_ASSERT (read_child >= 0); - - if (pending[read_child][idx] == 0) - read_child = -1; + if (read_child < 0) { + gf_log (this->name, GF_LOG_DEBUG, "Possible split-brain " + "for %s", uuid_utoa (inode->gfid)); + goto out; + } for (i = 0; i < priv->child_count; i++) { if (!afr_is_child_present (fresh_children, priv->child_count, i)) continue; - if (pending[i][idx] == 0) { - /* child is down or op failed on it */ - rm_stale_children = _gf_true; - afr_children_rm_child (fresh_children, i, - priv->child_count); - stale_children[count++] = i; - } - } + if (pending[i][idx]) + continue; + /* child is down or op failed on it */ + if (!stale_children) + stale_children = afr_children_create (priv->child_count); + if (!stale_children) + goto out; - if (!rm_stale_children) { - GF_ASSERT 
(read_child >= 0); - goto out; + rm_stale_children = _gf_true; + stale_children[count++] = i; + gf_log (this->name, GF_LOG_DEBUG, "Removing stale child " + "%d for %s", i, uuid_utoa (inode->gfid)); } - if (fresh_children[0] == -1) { - //All children failed. leave as-is + if (!rm_stale_children) goto out; - } - if (read_child == -1) - read_child = fresh_children[0]; - afr_inode_rm_stale_children (this, inode, read_child, stale_children); + afr_inode_rm_stale_children (this, inode, stale_children); out: - if (stale_children) - GF_FREE (stale_children); + GF_FREE (stale_children); return; } +afr_inodelk_t* +afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom) +{ + afr_inodelk_t *inodelk = NULL; + int i = 0; + + for (i = 0; int_lock->inodelk[i].domain; i++) { + inodelk = &int_lock->inodelk[i]; + if (strcmp (dom, inodelk->domain) == 0) + return inodelk; + } + return NULL; +} + unsigned char* afr_locked_nodes_get (afr_transaction_type type, afr_internal_lock_t *int_lock) { unsigned char *locked_nodes = NULL; + afr_inodelk_t *inodelk = NULL; switch (type) { case AFR_DATA_TRANSACTION: case AFR_METADATA_TRANSACTION: - locked_nodes = int_lock->inode_locked_nodes; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + locked_nodes = inodelk->locked_nodes; break; case AFR_ENTRY_TRANSACTION: case AFR_ENTRY_RENAME_TRANSACTION: - locked_nodes = int_lock->entry_locked_nodes; + /*Because same set of subvols participate in all lockee + * entities*/ + locked_nodes = int_lock->lockee[0].locked_nodes; break; } return locked_nodes; @@ -456,19 +490,161 @@ afr_changelog_pre_op_call_count (afr_transaction_type type, GF_ASSERT (locked_nodes); call_count = afr_locked_children_count (locked_nodes, child_count); - if (type == AFR_ENTRY_RENAME_TRANSACTION) { + if (type == AFR_ENTRY_RENAME_TRANSACTION) call_count *= 2; - } return call_count; } int -afr_changelog_post_op (call_frame_t *frame, xlator_t *this) +afr_changelog_post_op_call_count (afr_transaction_type type, + unsigned char 
*pre_op, + unsigned int child_count) +{ + int call_count = 0; + + call_count = afr_pre_op_done_children_count (pre_op, child_count); + if (type == AFR_ENTRY_RENAME_TRANSACTION) + call_count *= 2; + + return call_count; +} + +void +afr_compute_txn_changelog (afr_local_t *local, afr_private_t *priv) +{ + int i = 0; + int index = 0; + int32_t postop = 0; + int32_t preop = 1; + int32_t **txn_changelog = NULL; + + txn_changelog = local->transaction.txn_changelog; + index = afr_index_for_transaction_type (local->transaction.type); + for (i = 0; i < priv->child_count; i++) { + postop = ntoh32 (local->pending[i][index]); + txn_changelog[i][index] = hton32 (postop + preop); + } +} + +afr_xattrop_type_t +afr_get_postop_xattrop_type (int32_t **pending, int optimized, int child, + afr_transaction_type type) +{ + int index = 0; + afr_xattrop_type_t op = LOCAL_LAST; + + index = afr_index_for_transaction_type (type); + if (optimized && !pending[child][index]) + op = LOCAL_FIRST; + return op; +} + +void +afr_set_postop_dict (afr_local_t *local, xlator_t *this, dict_t *xattr, + int optimized, int child) +{ + int32_t **txn_changelog = NULL; + int32_t **changelog = NULL; + afr_private_t *priv = NULL; + int ret = 0; + afr_xattrop_type_t op = LOCAL_LAST; + + priv = this->private; + txn_changelog = local->transaction.txn_changelog; + op = afr_get_postop_xattrop_type (local->pending, optimized, child, + local->transaction.type); + if (optimized) + changelog = txn_changelog; + else + changelog = local->pending; + ret = afr_set_pending_dict (priv, xattr, changelog, child, op); + if (ret < 0) + gf_log (this->name, GF_LOG_INFO, + "failed to set pending entry"); +} + + +gf_boolean_t +afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int index = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + index = afr_index_for_transaction_type (local->transaction.type); + + for (i = 0; i < priv->child_count; 
i++) { + if (local->pending[i][index] == 0) + return _gf_false; + } + + return _gf_true; +} + +static void +afr_dir_fop_handle_all_fop_failures (call_frame_t *frame) +{ + xlator_t *this = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + this = frame->this; + local = frame->local; + priv = this->private; + + if ((local->transaction.type != AFR_ENTRY_TRANSACTION) && + (local->transaction.type != AFR_ENTRY_RENAME_TRANSACTION)) + return; + + if (local->op_ret >= 0) + goto out; + + __mark_all_success (local->pending, priv->child_count, + local->transaction.type); +out: + return; +} + +static void +afr_data_handle_quota_errors (call_frame_t *frame, xlator_t *this) +{ + int i = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + gf_boolean_t all_quota_failures = _gf_false; + + local = frame->local; + priv = this->private; + if (local->transaction.type != AFR_DATA_TRANSACTION) + return; + /* + * Idea is to not leave the file in FOOL-FOOL scenario in case on + * all the bricks data transaction failed with EDQUOT to avoid + * increasing un-necessary load of self-heals in the system. 
+ */ + all_quota_failures = _gf_true; + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i] && + (local->child_errno[i] != EDQUOT)) { + all_quota_failures = _gf_false; + break; + } + } + if (all_quota_failures) + __mark_all_success (local->pending, priv->child_count, + local->transaction.type); +} + +int +afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this) { afr_private_t * priv = this->private; afr_internal_lock_t *int_lock = NULL; - int ret = 0; int i = 0; int call_count = 0; @@ -476,14 +652,17 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this) afr_fd_ctx_t *fdctx = NULL; dict_t **xattr = NULL; int piggyback = 0; - int index = 0; int nothing_failed = 1; local = frame->local; int_lock = &local->internal_lock; - __mark_down_children (local->pending, priv->child_count, - local->child_up, local->transaction.type); + __mark_non_participant_children (local->pending, priv->child_count, + local->transaction.pre_op, + local->transaction.type); + + afr_data_handle_quota_errors (frame, this); + afr_dir_fop_handle_all_fop_failures (frame); if (local->fd) afr_transaction_rm_stale_children (frame, this, @@ -496,8 +675,9 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this) xattr[i] = dict_new (); } - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); + call_count = afr_changelog_post_op_call_count (local->transaction.type, + local->transaction.pre_op, + priv->child_count); local->call_count = call_count; if (local->fd) @@ -510,82 +690,65 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this) goto out; } - /* check if something has failed, to handle piggybacking */ - nothing_failed = 1; - index = afr_index_for_transaction_type (local->transaction.type); - for (i = 0; i < priv->child_count; i++) { - if (local->pending[i][index] == 0) { - nothing_failed = 0; - break; - } - } + nothing_failed = afr_txn_nothing_failed (frame, this); - index = afr_index_for_transaction_type 
(local->transaction.type); - if (local->optimistic_change_log && - local->transaction.type != AFR_DATA_TRANSACTION) { - /* if nothing_failed, then local->pending[..] == {0 .. 0} */ - for (i = 0; i < priv->child_count; i++) - local->pending[i][index]++; - } + afr_compute_txn_changelog (local , priv); for (i = 0; i < priv->child_count; i++) { if (!local->transaction.pre_op[i]) continue; - ret = afr_set_pending_dict (priv, xattr[i], - local->pending); - - if (ret < 0) - gf_log (this->name, GF_LOG_INFO, - "failed to set pending entry"); - + if (local->transaction.type != AFR_DATA_TRANSACTION) + afr_set_postop_dict (local, this, xattr[i], + local->optimistic_change_log, i); switch (local->transaction.type) { case AFR_DATA_TRANSACTION: { if (!fdctx) { + afr_set_postop_dict (local, this, xattr[i], + 0, i); STACK_WIND (frame, afr_changelog_post_op_cbk, priv->children[i], priv->children[i]->fops->xattrop, &local->loc, - GF_XATTROP_ADD_ARRAY, xattr[i]); + GF_XATTROP_ADD_ARRAY, xattr[i], + NULL); break; } - LOCK (&local->fd->lock); - { - piggyback = 0; - if (fdctx->pre_op_piggyback[i]) { - fdctx->pre_op_piggyback[i]--; - piggyback = 1; - } - } - UNLOCK (&local->fd->lock); + /* local->transaction.postop_piggybacked[] was + precomputed in is_piggyback_postop() when called from + afr_changelog_post_op_safe() + */ + + piggyback = 0; + if (local->transaction.postop_piggybacked[i]) + piggyback = 1; - if (piggyback && !nothing_failed) - ret = afr_set_piggyback_dict (priv, xattr[i], - local->pending, - local->transaction.type); + afr_set_postop_dict (local, this, xattr[i], + piggyback, i); if (nothing_failed && piggyback) { afr_changelog_post_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i]); + this, 1, 0, xattr[i], NULL); } else { - __mark_pre_op_undone_on_fd (frame, this, i); STACK_WIND_COOKIE (frame, afr_changelog_post_op_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->fxattrop, local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i]); + GF_XATTROP_ADD_ARRAY, 
xattr[i], + NULL); } } break; case AFR_METADATA_TRANSACTION: { - if (nothing_failed) { + if (nothing_failed && local->optimistic_change_log) { afr_changelog_post_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i]); + this, 1, 0, xattr[i], + NULL); break; } @@ -594,28 +757,32 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this) priv->children[i], priv->children[i]->fops->fxattrop, local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i]); + GF_XATTROP_ADD_ARRAY, xattr[i], + NULL); else STACK_WIND (frame, afr_changelog_post_op_cbk, priv->children[i], priv->children[i]->fops->xattrop, &local->loc, - GF_XATTROP_ADD_ARRAY, xattr[i]); + GF_XATTROP_ADD_ARRAY, xattr[i], + NULL); } break; case AFR_ENTRY_RENAME_TRANSACTION: { - if (nothing_failed) { + if (nothing_failed && local->optimistic_change_log) { afr_changelog_post_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i]); + this, 1, 0, xattr[i], + NULL); } else { STACK_WIND_COOKIE (frame, afr_changelog_post_op_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->xattrop, &local->transaction.new_parent_loc, - GF_XATTROP_ADD_ARRAY, xattr[i]); + GF_XATTROP_ADD_ARRAY, xattr[i], + NULL); } call_count--; } @@ -628,20 +795,17 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this) value */ - ret = afr_set_pending_dict (priv, xattr[i], - local->pending); - - if (ret < 0) - gf_log (this->name, GF_LOG_INFO, - "failed to set pending entry"); + afr_set_postop_dict (local, this, xattr[i], + local->optimistic_change_log, i); /* fall through */ case AFR_ENTRY_TRANSACTION: { - if (nothing_failed) { + if (nothing_failed && local->optimistic_change_log) { afr_changelog_post_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i]); + this, 1, 0, xattr[i], + NULL); break; } @@ -650,13 +814,15 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this) priv->children[i], priv->children[i]->fops->fxattrop, local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i]); + GF_XATTROP_ADD_ARRAY, xattr[i], + NULL); else STACK_WIND 
(frame, afr_changelog_post_op_cbk, priv->children[i], priv->children[i]->fops->xattrop, &local->transaction.parent_loc, - GF_XATTROP_ADD_ARRAY, xattr[i]); + GF_XATTROP_ADD_ARRAY, xattr[i], + NULL); } break; } @@ -676,7 +842,8 @@ out: int32_t afr_changelog_pre_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xattr) + int32_t op_ret, int32_t op_errno, dict_t *xattr, + dict_t *xdata) { afr_local_t * local = NULL; afr_private_t * priv = this->private; @@ -721,12 +888,7 @@ afr_changelog_pre_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, (local->op_errno == ENOTSUP)) { local->transaction.resume (frame, this); } else { - __mark_all_success (local->pending, priv->child_count, - local->transaction.type); - - afr_pid_restore (frame); - - local->transaction.fop (frame, this); + afr_transaction_perform_fop (frame, this); } } @@ -779,8 +941,8 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) for (i = 0; i < priv->child_count; i++) { if (!locked_nodes[i]) continue; - ret = afr_set_pending_dict (priv, xattr[i], - local->pending); + ret = afr_set_pending_dict (priv, xattr[i], local->pending, + i, LOCAL_FIRST); if (ret < 0) gf_log (this->name, GF_LOG_INFO, @@ -797,7 +959,8 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) priv->children[i], priv->children[i]->fops->xattrop, &(local->loc), - GF_XATTROP_ADD_ARRAY, xattr[i]); + GF_XATTROP_ADD_ARRAY, xattr[i], + NULL); break; } @@ -814,9 +977,12 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) } UNLOCK (&local->fd->lock); + afr_set_delayed_post_op (frame, this); + if (piggyback) afr_changelog_pre_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i]); + this, 1, 0, xattr[i], + NULL); else STACK_WIND_COOKIE (frame, afr_changelog_pre_op_cbk, @@ -824,14 +990,16 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) priv->children[i], priv->children[i]->fops->fxattrop, local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i]); + 
GF_XATTROP_ADD_ARRAY, xattr[i], + NULL); } break; case AFR_METADATA_TRANSACTION: { if (local->optimistic_change_log) { afr_changelog_pre_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i]); + this, 1, 0, xattr[i], + NULL); break; } @@ -842,7 +1010,8 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) priv->children[i], priv->children[i]->fops->fxattrop, local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i]); + GF_XATTROP_ADD_ARRAY, xattr[i], + NULL); else STACK_WIND_COOKIE (frame, afr_changelog_pre_op_cbk, @@ -850,7 +1019,8 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) priv->children[i], priv->children[i]->fops->xattrop, &(local->loc), - GF_XATTROP_ADD_ARRAY, xattr[i]); + GF_XATTROP_ADD_ARRAY, xattr[i], + NULL); } break; @@ -858,7 +1028,8 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) { if (local->optimistic_change_log) { afr_changelog_pre_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i]); + this, 1, 0, xattr[i], + NULL); } else { STACK_WIND_COOKIE (frame, afr_changelog_pre_op_cbk, @@ -866,7 +1037,8 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) priv->children[i], priv->children[i]->fops->xattrop, &local->transaction.new_parent_loc, - GF_XATTROP_ADD_ARRAY, xattr[i]); + GF_XATTROP_ADD_ARRAY, xattr[i], + NULL); } call_count--; @@ -881,8 +1053,8 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) value */ - ret = afr_set_pending_dict (priv, xattr[i], - local->pending); + ret = afr_set_pending_dict (priv, xattr[i], local->pending, + i, LOCAL_FIRST); if (ret < 0) gf_log (this->name, GF_LOG_INFO, @@ -894,7 +1066,8 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) { if (local->optimistic_change_log) { afr_changelog_pre_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i]); + this, 1, 0, xattr[i], + NULL); break; } @@ -905,7 +1078,8 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) priv->children[i], priv->children[i]->fops->fxattrop, local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i]); + 
GF_XATTROP_ADD_ARRAY, xattr[i], + NULL); else STACK_WIND_COOKIE (frame, afr_changelog_pre_op_cbk, @@ -913,7 +1087,8 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) priv->children[i], priv->children[i]->fops->xattrop, &local->transaction.parent_loc, - GF_XATTROP_ADD_ARRAY, xattr[i]); + GF_XATTROP_ADD_ARRAY, xattr[i], + NULL); } break; } @@ -1075,12 +1250,14 @@ int afr_set_transaction_flock (afr_local_t *local) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; int_lock = &local->internal_lock; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); - int_lock->lk_flock.l_len = local->transaction.len; - int_lock->lk_flock.l_start = local->transaction.start; - int_lock->lk_flock.l_type = F_WRLCK; + inodelk->flock.l_len = local->transaction.len; + inodelk->flock.l_start = local->transaction.start; + inodelk->flock.l_type = F_WRLCK; return 0; } @@ -1095,6 +1272,7 @@ afr_lock_rec (call_frame_t *frame, xlator_t *this) int_lock = &local->internal_lock; int_lock->transaction_lk_type = AFR_TRANSACTION_LK; + int_lock->domain = this->name; switch (local->transaction.type) { case AFR_DATA_TRANSACTION: @@ -1108,8 +1286,8 @@ afr_lock_rec (call_frame_t *frame, xlator_t *this) case AFR_ENTRY_RENAME_TRANSACTION: - int_lock->lock_cbk = afr_post_blocking_rename_cbk; - afr_blocking_lock (frame, this); + int_lock->lock_cbk = afr_post_nonblocking_entrylk_cbk; + afr_nonblocking_entrylk (frame, this); break; case AFR_ENTRY_TRANSACTION: @@ -1131,12 +1309,6 @@ afr_lock_rec (call_frame_t *frame, xlator_t *this) int afr_lock (call_frame_t *frame, xlator_t *this) { - afr_pid_save (frame); - - frame->root->pid = (long) frame->root; - - afr_set_lk_owner (frame, this); - afr_set_lock_number (frame, this); return afr_lock_rec (frame, this); @@ -1148,28 +1320,463 @@ afr_lock (call_frame_t *frame, xlator_t *this) int afr_internal_lock_finish (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + if 
(__fop_changelog_needed (frame, this)) { + afr_changelog_pre_op (frame, this); + } else { + afr_transaction_perform_fop (frame, this); + } + + return 0; +} + + +void +afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + /* call this function from any of the related optimizations + which benefit from delaying post op are enabled, namely: + + - changelog piggybacking + - eager locking + */ + + priv = this->private; + if (!priv) + return; + + if (!priv->post_op_delay_secs) + return; + + local = frame->local; + if (!local->transaction.eager_lock_on) + return; + + if (!local) + return; + + if (!local->fd) + return; + + if (local->op == GF_FOP_WRITE) + local->delayed_post_op = _gf_true; +} + +gf_boolean_t +afr_are_multiple_fds_opened (inode_t *inode, xlator_t *this) +{ + afr_inode_ctx_t *ictx = NULL; + + if (!inode) { + /* If false is returned, it may keep on taking eager-lock + * which may lead to starvation, so return true to avoid that. + */ + gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid inode"); + return _gf_true; + } + /* Lets say mount1 has eager-lock(full-lock) and after the eager-lock + * is taken mount2 opened the same file, it won't be able to + * perform any data operations until mount1 releases eager-lock. 
+ * To avoid such scenario do not enable eager-lock for this transaction + * if open-fd-count is > 1 + */ + + ictx = afr_inode_ctx_get (inode, this); + if (!ictx) + return _gf_true; + + if (ictx->open_fd_count > 1) + return _gf_true; + + return _gf_false; +} + +gf_boolean_t +afr_any_fops_failed (afr_local_t *local, afr_private_t *priv) +{ + if (local->success_count != priv->child_count) + return _gf_true; + return _gf_false; +} + +gf_boolean_t +is_afr_delayed_changelog_post_op_needed (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + gf_boolean_t res = _gf_false; + afr_private_t *priv = NULL; priv = this->private; + local = frame->local; + if (!local) + goto out; - if (__fop_changelog_needed (frame, this)) { - afr_changelog_pre_op (frame, this); - } else { - __mark_all_success (local->pending, priv->child_count, - local->transaction.type); + if (!local->delayed_post_op) + goto out; + + //Mark pending changelog ASAP + if (afr_any_fops_failed (local, priv)) + goto out; + + if (local->fd && afr_are_multiple_fds_opened (local->fd->inode, this)) + goto out; + + res = _gf_true; +out: + return res; +} + + +void +afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd, + call_stub_t *stub); + +void +afr_delayed_changelog_wake_up_cbk (void *data) +{ + fd_t *fd = NULL; + + fd = data; - afr_pid_restore (frame); + afr_delayed_changelog_wake_up (THIS, fd); +} + + +/* + Check if the frame is destined to get optimized away + with changelog piggybacking +*/ +static gf_boolean_t +is_piggyback_post_op (call_frame_t *frame, fd_t *fd) +{ + afr_fd_ctx_t *fdctx = NULL; + afr_local_t *local = NULL; + gf_boolean_t piggyback = _gf_true; + afr_private_t *priv = NULL; + int i = 0; + + priv = frame->this->private; + local = frame->local; + fdctx = afr_fd_ctx_get (fd, frame->this); - local->transaction.fop (frame, this); + LOCK(&fd->lock); + { + piggyback = _gf_true; + + for (i = 0; i < priv->child_count; i++) { + if (!local->transaction.pre_op[i]) + 
continue; + if (fdctx->pre_op_piggyback[i]) { + fdctx->pre_op_piggyback[i]--; + local->transaction.postop_piggybacked[i] = 1; + } else { + /* For at least _one_ subvolume we cannot + piggyback on the changelog, and have to + perform a hard POST-OP and therefore fsync + if necesssary + */ + piggyback = _gf_false; + GF_ASSERT (fdctx->pre_op_done[i]); + fdctx->pre_op_done[i]--; + } + } } + UNLOCK(&fd->lock); + + if (!afr_txn_nothing_failed (frame, frame->this)) { + /* something failed in this transaction, + we will be performing a hard post-op + */ + return _gf_false; + } + + return piggyback; +} + + +/* SET operation */ +int +afr_fd_report_unstable_write (xlator_t *this, fd_t *fd) +{ + afr_fd_ctx_t *fdctx = NULL; + + fdctx = afr_fd_ctx_get (fd, this); + + LOCK(&fd->lock); + { + fdctx->witnessed_unstable_write = _gf_true; + } + UNLOCK(&fd->lock); + + return 0; +} + +/* TEST and CLEAR operation */ +gf_boolean_t +afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd) +{ + afr_fd_ctx_t *fdctx = NULL; + gf_boolean_t witness = _gf_false; + + fdctx = afr_fd_ctx_get (fd, this); + if (!fdctx) + return _gf_true; + + LOCK(&fd->lock); + { + if (fdctx->witnessed_unstable_write) { + witness = _gf_true; + fdctx->witnessed_unstable_write = _gf_false; + } + } + UNLOCK (&fd->lock); + + return witness; +} + + +int +afr_changelog_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata) +{ + afr_private_t *priv = NULL; + int child_index = (long) cookie; + int call_count = -1; + afr_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + if (afr_fop_failed (op_ret, op_errno)) { + /* Failure of fsync() is as good as failure of previous + write(). So treat it like one. + */ + gf_log (this->name, GF_LOG_WARNING, + "fsync(%s) failed on subvolume %s. 
Transaction was %s", + uuid_utoa (local->fd->inode->gfid), + priv->children[child_index]->name, + gf_fop_list[local->op]); + + afr_transaction_fop_failed (frame, this, child_index); + } + + call_count = afr_frame_return (frame); + + if (call_count == 0) + afr_changelog_post_op_now (frame, this); return 0; } int +afr_changelog_fsync (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + int i = 0; + int call_count = 0; + afr_private_t *priv = NULL; + dict_t *xdata = NULL; + GF_UNUSED int ret = -1; + + local = frame->local; + priv = this->private; + + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); + + if (!call_count) { + /* will go straight to unlock */ + afr_changelog_post_op_now (frame, this); + return 0; + } + + local->call_count = call_count; + + xdata = dict_new(); + if (xdata) + ret = dict_set_int32 (xdata, "batch-fsync", 1); + + for (i = 0; i < priv->child_count; i++) { + if (!local->transaction.pre_op[i]) + continue; + + STACK_WIND_COOKIE (frame, afr_changelog_fsync_cbk, + (void *) (long) i, priv->children[i], + priv->children[i]->fops->fsync, local->fd, + 1, xdata); + if (!--call_count) + break; + } + + if (xdata) + dict_unref (xdata); + + return 0; +} + + + int +afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + if (!local->fd || local->transaction.type != AFR_DATA_TRANSACTION) { + afr_changelog_post_op_now (frame, this); + return 0; + } + + if (is_piggyback_post_op (frame, local->fd)) { + /* just detected that this post-op is about to + be optimized away as a new write() has + already piggybacked on this frame's changelog. + */ + afr_changelog_post_op_now (frame, this); + return 0; + } + + /* Calling afr_changelog_post_op_now() now will result in + issuing ->[f]xattrop(). 
+ + Performing a hard POST-OP (->[f]xattrop() FOP) is a more + responsible operation that what it might appear on the surface. + + The changelog of a file (in the xattr of the file on the server) + stores information (pending count) about the state of the file + on the OTHER server. This changelog is blindly trusted, and must + therefore be updated in such a way it remains trustworthy. This + implies that decrementing the pending count (essentially "clearing + the dirty flag") must be done STRICTLY after we are sure that the + operation on the other server has reached stable storage. + + While the backend filesystem on that server will eventually flush + it to stable storage, we (being in userspace) have no mechanism + to get notified when the write became "stable". + + This means we need take matter into our own hands and issue an + fsync() EVEN IF THE APPLICATION WAS PERFORMING UNSTABLE WRITES, + and get an acknowledgement for it. And we need to wait for the + fsync() acknowledgement before initiating the hard POST-OP. + + However if the FD itself was opened in O_SYNC or O_DSYNC then + we are already guaranteed that the writes were made stable as + part of the FOP itself. The same holds true for NFS stable + writes which happen on an anonymous FD with O_DSYNC or O_SYNC + flag set in the writev() @flags param. For all other write types, + mark a flag in the fdctx whenever an unstable write is witnessed. + */ + + if (!afr_fd_has_witnessed_unstable_write (this, local->fd)) { + afr_changelog_post_op_now (frame, this); + return 0; + } + + /* Check whether users want durability and perform fsync/post-op + * accordingly. 
+ */ + if (priv->ensure_durability) { + /* Time to fsync() */ + afr_changelog_fsync (frame, this); + } else { + afr_changelog_post_op_now (frame, this); + } + + return 0; +} + + +void +afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd, + call_stub_t *stub) +{ + afr_fd_ctx_t *fd_ctx = NULL; + call_frame_t *prev_frame = NULL; + struct timespec delta = {0, }; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + priv = this->private; + + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) + goto out; + + delta.tv_sec = priv->post_op_delay_secs; + delta.tv_nsec = 0; + + pthread_mutex_lock (&fd_ctx->delay_lock); + { + prev_frame = fd_ctx->delay_frame; + fd_ctx->delay_frame = NULL; + if (fd_ctx->delay_timer) + gf_timer_call_cancel (this->ctx, fd_ctx->delay_timer); + fd_ctx->delay_timer = NULL; + if (!frame) + goto unlock; + fd_ctx->delay_timer = gf_timer_call_after (this->ctx, delta, + afr_delayed_changelog_wake_up_cbk, + fd); + fd_ctx->delay_frame = frame; + } +unlock: + pthread_mutex_unlock (&fd_ctx->delay_lock); + +out: + if (prev_frame) { + local = prev_frame->local; + local->transaction.resume_stub = stub; + afr_changelog_post_op_safe (prev_frame, this); + } else if (stub) { + call_resume (stub); + } +} + + +void +afr_changelog_post_op (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + if (is_afr_delayed_changelog_post_op_needed (frame, this)) + afr_delayed_changelog_post_op (this, frame, local->fd, NULL); + else + afr_changelog_post_op_safe (frame, this); +} + + + +/* Wake up the sleeping/delayed post-op, and also register + a stub to have it resumed after this transaction + completely finishes. 
+ + The @stub gets saved in @local and gets resumed in + afr_local_cleanup() + */ +void +afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub) +{ + afr_delayed_changelog_post_op (this, NULL, fd, stub); +} + + +void +afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd) +{ + afr_delayed_changelog_post_op (this, NULL, fd, NULL); +} + + + int afr_transaction_resume (call_frame_t *frame, xlator_t *this) { afr_internal_lock_t *int_lock = NULL; @@ -1180,6 +1787,19 @@ afr_transaction_resume (call_frame_t *frame, xlator_t *this) int_lock = &local->internal_lock; priv = this->private; + if (local->transaction.eager_lock_on) { + /* We don't need to retain "local" in the + fd list anymore, writes to all subvols + are finished by now */ + LOCK (&local->fd->lock); + { + list_del_init (&local->transaction.eager_locked); + } + UNLOCK (&local->fd->lock); + } + + afr_restore_lk_owner (frame); + if (__fop_changelog_needed (frame, this)) { afr_changelog_post_op (frame, this); } else { @@ -1200,7 +1820,8 @@ afr_transaction_resume (call_frame_t *frame, xlator_t *this) */ void -afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, int child_index) +afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, + int child_index) { afr_local_t * local = NULL; afr_private_t * priv = NULL; @@ -1209,7 +1830,89 @@ afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, int child_index priv = this->private; __mark_child_dead (local->pending, priv->child_count, - child_index, local->transaction.type); + child_index, local->transaction.type); +} + + + + static gf_boolean_t +afr_locals_overlap (afr_local_t *local1, afr_local_t *local2) +{ + uint64_t start1 = local1->transaction.start; + uint64_t start2 = local2->transaction.start; + uint64_t end1 = 0; + uint64_t end2 = 0; + + if (local1->transaction.len) + end1 = start1 + local1->transaction.len - 1; + else + end1 = ULLONG_MAX; + + if (local2->transaction.len) + end2 = start2 + 
local2->transaction.len - 1; + else + end2 = ULLONG_MAX; + + return ((end1 >= start2) && (end2 >= start1)); +} + +void +afr_transaction_eager_lock_init (afr_local_t *local, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_fd_ctx_t *fdctx = NULL; + afr_local_t *each = NULL; + + priv = this->private; + + if (!local->fd) + return; + + if (local->transaction.type != AFR_DATA_TRANSACTION) + return; + + if (!priv->eager_lock) + return; + + fdctx = afr_fd_ctx_get (local->fd, this); + if (!fdctx) + return; + + if (afr_are_multiple_fds_opened (local->fd->inode, this)) + return; + /* + * Once full file lock is acquired in eager-lock phase, overlapping + * writes do not compete for inode-locks, instead are transferred to the + * next writes. Because of this overlapping writes are not ordered. + * This can cause inconsistencies in replication. + * Example: + * Two overlapping writes w1, w2 are sent in parallel on same fd + * in two threads t1, t2. + * Both threads can execute afr_writev_wind in the following manner. + * t1 winds w1 on brick-0 + * t2 winds w2 on brick-0 + * t2 winds w2 on brick-1 + * t1 winds w1 on brick-1 + * + * This check makes sure the locks are not transferred for + * overlapping writes. 
+ */ + LOCK (&local->fd->lock); + { + list_for_each_entry (each, &fdctx->eager_locked, + transaction.eager_locked) { + if (afr_locals_overlap (each, local)) { + local->transaction.eager_lock_on = _gf_false; + goto unlock; + } + } + + local->transaction.eager_lock_on = _gf_true; + list_add_tail (&local->transaction.eager_locked, + &fdctx->eager_locked); + } +unlock: + UNLOCK (&local->fd->lock); } @@ -1218,20 +1921,43 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type) { afr_local_t * local = NULL; afr_private_t * priv = NULL; + fd_t *fd = NULL; + int ret = -1; local = frame->local; priv = this->private; - afr_transaction_local_init (local, this); - local->transaction.resume = afr_transaction_resume; local->transaction.type = type; + ret = afr_transaction_local_init (local, this); + if (ret < 0) + goto out; + + afr_transaction_eager_lock_init (local, this); + + if (local->fd && local->transaction.eager_lock_on) + afr_set_lk_owner (frame, this, local->fd); + else + afr_set_lk_owner (frame, this, frame->root); + + if (!local->transaction.eager_lock_on && local->loc.inode) { + fd = fd_lookup (local->loc.inode, frame->root->pid); + if (fd == NULL) + fd = fd_lookup_anonymous (local->loc.inode); + + if (fd) { + afr_delayed_changelog_wake_up (this, fd); + fd_unref (fd); + } + } + if (afr_lock_server_count (priv, local->transaction.type) == 0) { afr_internal_lock_finish (frame, this); } else { afr_lock (frame, this); } - - return 0; + ret = 0; +out: + return ret; } diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h index 10f274fec..fa626fd0d 100644 --- a/xlators/cluster/afr/src/afr-transaction.h +++ b/xlators/cluster/afr/src/afr-transaction.h @@ -1,25 +1,21 @@ /* - Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. 
- - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef __TRANSACTION_H__ #define __TRANSACTION_H__ +typedef enum { + LOCAL_FIRST = 1, + LOCAL_LAST = 2 +} afr_xattrop_type_t; + void afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, int child_index); @@ -27,9 +23,29 @@ afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, int afr_lock_server_count (afr_private_t *priv, afr_transaction_type type); +afr_inodelk_t* +afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom); + int32_t afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type); afr_fd_ctx_t * afr_fd_ctx_get (fd_t *fd, xlator_t *this); +int +afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending, + int child, afr_xattrop_type_t op); +void +afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this); + +void +afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd); + +void +__mark_all_success (int32_t *pending[], int child_count, + afr_transaction_type type); +gf_boolean_t +afr_any_fops_failed (afr_local_t 
*local, afr_private_t *priv); + +gf_boolean_t +afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this); #endif /* __TRANSACTION_H__ */ diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index 30da3fc72..c724eb2ae 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #include <libgen.h> @@ -30,6 +21,11 @@ #endif #include "afr-common.c" +#define SHD_INODE_LRU_LIMIT 2048 +#define AFR_EH_HEALED_LIMIT 1024 +#define AFR_EH_HEAL_FAIL_LIMIT 1024 +#define AFR_EH_SPLIT_BRAIN_LIMIT 1024 + struct volume_options options[]; int32_t @@ -37,8 +33,13 @@ notify (xlator_t *this, int32_t event, void *data, ...) 
{ int ret = -1; + va_list ap; + void *data2 = NULL; - ret = afr_notify (this, event, data); + va_start (ap, data); + data2 = va_arg (ap, dict_t*); + va_end (ap); + ret = afr_notify (this, event, data, data2); return ret; } @@ -85,26 +86,31 @@ xlator_subvolume_index (xlator_t *this, xlator_t *subvol) return index; } - -int -xlator_subvolume_count (xlator_t *this) +void +fix_quorum_options (xlator_t *this, afr_private_t *priv, char *qtype) { - int i = 0; - xlator_list_t *list = NULL; - - for (list = this->children; list; list = list->next) - i++; - return i; + if (priv->quorum_count && strcmp(qtype,"fixed")) { + gf_log(this->name,GF_LOG_WARNING, + "quorum-type %s overriding quorum-count %u", + qtype, priv->quorum_count); + } + if (!strcmp(qtype,"none")) { + priv->quorum_count = 0; + } + else if (!strcmp(qtype,"auto")) { + priv->quorum_count = AFR_QUORUM_AUTO; + } } - int reconfigure (xlator_t *this, dict_t *options) { - afr_private_t * priv = NULL; - xlator_t * read_subvol = NULL; - int ret = -1; - int index = -1; + afr_private_t *priv = NULL; + xlator_t *read_subvol = NULL; + int read_subvol_index = -1; + int ret = -1; + int index = -1; + char *qtype = NULL; priv = this->private; @@ -144,6 +150,9 @@ reconfigure (xlator_t *this, dict_t *options) GF_OPTION_RECONF ("read-subvolume", read_subvol, options, xlator, out); + GF_OPTION_RECONF ("read-hash-mode", priv->hash_mode, + options, uint32, out); + if (read_subvol) { index = xlator_subvolume_index (this, read_subvol); if (index == -1) { @@ -154,6 +163,38 @@ reconfigure (xlator_t *this, dict_t *options) priv->read_child = index; } + GF_OPTION_RECONF ("read-subvolume-index",read_subvol_index, options,int32,out); + + if (read_subvol_index >-1) { + index=read_subvol_index; + if (index >= priv->child_count) { + gf_log (this->name, GF_LOG_ERROR, "%d not a subvolume-index", + index); + goto out; + } + priv->read_child = index; + } + + GF_OPTION_RECONF ("eager-lock", priv->eager_lock, options, bool, out); + GF_OPTION_RECONF 
("quorum-type", qtype, options, str, out); + GF_OPTION_RECONF ("quorum-count", priv->quorum_count, options, + uint32, out); + fix_quorum_options(this,priv,qtype); + GF_OPTION_RECONF ("heal-timeout", priv->shd.timeout, options, + int32, out); + + GF_OPTION_RECONF ("post-op-delay-secs", priv->post_op_delay_secs, options, + uint32, out); + + GF_OPTION_RECONF (AFR_SH_READDIR_SIZE_KEY, priv->sh_readdir_size, + options, size, out); + /* Reset this so we re-discover in case the topology changed. */ + GF_OPTION_RECONF ("readdir-failover", priv->readdir_failover, options, + bool, out); + GF_OPTION_RECONF ("ensure-durability", priv->ensure_durability, options, + bool, out); + priv->did_discovery = _gf_false; + ret = 0; out: return ret; @@ -173,15 +214,16 @@ static const char *favorite_child_warning_str = "You have specified subvolume '% int32_t init (xlator_t *this) { - afr_private_t * priv = NULL; - int child_count = 0; - xlator_list_t * trav = NULL; - int i = 0; - int ret = -1; - GF_UNUSED int op_errno = 0; - xlator_t * read_subvol = NULL; - xlator_t * fav_child = NULL; - + afr_private_t *priv = NULL; + int child_count = 0; + xlator_list_t *trav = NULL; + int i = 0; + int ret = -1; + GF_UNUSED int op_errno = 0; + xlator_t *read_subvol = NULL; + int read_subvol_index = -1; + xlator_t *fav_child = NULL; + char *qtype = NULL; if (!this->children) { gf_log (this->name, GF_LOG_ERROR, @@ -195,9 +237,21 @@ init (xlator_t *this) "Volume is dangling."); } - ALLOC_OR_GOTO (this->private, afr_private_t, out); + this->private = GF_CALLOC (1, sizeof (afr_private_t), + gf_afr_mt_afr_private_t); + if (!this->private) + goto out; priv = this->private; + LOCK_INIT (&priv->lock); + LOCK_INIT (&priv->read_child_lock); + //lock recovery is not done in afr + pthread_mutex_init (&priv->mutex, NULL); + INIT_LIST_HEAD (&priv->saved_fds); + + child_count = xlator_subvolume_count (this); + + priv->child_count = child_count; priv->read_child = -1; @@ -210,6 +264,18 @@ init (xlator_t *this) goto out; 
} } + GF_OPTION_INIT ("read-subvolume-index",read_subvol_index,int32,out); + if (read_subvol_index > -1) { + if (read_subvol_index >= priv->child_count) { + gf_log (this->name, GF_LOG_ERROR, "%d not a subvolume-index", + read_subvol_index); + goto out; + } + priv->read_child = read_subvol_index; + } + GF_OPTION_INIT ("choose-local", priv->choose_local, bool, out); + + GF_OPTION_INIT ("read-hash-mode", priv->hash_mode, uint32, out); priv->favorite_child = -1; GF_OPTION_INIT ("favorite-child", fav_child, xlator, out); @@ -244,6 +310,8 @@ init (xlator_t *this) GF_OPTION_INIT ("self-heal-daemon", priv->shd.enabled, bool, out); + GF_OPTION_INIT ("iam-self-heal-daemon", priv->shd.iamshd, bool, out); + GF_OPTION_INIT ("data-change-log", priv->data_change_log, bool, out); GF_OPTION_INIT ("metadata-change-log", priv->metadata_change_log, bool, @@ -260,14 +328,19 @@ init (xlator_t *this) GF_OPTION_INIT ("strict-readdir", priv->strict_readdir, bool, out); - priv->wait_count = 1; - - child_count = xlator_subvolume_count (this); + GF_OPTION_INIT ("eager-lock", priv->eager_lock, bool, out); + GF_OPTION_INIT ("quorum-type", qtype, str, out); + GF_OPTION_INIT ("quorum-count", priv->quorum_count, uint32, out); + GF_OPTION_INIT (AFR_SH_READDIR_SIZE_KEY, priv->sh_readdir_size, size, + out); + fix_quorum_options(this,priv,qtype); - priv->child_count = child_count; + GF_OPTION_INIT ("post-op-delay-secs", priv->post_op_delay_secs, uint32, out); + GF_OPTION_INIT ("readdir-failover", priv->readdir_failover, bool, out); + GF_OPTION_INIT ("ensure-durability", priv->ensure_durability, bool, + out); - LOCK_INIT (&priv->lock); - LOCK_INIT (&priv->read_child_lock); + priv->wait_count = 1; priv->child_up = GF_CALLOC (sizeof (unsigned char), child_count, gf_afr_mt_char); @@ -307,8 +380,6 @@ init (xlator_t *this) AFR_XATTR_PREFIX, trav->xlator->name); if (-1 == ret) { - gf_log (this->name, GF_LOG_ERROR, - "asprintf failed to set pending key"); ret = -ENOMEM; goto out; } @@ -317,6 +388,13 @@ init 
(xlator_t *this) i++; } + ret = gf_asprintf (&priv->sh_domain, AFR_SH_DATA_DOMAIN_FMT, + this->name); + if (-1 == ret) { + ret = -ENOMEM; + goto out; + } + priv->last_event = GF_CALLOC (child_count, sizeof (*priv->last_event), gf_afr_mt_int32_t); if (!priv->last_event) { @@ -324,20 +402,67 @@ init (xlator_t *this) goto out; } - priv->shd.pos = GF_CALLOC (sizeof (*priv->shd.pos), child_count, - gf_afr_mt_afr_brick_pos_t); - if (!priv->shd.pos) { - ret = -ENOMEM; + /* keep more local here as we may need them for self-heal etc */ + this->local_pool = mem_pool_new (afr_local_t, 512); + if (!this->local_pool) { + ret = -1; + gf_log (this->name, GF_LOG_ERROR, + "failed to create local_t's memory pool"); goto out; } - LOCK_INIT (&priv->root_inode_lk); priv->first_lookup = 1; priv->root_inode = NULL; - pthread_mutex_init (&priv->mutex, NULL); - INIT_LIST_HEAD (&priv->saved_fds); + if (!priv->shd.iamshd) { + ret = 0; + goto out; + } + + ret = -ENOMEM; + priv->shd.pos = GF_CALLOC (sizeof (*priv->shd.pos), child_count, + gf_afr_mt_brick_pos_t); + if (!priv->shd.pos) + goto out; + + priv->shd.pending = GF_CALLOC (sizeof (*priv->shd.pending), child_count, + gf_afr_mt_int32_t); + if (!priv->shd.pending) + goto out; + + priv->shd.inprogress = GF_CALLOC (sizeof (*priv->shd.inprogress), + child_count, gf_afr_mt_shd_bool_t); + if (!priv->shd.inprogress) + goto out; + priv->shd.timer = GF_CALLOC (sizeof (*priv->shd.timer), child_count, + gf_afr_mt_shd_timer_t); + if (!priv->shd.timer) + goto out; + + priv->shd.healed = eh_new (AFR_EH_HEALED_LIMIT, _gf_false, + _destroy_shd_event_data); + if (!priv->shd.healed) + goto out; + + priv->shd.heal_failed = eh_new (AFR_EH_HEAL_FAIL_LIMIT, _gf_false, + _destroy_shd_event_data); + if (!priv->shd.heal_failed) + goto out; + priv->shd.split_brain = eh_new (AFR_EH_SPLIT_BRAIN_LIMIT, _gf_false, + _destroy_shd_event_data); + if (!priv->shd.split_brain) + goto out; + + this->itable = inode_table_new (SHD_INODE_LRU_LIMIT, this); + if (!this->itable) + 
goto out; + priv->root_inode = inode_ref (this->itable->root); + GF_OPTION_INIT ("node-uuid", priv->shd.node_uuid, str, out); + GF_OPTION_INIT ("heal-timeout", priv->shd.timeout, int32, out); + ret = afr_initialise_statistics (this); + if (ret) + goto out; ret = 0; out: return ret; @@ -347,6 +472,13 @@ out: int fini (xlator_t *this) { + afr_private_t *priv = NULL; + + priv = this->private; + this->private = NULL; + afr_priv_destroy (priv); + if (this->itable);//I dont see any destroy func + return 0; } @@ -365,6 +497,9 @@ struct xlator_fops fops = { .finodelk = afr_finodelk, .entrylk = afr_entrylk, .fentrylk = afr_fentrylk, + .fallocate = afr_fallocate, + .discard = afr_discard, + .zerofill = afr_zerofill, /* inode read */ .access = afr_access, @@ -372,6 +507,7 @@ struct xlator_fops fops = { .fstat = afr_fstat, .readlink = afr_readlink, .getxattr = afr_getxattr, + .fgetxattr = afr_fgetxattr, .readv = afr_readv, /* inode write */ @@ -379,9 +515,11 @@ struct xlator_fops fops = { .truncate = afr_truncate, .ftruncate = afr_ftruncate, .setxattr = afr_setxattr, + .fsetxattr = afr_fsetxattr, .setattr = afr_setattr, .fsetattr = afr_fsetattr, .removexattr = afr_removexattr, + .fremovexattr = afr_fremovexattr, /* dir read */ .opendir = afr_opendir, @@ -414,33 +552,79 @@ struct xlator_cbks cbks = { struct volume_options options[] = { { .key = {"read-subvolume" }, - .type = GF_OPTION_TYPE_XLATOR + .type = GF_OPTION_TYPE_XLATOR, + .description = "inode-read fops happen only on one of the bricks in " + "replicate. Afr will prefer the one specified using " + "this option if it is not stale. Option value must be " + "one of the xlator names of the children. " + "Ex: <volname>-client-0 till " + "<volname>-client-<number-of-bricks - 1>" + }, + { .key = {"read-subvolume-index" }, + .type = GF_OPTION_TYPE_INT, + .default_value = "-1", + .description = "inode-read fops happen only on one of the bricks in " + "replicate. 
AFR will prefer the one specified using " + "this option if it is not stale. allowed options" + " include -1 till replica-count - 1" + }, + { .key = {"read-hash-mode" }, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .max = 2, + .default_value = "0", + .description = "inode-read fops happen only on one of the bricks in " + "replicate. AFR will prefer the one computed using " + "the method specified using this option" + "0 = first responder, " + "1 = hash by GFID of file (all clients use " + "same subvolume), " + "2 = hash by GFID of file and client PID", + }, + { .key = {"choose-local" }, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "true", + .description = "Choose a local subvolume(i.e. Brick) to read from if " + "read-subvolume is not explicitly set.", }, { .key = {"favorite-child"}, - .type = GF_OPTION_TYPE_XLATOR + .type = GF_OPTION_TYPE_XLATOR, + .description = "If a split-brain happens choose subvol/brick set by " + "this option as source." }, { .key = {"background-self-heal-count"}, .type = GF_OPTION_TYPE_INT, .min = 0, .default_value = "16", + .validate = GF_OPT_VALIDATE_MIN, + .description = "This specifies the number of self-heals that can be " + " performed in background without blocking the fop" }, { .key = {"data-self-heal"}, .type = GF_OPTION_TYPE_STR, - .default_value = "", .value = {"1", "on", "yes", "true", "enable", "0", "off", "no", "false", "disable", "open"}, .default_value = "on", + .description = "Using this option we can enable/disable data " + "self-heal on the file. \"open\" means data " + "self-heal action will only be triggered by file " + "open operations." }, { .key = {"data-self-heal-algorithm"}, .type = GF_OPTION_TYPE_STR, - .default_value = "", .description = "Select between \"full\", \"diff\". The " "\"full\" algorithm copies the entire file from " "source to sink. 
The \"diff\" algorithm copies to " "sink only those blocks whose checksums don't match " - "with those of source.", - .value = { "diff", "full", "" } + "with those of source. If no option is configured " + "the option is chosen dynamically as follows: " + "If the file does not exist on one of the sinks " + "or empty file exists or if the source file size is " + "about the same as page size the entire file will " + "be read and written i.e \"full\" algo, " + "otherwise \"diff\" algo is chosen.", + .value = { "diff", "full"} }, { .key = {"data-self-heal-window-size"}, .type = GF_OPTION_TYPE_INT, @@ -453,26 +637,43 @@ struct volume_options options[] = { { .key = {"metadata-self-heal"}, .type = GF_OPTION_TYPE_BOOL, .default_value = "on", + .description = "Using this option we can enable/disable metadata " + "i.e. Permissions, ownerships, xattrs self-heal on " + "the file/directory." }, { .key = {"entry-self-heal"}, .type = GF_OPTION_TYPE_BOOL, .default_value = "on", + .description = "Using this option we can enable/disable entry " + "self-heal on the directory." 
}, { .key = {"data-change-log"}, .type = GF_OPTION_TYPE_BOOL, .default_value = "on", + .description = "Data fops like write/truncate will not perform " + "pre/post fop changelog operations in afr transaction " + "if this option is disabled" }, { .key = {"metadata-change-log"}, .type = GF_OPTION_TYPE_BOOL, .default_value = "on", + .description = "Metadata fops like setattr/setxattr will not perform " + "pre/post fop changelog operations in afr transaction " + "if this option is disabled" }, { .key = {"entry-change-log"}, .type = GF_OPTION_TYPE_BOOL, .default_value = "on", + .description = "Entry fops like create/unlink will not perform " + "pre/post fop changelog operations in afr transaction " + "if this option is disabled" }, { .key = {"optimistic-change-log"}, .type = GF_OPTION_TYPE_BOOL, .default_value = "on", + .description = "Entry/Metadata fops will not perform " + "pre fop changelog operations in afr transaction " + "if this option is enabled." }, { .key = {"strict-readdir"}, .type = GF_OPTION_TYPE_BOOL, @@ -481,14 +682,112 @@ struct volume_options options[] = { { .key = {"inodelk-trace"}, .type = GF_OPTION_TYPE_BOOL, .default_value = "off", + .description = "Enabling this option logs inode lock/unlocks" }, { .key = {"entrylk-trace"}, .type = GF_OPTION_TYPE_BOOL, .default_value = "off", + .description = "Enabling this option logs entry lock/unlocks" + }, + { .key = {"eager-lock"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = "Lock phase of a transaction has two sub-phases. " + "First is an attempt to acquire locks in parallel by " + "broadcasting non-blocking lock requests. If lock " + "aquistion fails on any server, then the held locks " + "are unlocked and revert to a blocking locked mode " + "sequentially on one server after another. If this " + "option is enabled the initial broadcasting lock " + "request attempt to acquire lock on the entire file. 
" + "If this fails, we revert back to the sequential " + "\"regional\" blocking lock as before. In the case " + "where such an \"eager\" lock is granted in the " + "non-blocking phase, it gives rise to an opportunity " + "for optimization. i.e, if the next write transaction " + "on the same FD arrives before the unlock phase of " + "the first transaction, it \"takes over\" the full " + "file lock. Similarly if yet another data transaction " + "arrives before the unlock phase of the \"optimized\" " + "transaction, that in turn \"takes over\" the lock as " + "well. The actual unlock now happens at the end of " + "the last \"optimzed\" transaction." + }, { .key = {"self-heal-daemon"}, .type = GF_OPTION_TYPE_BOOL, .default_value = "off", + .description = "This option applies to only self-heal-daemon. " + "Index directory crawl and automatic healing of files" + "will not be performed if this option is turned off." + }, + { .key = {"iam-self-heal-daemon"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "This option differentiates if the replicate " + "translator is running as part of self-heal-daemon " + "or not." + }, + { .key = {"quorum-type"}, + .type = GF_OPTION_TYPE_STR, + .value = { "none", "auto", "fixed"}, + .default_value = "none", + .description = "If value is \"fixed\" only allow writes if " + "quorum-count bricks are present. If value is " + "\"auto\" only allow writes if more than half of " + "bricks, or exactly half including the first, are " + "present.", + }, + { .key = {"quorum-count"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = INT_MAX, + .default_value = 0, + .description = "If quorum-type is \"fixed\" only allow writes if " + "this many bricks or present. 
Other quorum types " + "will OVERWRITE this value.", + }, + { .key = {"node-uuid"}, + .type = GF_OPTION_TYPE_STR, + .description = "Local glusterd uuid string, used in starting " + "self-heal-daemon so that it can crawl only on " + "local index directories.", + }, + { .key = {"heal-timeout"}, + .type = GF_OPTION_TYPE_INT, + .min = 60, + .max = INT_MAX, + .default_value = "600", + .description = "time interval for checking the need to self-heal " + "in self-heal-daemon" + }, + { .key = {"post-op-delay-secs"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .max = INT_MAX, + .default_value = "1", + .description = "Time interval induced artificially before " + "post-operation phase of the transaction to " + "enhance overlap of adjacent write operations.", + }, + { .key = {AFR_SH_READDIR_SIZE_KEY}, + .type = GF_OPTION_TYPE_SIZET, + .description = "readdirp size for performing entry self-heal", + .min = 1024, + .max = 131072, + .default_value = "1KB", + }, + { .key = {"readdir-failover"}, + .type = GF_OPTION_TYPE_BOOL, + .description = "readdir(p) will not failover if this option is off", + .default_value = "on", + }, + { .key = {"ensure-durability"}, + .type = GF_OPTION_TYPE_BOOL, + .description = "Afr performs fsyncs for transactions if this " + "option is on to make sure the changelogs/data is " + "written to the disk", + .default_value = "on", }, { .key = {NULL} }, }; diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 45b09f7dd..21064db58 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. 
- - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ @@ -32,9 +23,15 @@ #include "afr-self-heal-algorithm.h" #include "libxlator.h" +#include "timer.h" #define AFR_XATTR_PREFIX "trusted.afr" #define AFR_PATHINFO_HEADER "REPLICATE:" +#define AFR_SH_READDIR_SIZE_KEY "self-heal-readdir-size" +#define AFR_SH_DATA_DOMAIN_FMT "%s:self-heal" + +#define AFR_LOCKEE_COUNT_MAX 3 +#define AFR_DOM_COUNT_MAX 3 struct _pump_private; @@ -43,8 +40,7 @@ typedef int (*afr_expunge_done_cbk_t) (call_frame_t *frame, xlator_t *this, int32_t op_errno); typedef int (*afr_impunge_done_cbk_t) (call_frame_t *frame, xlator_t *this, - int child, int32_t op_error, - int32_t op_errno); + int32_t op_error, int32_t op_errno); typedef int (*afr_post_remove_call_t) (call_frame_t *frame, xlator_t *this); typedef int (*afr_lock_cbk_t) (call_frame_t *frame, xlator_t *this); @@ -58,13 +54,16 @@ typedef enum { } afr_child_pos_t; typedef enum { + SPLIT_BRAIN = 1, + ALL_FOOLS = 2 +} afr_subvol_status_t; + +typedef enum { AFR_INODE_SET_READ_CTX = 1, AFR_INODE_RM_STALE_CHILDREN, AFR_INODE_SET_OPENDIR_DONE, - AFR_INODE_SET_SPLIT_BRAIN, AFR_INODE_GET_READ_CTX, AFR_INODE_GET_OPENDIR_DONE, - AFR_INODE_GET_SPLIT_BRAIN, } afr_inode_op_t; typedef struct afr_inode_params_ { @@ -78,16 +77,41 @@ typedef struct afr_inode_params_ { } u; } afr_inode_params_t; +typedef enum afr_spb_state { + DONT_KNOW, + SPB, + NO_SPB +} afr_spb_state_t; + 
typedef struct afr_inode_ctx_ { uint64_t masks; int32_t *fresh_children;//increasing order of latency + afr_spb_state_t mdata_spb; + afr_spb_state_t data_spb; + uint32_t open_fd_count; } afr_inode_ctx_t; +typedef enum { + NONE, + INDEX, + INDEX_TO_BE_HEALED, + FULL, +} afr_crawl_type_t; + typedef struct afr_self_heald_ { - gf_boolean_t enabled; - gf_boolean_t pending; - gf_boolean_t inprogress; - afr_child_pos_t *pos; + gf_boolean_t enabled; + gf_boolean_t iamshd; + afr_crawl_type_t *pending; + gf_boolean_t *inprogress; + afr_child_pos_t *pos; + gf_timer_t **timer; + eh_t *healed; + eh_t *heal_failed; + eh_t *split_brain; + eh_t **statistics; + void **crawl_events; + char *node_uuid; + int timeout; } afr_self_heald_t; typedef struct _afr_private { @@ -99,7 +123,6 @@ typedef struct _afr_private { xlator_t **children; - gf_lock_t root_inode_lk; int first_lookup; inode_t *root_inode; @@ -122,13 +145,10 @@ typedef struct _afr_private { gf_boolean_t entry_change_log; /* on/off */ int read_child; /* read-subvolume */ + unsigned int hash_mode; /* for when read_child is not set */ int favorite_child; /* subvolume to be preferred in resolving split-brain cases */ - unsigned int data_lock_server_count; - unsigned int metadata_lock_server_count; - unsigned int entry_lock_server_count; - gf_boolean_t inodelk_trace; gf_boolean_t entrylk_trace; @@ -144,15 +164,50 @@ typedef struct _afr_private { pthread_mutex_t mutex; struct list_head saved_fds; /* list of fds on which locks have succeeded */ - gf_boolean_t optimistic_change_log; - gf_boolean_t eager_lock; + gf_boolean_t optimistic_change_log; + gf_boolean_t eager_lock; + uint32_t post_op_delay_secs; + unsigned int quorum_count; char vol_uuid[UUID_SIZE + 1]; int32_t *last_event; afr_self_heald_t shd; + gf_boolean_t choose_local; + gf_boolean_t did_discovery; + gf_boolean_t readdir_failover; + uint64_t sh_readdir_size; + gf_boolean_t ensure_durability; + char *sh_domain; } afr_private_t; +typedef enum { + 
AFR_SELF_HEAL_NOT_ATTEMPTED, + AFR_SELF_HEAL_STARTED, + AFR_SELF_HEAL_FAILED, + AFR_SELF_HEAL_SYNC_BEGIN, +} afr_self_heal_status; + typedef struct { + afr_self_heal_status gfid_or_missing_entry_self_heal; + afr_self_heal_status metadata_self_heal; + afr_self_heal_status data_self_heal; + afr_self_heal_status entry_self_heal; +} afr_sh_status_for_all_type; + +typedef enum { + AFR_SELF_HEAL_ENTRY, + AFR_SELF_HEAL_METADATA, + AFR_SELF_HEAL_DATA, + AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY, + AFR_SELF_HEAL_INVALID = -1, +} afr_self_heal_type; + +typedef enum { + AFR_CHECK_ALL, + AFR_CHECK_SPECIFIC, +} afr_sh_fail_check_type; + +struct afr_self_heal_ { /* External interface: These are variables (some optional) that are set by whoever has triggered self-heal */ @@ -161,6 +216,8 @@ typedef struct { gf_boolean_t do_entry_self_heal; gf_boolean_t do_gfid_self_heal; gf_boolean_t do_missing_entry_self_heal; + gf_boolean_t force_confirm_spb; /* Check for split-brains even when + self-heal is turned off */ gf_boolean_t forced_merge; /* Is this a self-heal triggered to forcibly merge the directories? */ @@ -178,7 +235,7 @@ typedef struct { background, this function will be called as soon as possible. 
*/ int (*unwind) (call_frame_t *frame, xlator_t *this, int32_t op_ret, - int32_t op_errno); + int32_t op_errno, int32_t sh_failed); /* End of external interface members */ @@ -191,7 +248,6 @@ typedef struct { afr_expunge_done_cbk_t expunge_done; afr_impunge_done_cbk_t impunge_done; - int32_t impunge_ret_child; /* array of xattr's, one for each child */ dict_t **xattr; @@ -225,12 +281,13 @@ typedef struct { unsigned char *locked_nodes; int lock_count; - mode_t impunging_entry_mode; const char *linkname; + gf_boolean_t entries_skipped; - int op_failed; - + gf_boolean_t actual_sh_started; + gf_boolean_t sync_done; gf_boolean_t data_lock_held; + gf_boolean_t sh_dom_lock_held; gf_boolean_t eof_reached; fd_t *healing_fd; int file_has_holes; @@ -241,25 +298,32 @@ typedef struct { uint8_t *checksum; afr_post_remove_call_t post_remove_call; - loc_t parent_loc; + char *data_sh_info; + char *metadata_sh_info; + loc_t parent_loc; call_frame_t *orig_frame; call_frame_t *old_loop_frame; gf_boolean_t unwound; afr_sh_algo_private_t *private; + afr_sh_status_for_all_type afr_all_sh_status; + afr_self_heal_type sh_type_in_action; + struct afr_sh_algorithm *algo; afr_lock_cbk_t data_lock_success_handler; afr_lock_cbk_t data_lock_failure_handler; + gf_boolean_t data_lock_block; int (*completion_cbk) (call_frame_t *frame, xlator_t *this); int (*sh_data_algo_start) (call_frame_t *frame, xlator_t *this); int (*algo_completion_cbk) (call_frame_t *frame, xlator_t *this); - afr_lock_cbk_t loop_completion_cbk; int (*algo_abort_cbk) (call_frame_t *frame, xlator_t *this); void (*gfid_sh_success_cbk) (call_frame_t *sh_frame, xlator_t *this); call_frame_t *sh_frame; -} afr_self_heal_t; +}; + +typedef struct afr_self_heal_ afr_self_heal_t; typedef enum { AFR_DATA_TRANSACTION, /* truncate, write, ... 
*/ @@ -321,11 +385,31 @@ afr_index_for_transaction_type (afr_transaction_type type) return -1; /* make gcc happy */ } +typedef struct { + loc_t loc; + char *basename; + unsigned char *locked_nodes; + int locked_count; + +} afr_entry_lockee_t; + +int +afr_entry_lockee_cmp (const void *l1, const void *l2); + +typedef struct { + char *domain; /* Domain on which inodelk is taken */ + struct gf_flock flock; + unsigned char *locked_nodes; + int32_t lock_count; +} afr_inodelk_t; typedef struct { loc_t *lk_loc; - struct gf_flock lk_flock; + int lockee_count; + afr_entry_lockee_t lockee[AFR_LOCKEE_COUNT_MAX]; + + afr_inodelk_t inodelk[AFR_DOM_COUNT_MAX]; const char *lk_basename; const char *lower_basename; const char *higher_basename; @@ -334,23 +418,22 @@ typedef struct { unsigned char *locked_nodes; unsigned char *lower_locked_nodes; - unsigned char *inode_locked_nodes; - unsigned char *entry_locked_nodes; selfheal_lk_type_t selfheal_lk_type; transaction_lk_type_t transaction_lk_type; int32_t lock_count; - int32_t inodelk_lock_count; int32_t entrylk_lock_count; uint64_t lock_number; int32_t lk_call_count; int32_t lk_expected_count; + int32_t lk_attempted_count; int32_t lock_op_ret; int32_t lock_op_errno; afr_lock_cbk_t lock_cbk; + char *domain; /* Domain on which inode/entry lock/unlock in progress.*/ } afr_internal_lock_t; typedef struct _afr_locked_fd { @@ -358,21 +441,29 @@ typedef struct _afr_locked_fd { struct list_head list; } afr_locked_fd_t; +struct afr_reply { + int valid; + int32_t op_ret; + int32_t op_errno; +}; + typedef struct _afr_local { int uid; int gid; unsigned int call_count; unsigned int success_count; unsigned int enoent_count; + uint32_t open_fd_count; + gf_boolean_t update_open_fd_count; - unsigned int govinda_gOvinda; + unsigned int unhealable; unsigned int read_child_index; unsigned char read_child_returned; unsigned int first_up_child; - pid_t saved_pid; + gf_lkowner_t saved_lk_owner; int32_t op_ret; int32_t op_errno; @@ -383,7 +474,6 @@ typedef 
struct _afr_local { loc_t newloc; fd_t *fd; - unsigned char *fd_open_on; glusterfs_fop_t fop; @@ -405,13 +495,25 @@ typedef struct _afr_local { dict_t *dict; int optimistic_change_log; + gf_boolean_t delayed_post_op; + - gf_boolean_t fop_paused; - int (*fop_call_continue) (call_frame_t *frame, xlator_t *this); + /* Is the current writev() going to perform a stable write? + i.e, is fd->flags or @flags writev param have O_SYNC or + O_DSYNC? + */ + gf_boolean_t stable_write; - /* - This struct contains the arguments for the "continuation" - (scheme-like) of fops + /* This write appended to the file. Nnot necessarily O_APPEND, + just means the offset of write was at the end of file. + */ + gf_boolean_t append_write; + + int allow_sh_for_running_transaction; + + + /* This struct contains the arguments for the "continuation" + (scheme-like) of fops */ int op; @@ -422,6 +524,7 @@ typedef struct _afr_local { } statfs; struct { + uint32_t parent_entrylk; uuid_t gfid_req; inode_t *inode; struct iatt buf; @@ -433,11 +536,13 @@ typedef struct _afr_local { int32_t read_child; int32_t *sources; int32_t *success_children; + int32_t **pending_matrix; + gf_boolean_t fresh_lookup; + gf_boolean_t possible_spb; } lookup; struct { int32_t flags; - int32_t wbflags; } open; struct { @@ -470,13 +575,14 @@ typedef struct _afr_local { struct { char *name; int last_index; - long pathinfo_len; + long xattr_len; } getxattr; struct { size_t size; off_t offset; int last_index; + uint32_t flags; } readv; /* dir read */ @@ -494,7 +600,7 @@ typedef struct _afr_local { int32_t op_errno; size_t size; off_t offset; - + dict_t *dict; gf_boolean_t failed; int last_index; } readdir; @@ -503,44 +609,34 @@ typedef struct _afr_local { struct { struct iatt prebuf; struct iatt postbuf; + } inode_wfop; //common structure for all inode-write-fops + struct { int32_t op_ret; struct iovec *vector; struct iobref *iobref; int32_t count; off_t offset; + uint32_t flags; } writev; struct { - struct iatt prebuf; - 
struct iatt postbuf; - } fsync; - - struct { off_t offset; - struct iatt prebuf; - struct iatt postbuf; } truncate; struct { off_t offset; - struct iatt prebuf; - struct iatt postbuf; } ftruncate; struct { struct iatt in_buf; int32_t valid; - struct iatt preop_buf; - struct iatt postop_buf; } setattr; struct { struct iatt in_buf; int32_t valid; - struct iatt preop_buf; - struct iatt postop_buf; } fsetattr; struct { @@ -549,96 +645,85 @@ typedef struct _afr_local { } setxattr; struct { + dict_t *dict; + int32_t flags; + } fsetxattr; + + struct { char *name; } removexattr; + struct { + dict_t *xattr; + } xattrop; + + struct { + dict_t *xattr; + } fxattrop; + /* dir write */ struct { - fd_t *fd; - dict_t *params; - int32_t flags; - mode_t mode; inode_t *inode; struct iatt buf; struct iatt preparent; struct iatt postparent; - struct iatt read_child_buf; + struct iatt prenewparent; + struct iatt postnewparent; + } dir_fop; //common structure for all dir fops + + struct { + fd_t *fd; + dict_t *params; + int32_t flags; + mode_t mode; } create; struct { dev_t dev; mode_t mode; dict_t *params; - inode_t *inode; - struct iatt buf; - struct iatt preparent; - struct iatt postparent; - struct iatt read_child_buf; } mknod; struct { int32_t mode; dict_t *params; - inode_t *inode; - struct iatt buf; - struct iatt read_child_buf; - struct iatt preparent; - struct iatt postparent; } mkdir; struct { - int32_t op_ret; - int32_t op_errno; - struct iatt preparent; - struct iatt postparent; - } unlink; - - struct { - int flags; - int32_t op_ret; - int32_t op_errno; - struct iatt preparent; - struct iatt postparent; + int flags; } rmdir; struct { - struct iatt buf; - struct iatt read_child_buf; - struct iatt preoldparent; - struct iatt prenewparent; - struct iatt postoldparent; - struct iatt postnewparent; - } rename; - - struct { - inode_t *inode; - struct iatt buf; - struct iatt read_child_buf; - struct iatt preparent; - struct iatt postparent; - } link; - - struct { - inode_t *inode; 
dict_t *params; - struct iatt buf; - struct iatt read_child_buf; char *linkpath; - struct iatt preparent; - struct iatt postparent; } symlink; + struct { + int32_t mode; + off_t offset; + size_t len; + } fallocate; + + struct { + off_t offset; + size_t len; + } discard; + struct { - int32_t flags; - dir_entry_t *entries; - int32_t count; - } setdents; + off_t offset; + size_t len; + struct iatt prebuf; + struct iatt postbuf; + } zerofill; + + } cont; struct { off_t start, len; + gf_boolean_t eager_lock_on; int *eager_lock; char *basename; @@ -649,12 +734,18 @@ typedef struct _afr_local { afr_transaction_type type; - int success_count; - int erase_pending; - int failure_count; + /* pre-compute the post piggyback status before + entering POST-OP phase + */ + int *postop_piggybacked; - int last_tried; - int32_t *child_errno; + /* stub to resume on destruction + of the transaction frame */ + call_stub_t *resume_stub; + + struct list_head eager_locked; + + int32_t **txn_changelog;//changelog after pre+post ops unsigned char *pre_op; call_frame_t *main_frame; @@ -673,6 +764,15 @@ typedef struct _afr_local { afr_self_heal_t self_heal; struct marker_str marker; + + /* extra data for fops */ + dict_t *xdata_req; + dict_t *xdata_rsp; + + mode_t umask; + int xflag; + gf_boolean_t do_discovery; + struct afr_reply *replies; } afr_local_t; typedef enum { @@ -682,11 +782,6 @@ typedef enum { } afr_fd_open_status_t; typedef struct { - struct list_head call_list; - call_frame_t *frame; -} afr_fd_paused_call_t; - -typedef struct { unsigned int *pre_op_done; afr_fd_open_status_t *opened_on; /* which subvolumes the fd is open on */ unsigned int *pre_op_piggyback; @@ -695,7 +790,6 @@ typedef struct { unsigned int *lock_acquired; int flags; - int32_t wbflags; uint64_t up_count; /* number of CHILD_UPs this fd has seen */ uint64_t down_count; /* number of CHILD_DOWNs this fd has seen */ @@ -706,20 +800,32 @@ typedef struct { struct list_head entries; /* needed for readdir failover */ 
unsigned char *locked_on; /* which subvolumes locks have been successful */ - struct list_head paused_calls; /* queued calls while fix_open happens */ + + /* used for delayed-post-op optimization */ + pthread_mutex_t delay_lock; + gf_timer_t *delay_timer; + call_frame_t *delay_frame; + int call_child; + + /* set if any write on this fd was a non stable write + (i.e, without O_SYNC or O_DSYNC) + */ + gf_boolean_t witnessed_unstable_write; + + /* list of frames currently in progress */ + struct list_head eager_locked; } afr_fd_ctx_t; /* try alloc and if it fails, goto label */ -#define ALLOC_OR_GOTO(var, type, label) do { \ - var = GF_CALLOC (sizeof (type), 1, \ - gf_afr_mt_##type); \ - if (!var) { \ - gf_log (this->name, GF_LOG_ERROR, \ - "out of memory :("); \ - op_errno = ENOMEM; \ - goto label; \ - } \ +#define AFR_LOCAL_ALLOC_OR_GOTO(var, label) do { \ + var = mem_get0 (THIS->local_pool); \ + if (!var) { \ + gf_log (this->name, GF_LOG_ERROR, \ + "out of memory :("); \ + op_errno = ENOMEM; \ + goto label; \ + } \ } while (0); @@ -740,8 +846,14 @@ int pump_command_reply (call_frame_t *frame, xlator_t *this); int32_t -afr_notify (xlator_t *this, int32_t event, - void *data, ...); +afr_notify (xlator_t *this, int32_t event, void *data, void *data2); + +int +afr_init_entry_lockee (afr_entry_lockee_t *lockee, afr_local_t *local, + loc_t *loc, char *basename, int child_count); + +void +afr_entry_lockee_cleanup (afr_internal_lock_t *int_lock); int afr_attempt_lock_recovery (xlator_t *this, int32_t child_index); @@ -754,7 +866,7 @@ afr_mark_locked_nodes (xlator_t *this, fd_t *fd, unsigned char *locked_nodes); void -afr_set_lk_owner (call_frame_t *frame, xlator_t *this); +afr_set_lk_owner (call_frame_t *frame, xlator_t *this, void *lk_owner); int afr_set_lock_number (call_frame_t *frame, xlator_t *this); @@ -778,10 +890,16 @@ afr_blocking_lock (call_frame_t *frame, xlator_t *this); int afr_internal_lock_finish (call_frame_t *frame, xlator_t *this); +int 
+afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom, + unsigned int child_count); int pump_start (call_frame_t *frame, xlator_t *this); int +__afr_fd_ctx_set (xlator_t *this, fd_t *fd); + +int afr_fd_ctx_set (xlator_t *this, fd_t *fd); int32_t @@ -791,8 +909,8 @@ void afr_inode_set_read_ctx (xlator_t *this, inode_t *inode, int32_t read_child, int32_t *fresh_children); -void -afr_build_parent_loc (loc_t *parent, loc_t *child); +int +afr_build_parent_loc (loc_t *parent, loc_t *child, int32_t *op_errno); unsigned int afr_up_children_count (unsigned char *child_up, unsigned int child_count); @@ -823,11 +941,12 @@ gf_boolean_t afr_is_split_brain (xlator_t *this, inode_t *inode); void -afr_set_split_brain (xlator_t *this, inode_t *inode, gf_boolean_t set); +afr_set_split_brain (xlator_t *this, inode_t *inode, afr_spb_state_t mdata_spb, + afr_spb_state_t data_spb); int afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - fd_t *fd, int32_t wbflags); + fd_t *fd, dict_t *xdata); void afr_set_opendir_done (xlator_t *this, inode_t *inode); @@ -854,22 +973,27 @@ afr_launch_openfd_self_heal (call_frame_t *frame, xlator_t *this, fd_t *fd); frame->local = NULL; \ } \ STACK_UNWIND_STRICT (fop, frame, params); \ - afr_local_cleanup (__local, __this); \ - GF_FREE (__local); \ - } while (0); + if (__local) { \ + afr_local_cleanup (__local, __this); \ + mem_put (__local); \ + } \ + } while (0) -#define AFR_STACK_DESTROY(frame) \ - do { \ - afr_local_t *__local = NULL; \ - xlator_t *__this = NULL; \ - __local = frame->local; \ - __this = frame->this; \ - frame->local = NULL; \ - STACK_DESTROY (frame->root); \ - afr_local_cleanup (__local, __this); \ - GF_FREE (__local); \ +#define AFR_STACK_DESTROY(frame) \ + do { \ + afr_local_t *__local = NULL; \ + xlator_t *__this = NULL; \ + __local = frame->local; \ + __this = frame->this; \ + frame->local = NULL; \ + STACK_DESTROY (frame->root); \ + if (__local) { \ + afr_local_cleanup (__local, 
__this); \ + mem_put (__local); \ + } \ } while (0); +#define AFR_NUM_CHANGE_LOGS 3 /*data + metadata + entry*/ /* allocate and return a string that is the basename of argument */ static inline char * AFR_BASENAME (const char *str) @@ -893,7 +1017,7 @@ int32_t * afr_children_create (int32_t child_count); int -AFR_LOCAL_INIT (afr_local_t *local, afr_private_t *priv); +afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno); int afr_internal_lock_init (afr_internal_lock_t *lk, size_t child_count, @@ -905,12 +1029,13 @@ afr_first_up_child (unsigned char *child_up, size_t child_count); int afr_select_read_child_from_policy (int32_t *fresh_children, int32_t child_count, int32_t prev_read_child, - int32_t config_read_child, int32_t *sources); + int32_t config_read_child, int32_t *sources, + unsigned int hmode, uuid_t gfid); void afr_set_read_ctx_from_policy (xlator_t *this, inode_t *inode, int32_t *fresh_children, int32_t prev_read_child, - int32_t config_read_child); + int32_t config_read_child, uuid_t gfid); int32_t afr_get_call_child (xlator_t *this, unsigned char *child_up, int32_t read_child, @@ -932,8 +1057,9 @@ afr_children_rm_child (int32_t *children, int32_t child, int32_t child_count); void afr_reset_children (int32_t *children, int32_t child_count); -gf_boolean_t -afr_error_more_important (int32_t old_errno, int32_t new_errno); +int32_t +afr_most_important_error(int32_t old_errno, int32_t new_errno, + gf_boolean_t eio); int afr_errno_count (int32_t *children, int *child_errno, unsigned int child_count, int32_t op_errno); @@ -966,7 +1092,7 @@ int32_t afr_resultant_errno_get (int32_t *children, int *child_errno, unsigned int child_count); void -afr_inode_rm_stale_children (xlator_t *this, inode_t *inode, int32_t read_child, +afr_inode_rm_stale_children (xlator_t *this, inode_t *inode, int32_t *stale_children); void afr_launch_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode, @@ -974,12 +1100,13 @@ afr_launch_self_heal 
(call_frame_t *frame, xlator_t *this, inode_t *inode, void (*gfid_sh_success_cbk) (call_frame_t *sh_frame, xlator_t *this), int (*unwind) (call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno)); -int -afr_fix_open (call_frame_t *frame, xlator_t *this, afr_fd_ctx_t *fd_ctx, - int need_open_count, int *need_open); -int -afr_open_fd_fix (call_frame_t *frame, xlator_t *this, gf_boolean_t pause_fop); + int32_t op_ret, int32_t op_errno, + int32_t sh_failed)); +void +afr_fix_open (xlator_t *this, fd_t *fd, size_t need_open_count, int *need_open); + +void +afr_open_fd_fix (fd_t *fd, xlator_t *this); int afr_set_elem_count_get (unsigned char *elems, int child_count); @@ -996,5 +1123,90 @@ void afr_set_low_priority (call_frame_t *frame); int afr_child_fd_ctx_set (xlator_t *this, fd_t *fd, int32_t child, - int flags, int32_t wb_flags); + int flags); + +gf_boolean_t +afr_have_quorum (char *logname, afr_private_t *priv); + +void +afr_matrix_cleanup (int32_t **pending, unsigned int m); + +int32_t** +afr_matrix_create (unsigned int m, unsigned int n); + +gf_boolean_t +afr_is_errno_set (int *child_errno, int child); + +gf_boolean_t +afr_is_errno_unset (int *child_errno, int child); + +gf_boolean_t +afr_is_fd_fixable (fd_t *fd); + +void +afr_prepare_new_entry_pending_matrix (int32_t **pending, + gf_boolean_t (*is_pending) (int *, int), + int *ctx, struct iatt *buf, + unsigned int child_count); +void +afr_xattr_array_destroy (dict_t **xattr, unsigned int child_count); +/* + * Special value indicating we should use the "auto" quorum method instead of + * a fixed value (including zero to turn off quorum enforcement). + */ +#define AFR_QUORUM_AUTO INT_MAX + +/* + * Having this as a macro will make debugging a bit weirder, but does reduce + * the probability of functions handling this check inconsistently. 
+ */ +#define QUORUM_CHECK(_func,_label) do { \ + if (priv->quorum_count && !afr_have_quorum(this->name,priv)) { \ + gf_log(this->name,GF_LOG_WARNING, \ + "failing "#_func" due to lack of quorum"); \ + op_errno = EROFS; \ + goto _label; \ + } \ +} while (0); + + +#define AFR_SBRAIN_MSG "Failed on %s as split-brain is seen. Returning EIO." + +#define AFR_SBRAIN_CHECK_FD(fd, label) do { \ + if (fd->inode && afr_is_split_brain (this, fd->inode)) { \ + op_errno = EIO; \ + gf_log (this->name, GF_LOG_WARNING, \ + AFR_SBRAIN_MSG ,uuid_utoa (fd->inode->gfid)); \ + goto label; \ + } \ +} while (0) + +#define AFR_SBRAIN_CHECK_LOC(loc, label) do { \ + if (loc->inode && afr_is_split_brain (this, loc->inode)) { \ + op_errno = EIO; \ + loc_path (loc, NULL); \ + gf_log (this->name, GF_LOG_WARNING, \ + AFR_SBRAIN_MSG , loc->path); \ + goto label; \ + } \ +} while (0) + +int +afr_fd_report_unstable_write (xlator_t *this, fd_t *fd); + +gf_boolean_t +afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd); + +void +afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub); + +int +afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count); + +void +afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this); + +afr_inode_ctx_t* +afr_inode_ctx_get (inode_t *inode, xlator_t *this); + #endif /* __AFR_H__ */ diff --git a/xlators/cluster/afr/src/pump.c b/xlators/cluster/afr/src/pump.c index 9ad29c69b..a7f72fb30 100644 --- a/xlators/cluster/afr/src/pump.c +++ b/xlators/cluster/afr/src/pump.c @@ -1,25 +1,17 @@ /* - Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. 
- - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #include <unistd.h> #include <sys/time.h> #include <stdlib.h> +#include <fnmatch.h> #ifndef _CONFIG_H #define _CONFIG_H @@ -28,8 +20,16 @@ #include "afr-common.c" #include "defaults.c" +#include "glusterfs.h" static uint64_t pump_pid = 0; +static inline void +pump_fill_loc_info (loc_t *loc, struct iatt *iatt, struct iatt *parent) +{ + afr_update_loc_gfids (loc, iatt, parent); + uuid_copy (loc->inode->gfid, iatt->ia_gfid); +} + static int pump_mark_start_pending (xlator_t *this) { @@ -140,9 +140,7 @@ pump_set_resume_path (xlator_t *this, const char *path) LOCK (&pump_priv->resume_path_lock); { - pump_priv->resume_path = strdup (path); - if (!pump_priv->resume_path) - ret = -1; + strncpy (pump_priv->resume_path, path, strlen (path) + 1); } UNLOCK (&pump_priv->resume_path_lock); @@ -167,7 +165,7 @@ pump_save_path (xlator_t *this, const char *path) GF_ASSERT (priv->root_inode); - afr_build_root_loc (priv->root_inode, &loc); + afr_build_root_loc (this, &loc); dict = dict_new (); dict_ret = dict_set_str (dict, PUMP_PATH, (char *)path); @@ -187,6 +185,7 @@ pump_save_path (xlator_t *this, const char *path) dict_unref (dict); + loc_wipe (&loc); return 0; } @@ -323,7 +322,7 @@ pump_save_file_stats (xlator_t *this, const char *path) } static 
int -gf_pump_traverse_directory (loc_t *loc, uuid_t gfid) +gf_pump_traverse_directory (loc_t *loc) { xlator_t *this = NULL; fd_t *fd = NULL; @@ -337,15 +336,8 @@ gf_pump_traverse_directory (loc_t *loc, uuid_t gfid) dict_t *xattr_rsp = NULL; int ret = 0; gf_boolean_t is_directory_empty = _gf_true; - dict_t *xattr_req = NULL; gf_boolean_t free_entries = _gf_false; - xattr_req = dict_new (); - if (!xattr_req) { - ret = -1; - goto out; - } - INIT_LIST_HEAD (&entries.list); this = THIS; @@ -369,7 +361,7 @@ gf_pump_traverse_directory (loc_t *loc, uuid_t gfid) "pump opendir on %s returned=%d", loc->path, ret); - while (syncop_readdirp (this, fd, 131072, offset, &entries)) { + while (syncop_readdirp (this, fd, 131072, offset, NULL, &entries)) { free_entries = _gf_true; if (list_empty (&entries.list)) { @@ -382,9 +374,16 @@ gf_pump_traverse_directory (loc_t *loc, uuid_t gfid) gf_log (this->name, GF_LOG_DEBUG, "found readdir entry=%s", entry->d_name); + offset = entry->d_off; + if (uuid_is_null (entry->d_stat.ia_gfid)) { + gf_log (this->name, GF_LOG_WARNING, "%s/%s: No " + "gfid present skipping", + loc->path, entry->d_name); + continue; + } loc_wipe (&entry_loc); - ret = afr_build_child_loc (this, &entry_loc, - loc, entry->d_name); + ret = afr_build_child_loc (this, &entry_loc, loc, + entry->d_name); if (ret) goto out; @@ -397,24 +396,16 @@ gf_pump_traverse_directory (loc_t *loc, uuid_t gfid) entry_loc.path, iatt.ia_ino); - afr_generate_gfid_on_empty (gfid); - ret = dict_reset (xattr_req); - if (ret) - goto out; - ret = afr_set_dict_gfid (xattr_req, gfid); - if (ret) - goto out; - ret = syncop_lookup (this, &entry_loc, xattr_req, + ret = syncop_lookup (this, &entry_loc, NULL, &iatt, &xattr_rsp, &parent); - afr_empty_gfid_on_set (gfid, ret, &iatt); if (ret) { gf_log (this->name, GF_LOG_ERROR, "%s: lookup failed", entry_loc.path); continue; } - afr_fill_loc_info (&entry_loc, &iatt, + pump_fill_loc_info (&entry_loc, &iatt, &parent); pump_update_resume_state (this, 
entry_loc.path); @@ -434,11 +425,10 @@ gf_pump_traverse_directory (loc_t *loc, uuid_t gfid) gf_log (this->name, GF_LOG_TRACE, "entering dir=%s", entry->d_name); - gf_pump_traverse_directory (&entry_loc, gfid); + gf_pump_traverse_directory (&entry_loc); } } - } - offset = entry->d_off; + } } gf_dirent_free (&entries); @@ -449,6 +439,10 @@ gf_pump_traverse_directory (loc_t *loc, uuid_t gfid) } + ret = syncop_close (fd); + if (ret < 0) + gf_log (this->name, GF_LOG_DEBUG, "closing the fd failed"); + if (is_directory_empty && IS_ROOT_PATH (loc->path)) { pump_change_state (this, PUMP_STATE_RUNNING); gf_log (this->name, GF_LOG_INFO, "Empty source brick. " @@ -456,14 +450,11 @@ gf_pump_traverse_directory (loc_t *loc, uuid_t gfid) } out: - if (xattr_req) - dict_unref (xattr_req); if (entry_loc.path) loc_wipe (&entry_loc); if (free_entries) gf_dirent_free (&entries); return 0; - } static int @@ -491,7 +482,7 @@ pump_update_resume_path (xlator_t *this) static int32_t pump_xattr_cleaner (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_private_t *priv = NULL; loc_t loc = {0}; @@ -502,7 +493,7 @@ pump_xattr_cleaner (call_frame_t *frame, void *cookie, xlator_t *this, priv = this->private; - afr_build_root_loc (priv->root_inode, &loc); + afr_build_root_loc (this, &loc); ret = syncop_removexattr (priv->children[source], &loc, PUMP_PATH); @@ -518,6 +509,7 @@ pump_xattr_cleaner (call_frame_t *frame, void *cookie, xlator_t *this, "failed with %s", strerror (errno)); } + loc_wipe (&loc); return pump_command_reply (frame, this); } @@ -537,7 +529,7 @@ pump_complete_migration (xlator_t *this) GF_ASSERT (priv->root_inode); - afr_build_root_loc (priv->root_inode, &loc); + afr_build_root_loc (this, &loc); dict = dict_new (); @@ -579,6 +571,7 @@ pump_complete_migration (xlator_t *this) call_resume (pump_priv->cleaner); } + loc_wipe (&loc); return 0; } @@ -626,7 +619,6 @@ pump_task (void *data) 
struct iatt iatt, parent; dict_t *xattr_rsp = NULL; dict_t *xattr_req = NULL; - uuid_t gfid = {0}; int ret = -1; @@ -635,7 +627,7 @@ pump_task (void *data) GF_ASSERT (priv->root_inode); - afr_build_root_loc (priv->root_inode, &loc); + afr_build_root_loc (this, &loc); xattr_req = dict_new (); if (!xattr_req) { gf_log (this->name, GF_LOG_DEBUG, @@ -649,9 +641,8 @@ pump_task (void *data) &iatt, &xattr_rsp, &parent); gf_log (this->name, GF_LOG_TRACE, - "lookup: ino=%"PRId64", path=%s", - loc.ino, - loc.path); + "lookup: path=%s gfid=%s", + loc.path, uuid_utoa (loc.inode->gfid)); ret = pump_check_and_update_status (this); if (ret < 0) { @@ -667,13 +658,14 @@ pump_task (void *data) goto out; } - gf_pump_traverse_directory (&loc, gfid); + gf_pump_traverse_directory (&loc); pump_complete_migration (this); out: if (xattr_req) dict_unref (xattr_req); + loc_wipe (&loc); return 0; } @@ -707,7 +699,7 @@ pump_start (call_frame_t *pump_frame, xlator_t *this) priv = this->private; pump_priv = priv->pump_private; - pump_frame->root->lk_owner = (uint64_t) (unsigned long)pump_frame->root; + afr_set_lk_owner (pump_frame, this, pump_frame->root); pump_pid = (uint64_t) (unsigned long)pump_frame->root; ret = synctask_new (pump_priv->env, pump_task, @@ -721,8 +713,8 @@ pump_start (call_frame_t *pump_frame, xlator_t *this) } gf_log (this->name, GF_LOG_DEBUG, - "setting pump as started lk_owner: %"PRIu64" %"PRIu64, - pump_frame->root->lk_owner, pump_pid); + "setting pump as started lk_owner: %s %"PRIu64, + lkowner_utoa (&pump_frame->root->lk_owner), pump_pid); priv->use_afr_in_pump = 1; out: @@ -756,7 +748,7 @@ pump_cmd_start_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno) + int32_t op_errno, dict_t *xdata) { call_frame_t *prev = NULL; @@ -808,9 +800,9 @@ pump_initiate_sink_connect (call_frame_t *frame, xlator_t *this) GF_ASSERT (priv->root_inode); - afr_build_root_loc (priv->root_inode, &loc); + afr_build_root_loc (this, &loc); - data = 
data_ref (dict_get (local->dict, PUMP_CMD_START)); + data = data_ref (dict_get (local->dict, RB_PUMP_CMD_START)); if (!data) { ret = -1; gf_log (this->name, GF_LOG_ERROR, @@ -849,7 +841,7 @@ pump_initiate_sink_connect (call_frame_t *frame, xlator_t *this) PUMP_SINK_CHILD(this)->fops->setxattr, &loc, dict, - 0); + 0, NULL); ret = 0; @@ -863,6 +855,7 @@ out: if (ret && clnt_cmd) GF_FREE (clnt_cmd); + loc_wipe (&loc); return ret; } @@ -882,7 +875,7 @@ pump_cmd_start_getxattr_cbk (call_frame_t *frame, xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *dict) + dict_t *dict, dict_t *xdata) { afr_local_t *local = NULL; char *path = NULL; @@ -949,6 +942,7 @@ pump_execute_status (call_frame_t *frame, xlator_t *this) uint64_t number_files = 0; char filename[PATH_MAX]; + char summary[PATH_MAX+256]; char *dict_str = NULL; int32_t op_ret = 0; @@ -977,16 +971,19 @@ pump_execute_status (call_frame_t *frame, xlator_t *this) } if (pump_priv->pump_finished) { - snprintf (dict_str, PATH_MAX + 256, "Number of files migrated = %"PRIu64" Migration complete ", - number_files); + snprintf (summary, PATH_MAX+256, + "no_of_files=%"PRIu64, number_files); } else { - snprintf (dict_str, PATH_MAX + 256, "Number of files migrated = %"PRIu64" Current file= %s ", - number_files, filename); + snprintf (summary, PATH_MAX+256, + "no_of_files=%"PRIu64":current_file=%s", + number_files, filename); } + snprintf (dict_str, PATH_MAX+256, "status=%d:%s", + (pump_priv->pump_finished)?1:0, summary); dict = dict_new (); - ret = dict_set_dynstr (dict, PUMP_CMD_STATUS, dict_str); + ret = dict_set_dynstr (dict, RB_PUMP_CMD_STATUS, dict_str); if (ret < 0) { gf_log (this->name, GF_LOG_DEBUG, "dict_set_dynstr returned negative value"); @@ -998,13 +995,12 @@ pump_execute_status (call_frame_t *frame, xlator_t *this) out: - AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict); + AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, NULL); if (dict) dict_unref (dict); - if (dict_str) - GF_FREE 
(dict_str); + GF_FREE (dict_str); return 0; } @@ -1046,14 +1042,14 @@ pump_execute_start (call_frame_t *frame, xlator_t *this) GF_ASSERT (priv->root_inode); - afr_build_root_loc (priv->root_inode, &loc); + afr_build_root_loc (this, &loc); STACK_WIND (frame, pump_cmd_start_getxattr_cbk, PUMP_SOURCE_CHILD(this), PUMP_SOURCE_CHILD(this)->fops->getxattr, &loc, - PUMP_PATH); + PUMP_PATH, NULL); ret = 0; @@ -1063,6 +1059,7 @@ out: pump_command_reply (frame, this); } + loc_wipe (&loc); return 0; } @@ -1070,7 +1067,7 @@ static int pump_cleanup_helper (void *data) { call_frame_t *frame = data; - pump_xattr_cleaner (frame, 0, frame->this, 0, 0); + pump_xattr_cleaner (frame, 0, frame->this, 0, 0, NULL); return 0; } @@ -1152,7 +1149,7 @@ pump_execute_abort (call_frame_t *frame, xlator_t *this) } else { pump_priv->cleaner = fop_setxattr_cbk_stub (frame, pump_xattr_cleaner, - 0, 0); + 0, 0, NULL); } return 0; @@ -1165,7 +1162,7 @@ pump_command_status (xlator_t *this, dict_t *dict) int dict_ret = -1; int ret = _gf_true; - dict_ret = dict_get_str (dict, PUMP_CMD_STATUS, &cmd); + dict_ret = dict_get_str (dict, RB_PUMP_CMD_STATUS, &cmd); if (dict_ret < 0) { gf_log (this->name, GF_LOG_DEBUG, "Not a pump status command"); @@ -1189,7 +1186,7 @@ pump_command_pause (xlator_t *this, dict_t *dict) int dict_ret = -1; int ret = _gf_true; - dict_ret = dict_get_str (dict, PUMP_CMD_PAUSE, &cmd); + dict_ret = dict_get_str (dict, RB_PUMP_CMD_PAUSE, &cmd); if (dict_ret < 0) { gf_log (this->name, GF_LOG_DEBUG, "Not a pump pause command"); @@ -1213,7 +1210,7 @@ pump_command_commit (xlator_t *this, dict_t *dict) int dict_ret = -1; int ret = _gf_true; - dict_ret = dict_get_str (dict, PUMP_CMD_COMMIT, &cmd); + dict_ret = dict_get_str (dict, RB_PUMP_CMD_COMMIT, &cmd); if (dict_ret < 0) { gf_log (this->name, GF_LOG_DEBUG, "Not a pump commit command"); @@ -1237,7 +1234,7 @@ pump_command_abort (xlator_t *this, dict_t *dict) int dict_ret = -1; int ret = _gf_true; - dict_ret = dict_get_str (dict, 
PUMP_CMD_ABORT, &cmd); + dict_ret = dict_get_str (dict, RB_PUMP_CMD_ABORT, &cmd); if (dict_ret < 0) { gf_log (this->name, GF_LOG_DEBUG, "Not a pump abort command"); @@ -1261,7 +1258,7 @@ pump_command_start (xlator_t *this, dict_t *dict) int dict_ret = -1; int ret = _gf_true; - dict_ret = dict_get_str (dict, PUMP_CMD_START, &cmd); + dict_ret = dict_get_str (dict, RB_PUMP_CMD_START, &cmd); if (dict_ret < 0) { gf_log (this->name, GF_LOG_DEBUG, "Not a pump start command"); @@ -1283,7 +1280,7 @@ struct _xattr_key { struct list_head list; }; -static void +static int __gather_xattr_keys (dict_t *dict, char *key, data_t *value, void *data) { @@ -1295,13 +1292,14 @@ __gather_xattr_keys (dict_t *dict, char *key, data_t *value, xkey = GF_CALLOC (1, sizeof (*xkey), gf_afr_mt_xattr_key); if (!xkey) - return; + return -1; xkey->key = key; INIT_LIST_HEAD (&xkey->list); list_add_tail (&xkey->list, list); } + return 0; } static void @@ -1329,7 +1327,7 @@ __filter_xattrs (dict_t *dict) int32_t pump_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *dict) + dict_t *dict, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; @@ -1364,7 +1362,7 @@ pump_getxattr_cbk (call_frame_t *frame, void *cookie, children[next_call_child], children[next_call_child]->fops->getxattr, &local->loc, - local->cont.getxattr.name); + local->cont.getxattr.name, NULL); } out: @@ -1372,7 +1370,7 @@ out: if (op_ret >= 0 && dict) __filter_xattrs (dict); - AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict); + AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, NULL); } return 0; @@ -1380,13 +1378,13 @@ out: int32_t pump_getxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name) + loc_t *loc, const char *name, dict_t *xdata) { afr_private_t * priv = NULL; xlator_t ** children = NULL; int call_child = 0; afr_local_t *local = NULL; - int32_t op_ret = -1; + int32_t ret = -1; int32_t op_errno = 0; uint64_t 
read_child = 0; @@ -1399,15 +1397,21 @@ pump_getxattr (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (priv->children, out); children = priv->children; + if (!priv->use_afr_in_pump) { + STACK_WIND (frame, default_getxattr_cbk, + FIRST_CHILD (this), + (FIRST_CHILD (this))->fops->getxattr, + loc, name, xdata); + return 0; + } - ALLOC_OR_GOTO (local, afr_local_t, out); - frame->local = local; - op_ret = AFR_LOCAL_INIT (local, priv); - if (op_ret < 0) { - op_errno = -op_ret; + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; + + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } if (name) { if (!strncmp (name, AFR_XATTR_PREFIX, @@ -1417,39 +1421,31 @@ pump_getxattr (call_frame_t *frame, xlator_t *this, goto out; } - if (!strcmp (name, PUMP_CMD_STATUS)) { + if (!strcmp (name, RB_PUMP_CMD_STATUS)) { gf_log (this->name, GF_LOG_DEBUG, "Hit pump command - status"); pump_execute_status (frame, this); - op_ret = 0; + ret = 0; goto out; } } - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, default_getxattr_cbk, - FIRST_CHILD (this), - (FIRST_CHILD (this))->fops->getxattr, - loc, name); - return 0; - } - local->fresh_children = GF_CALLOC (priv->child_count, sizeof (*local->fresh_children), gf_afr_mt_int32_t); - if (local->fresh_children) { + if (!local->fresh_children) { + ret = -1; op_errno = ENOMEM; goto out; } read_child = afr_inode_get_read_ctx (this, loc->inode, local->fresh_children); - op_ret = afr_get_call_child (this, local->child_up, read_child, + ret = afr_get_call_child (this, local->child_up, read_child, local->fresh_children, &call_child, &local->cont.getxattr.last_index); - if (op_ret < 0) { - op_errno = -op_ret; - op_ret = -1; + if (ret < 0) { + op_errno = -ret; goto out; } loc_copy (&local->loc, loc); @@ -1459,13 +1455,12 @@ pump_getxattr (call_frame_t *frame, xlator_t *this, STACK_WIND_COOKIE (frame, pump_getxattr_cbk, (void *) (long) call_child, children[call_child], children[call_child]->fops->getxattr, 
- loc, name); + loc, name, xdata); - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, NULL); - } + if (ret < 0) + AFR_STACK_UNWIND (getxattr, frame, -1, op_errno, NULL, NULL); return 0; } @@ -1487,14 +1482,14 @@ afr_setxattr_unwind (call_frame_t *frame, xlator_t *this) if (main_frame) { AFR_STACK_UNWIND (setxattr, main_frame, - local->op_ret, local->op_errno) + local->op_ret, local->op_errno, NULL); } return 0; } static int afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t * local = NULL; afr_private_t * priv = NULL; @@ -1563,7 +1558,7 @@ afr_setxattr_wind (call_frame_t *frame, xlator_t *this) priv->children[i]->fops->setxattr, &local->loc, local->cont.setxattr.dict, - local->cont.setxattr.flags); + local->cont.setxattr.flags, NULL); if (!--call_count) break; @@ -1591,11 +1586,9 @@ pump_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno) + int32_t op_errno, dict_t *xdata) { - STACK_UNWIND (frame, - op_ret, - op_errno); + AFR_STACK_UNWIND (setxattr, frame, op_ret, op_errno, xdata); return 0; } @@ -1613,12 +1606,10 @@ pump_command_reply (call_frame_t *frame, xlator_t *this) gf_log (this->name, GF_LOG_INFO, "Command succeeded"); - dict_unref (local->dict); - AFR_STACK_UNWIND (setxattr, frame, local->op_ret, - local->op_errno); + local->op_errno, NULL); return 0; } @@ -1655,50 +1646,53 @@ pump_parse_command (call_frame_t *frame, xlator_t *this, int pump_setxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *dict, int32_t flags) + loc_t *loc, dict_t *dict, int32_t flags, dict_t *xdata) { afr_private_t * priv = NULL; afr_local_t * local = NULL; call_frame_t *transaction_frame = NULL; - int ret = -1; - - int op_ret = -1; int op_errno = 0; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); VALIDATE_OR_GOTO (this->private, 
out); + GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.pump*", dict, + op_errno, out); + priv = this->private; + if (!priv->use_afr_in_pump) { + STACK_WIND (frame, default_setxattr_cbk, + FIRST_CHILD (this), + (FIRST_CHILD (this))->fops->setxattr, + loc, dict, flags, xdata); + return 0; + } + - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (local, out); - ret = AFR_LOCAL_INIT (local, priv); + ret = afr_local_init (local, priv, &op_errno); if (ret < 0) { - op_errno = -ret; + afr_local_cleanup (local, this); goto out; - } + } ret = pump_parse_command (frame, this, local, dict); if (ret >= 0) { - op_ret = 0; + ret = 0; goto out; } - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, default_setxattr_cbk, - FIRST_CHILD (this), - (FIRST_CHILD (this))->fops->setxattr, - loc, dict, flags); - return 0; - } - transaction_frame = copy_frame (frame); if (!transaction_frame) { gf_log (this->name, GF_LOG_ERROR, "Out of memory."); + op_errno = ENOMEM; + ret = -1; + afr_local_cleanup (local, this); goto out; } @@ -1721,12 +1715,12 @@ pump_setxattr (call_frame_t *frame, xlator_t *this, afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { + if (ret < 0) { if (transaction_frame) AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (setxattr, frame, op_ret, op_errno); + AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); } return 0; @@ -1760,7 +1754,7 @@ static int32_t pump_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, - off_t offset) + off_t offset, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -1770,11 +1764,11 @@ pump_truncate (call_frame_t *frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->truncate, loc, - offset); + offset, xdata); return 0; } - afr_truncate (frame, this, loc, offset); + afr_truncate (frame, this, loc, offset, xdata); return 0; } @@ -1783,7 +1777,7 @@ static int32_t pump_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, - 
off_t offset) + off_t offset, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -1793,11 +1787,11 @@ pump_ftruncate (call_frame_t *frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->ftruncate, fd, - offset); + offset, xdata); return 0; } - afr_ftruncate (frame, this, fd, offset); + afr_ftruncate (frame, this, fd, offset, xdata); return 0; } @@ -1806,7 +1800,7 @@ pump_ftruncate (call_frame_t *frame, int pump_mknod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dev_t rdev, dict_t *parms) + loc_t *loc, mode_t mode, dev_t rdev, mode_t umask, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -1814,10 +1808,10 @@ pump_mknod (call_frame_t *frame, xlator_t *this, STACK_WIND (frame, default_mknod_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->mknod, - loc, mode, rdev, parms); + loc, mode, rdev, umask, xdata); return 0; } - afr_mknod (frame, this, loc, mode, rdev, parms); + afr_mknod (frame, this, loc, mode, rdev, umask, xdata); return 0; } @@ -1826,7 +1820,7 @@ pump_mknod (call_frame_t *frame, xlator_t *this, int pump_mkdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dict_t *params) + loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -1834,10 +1828,10 @@ pump_mkdir (call_frame_t *frame, xlator_t *this, STACK_WIND (frame, default_mkdir_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->mkdir, - loc, mode, params); + loc, mode, umask, xdata); return 0; } - afr_mkdir (frame, this, loc, mode, params); + afr_mkdir (frame, this, loc, mode, umask, xdata); return 0; } @@ -1846,7 +1840,7 @@ pump_mkdir (call_frame_t *frame, xlator_t *this, static int32_t pump_unlink (call_frame_t *frame, xlator_t *this, - loc_t *loc) + loc_t *loc, int xflag, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -1855,10 +1849,10 @@ pump_unlink (call_frame_t *frame, default_unlink_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->unlink, - loc); + 
loc, xflag, xdata); return 0; } - afr_unlink (frame, this, loc); + afr_unlink (frame, this, loc, xflag, xdata); return 0; } @@ -1866,7 +1860,7 @@ pump_unlink (call_frame_t *frame, static int pump_rmdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, int flags) + loc_t *loc, int flags, dict_t *xdata) { afr_private_t *priv = NULL; @@ -1876,11 +1870,11 @@ pump_rmdir (call_frame_t *frame, xlator_t *this, STACK_WIND (frame, default_rmdir_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->rmdir, - loc, flags); + loc, flags, xdata); return 0; } - afr_rmdir (frame, this, loc, flags); + afr_rmdir (frame, this, loc, flags, xdata); return 0; } @@ -1889,7 +1883,7 @@ pump_rmdir (call_frame_t *frame, xlator_t *this, int pump_symlink (call_frame_t *frame, xlator_t *this, - const char *linkpath, loc_t *loc, dict_t *params) + const char *linkpath, loc_t *loc, mode_t umask, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -1897,10 +1891,10 @@ pump_symlink (call_frame_t *frame, xlator_t *this, STACK_WIND (frame, default_symlink_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->symlink, - linkpath, loc, params); + linkpath, loc, umask, xdata); return 0; } - afr_symlink (frame, this, linkpath, loc, params); + afr_symlink (frame, this, linkpath, loc, umask, xdata); return 0; } @@ -1910,7 +1904,7 @@ static int32_t pump_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, - loc_t *newloc) + loc_t *newloc, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -1919,10 +1913,10 @@ pump_rename (call_frame_t *frame, default_rename_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->rename, - oldloc, newloc); + oldloc, newloc, xdata); return 0; } - afr_rename (frame, this, oldloc, newloc); + afr_rename (frame, this, oldloc, newloc, xdata); return 0; } @@ -1932,7 +1926,7 @@ static int32_t pump_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, - loc_t *newloc) + loc_t *newloc, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ 
-1941,10 +1935,10 @@ pump_link (call_frame_t *frame, default_link_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->link, - oldloc, newloc); + oldloc, newloc, xdata); return 0; } - afr_link (frame, this, oldloc, newloc); + afr_link (frame, this, oldloc, newloc, xdata); return 0; } @@ -1953,7 +1947,7 @@ pump_link (call_frame_t *frame, static int32_t pump_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, mode_t mode, - fd_t *fd, dict_t *params) + mode_t umask, fd_t *fd, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -1961,10 +1955,10 @@ pump_create (call_frame_t *frame, xlator_t *this, STACK_WIND (frame, default_create_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->create, - loc, flags, mode, fd, params); + loc, flags, mode, umask, fd, xdata); return 0; } - afr_create (frame, this, loc, flags, mode, fd, params); + afr_create (frame, this, loc, flags, mode, umask, fd, xdata); return 0; } @@ -1974,8 +1968,7 @@ static int32_t pump_open (call_frame_t *frame, xlator_t *this, loc_t *loc, - int32_t flags, fd_t *fd, - int32_t wbflags) + int32_t flags, fd_t *fd, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -1984,10 +1977,10 @@ pump_open (call_frame_t *frame, default_open_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->open, - loc, flags, fd, wbflags); + loc, flags, fd, xdata); return 0; } - afr_open (frame, this, loc, flags, fd, wbflags); + afr_open (frame, this, loc, flags, fd, xdata); return 0; } @@ -1999,8 +1992,8 @@ pump_writev (call_frame_t *frame, fd_t *fd, struct iovec *vector, int32_t count, - off_t off, - struct iobref *iobref) + off_t off, uint32_t flags, + struct iobref *iobref, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -2012,20 +2005,20 @@ pump_writev (call_frame_t *frame, fd, vector, count, - off, - iobref); + off, flags, + iobref, xdata); return 0; } - afr_writev (frame, this, fd, vector, count, off, iobref); - return 0; + afr_writev (frame, this, fd, vector, 
count, off, flags, iobref, xdata); + return 0; } static int32_t pump_flush (call_frame_t *frame, xlator_t *this, - fd_t *fd) + fd_t *fd, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -2034,10 +2027,10 @@ pump_flush (call_frame_t *frame, default_flush_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->flush, - fd); + fd, xdata); return 0; } - afr_flush (frame, this, fd); + afr_flush (frame, this, fd, xdata); return 0; } @@ -2047,7 +2040,7 @@ static int32_t pump_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t flags) + int32_t flags, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -2057,10 +2050,10 @@ pump_fsync (call_frame_t *frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsync, fd, - flags); + flags, xdata); return 0; } - afr_fsync (frame, this, fd, flags); + afr_fsync (frame, this, fd, flags, xdata); return 0; } @@ -2069,7 +2062,7 @@ pump_fsync (call_frame_t *frame, static int32_t pump_opendir (call_frame_t *frame, xlator_t *this, - loc_t *loc, fd_t *fd) + loc_t *loc, fd_t *fd, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -2078,10 +2071,10 @@ pump_opendir (call_frame_t *frame, default_opendir_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->opendir, - loc, fd); + loc, fd, xdata); return 0; } - afr_opendir (frame, this, loc, fd); + afr_opendir (frame, this, loc, fd, xdata); return 0; } @@ -2091,7 +2084,7 @@ static int32_t pump_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t flags) + int32_t flags, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -2101,10 +2094,10 @@ pump_fsyncdir (call_frame_t *frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsyncdir, fd, - flags); + flags, xdata); return 0; } - afr_fsyncdir (frame, this, fd, flags); + afr_fsyncdir (frame, this, fd, flags, xdata); return 0; } @@ -2115,7 +2108,7 @@ pump_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, gf_xattrop_flags_t flags, - dict_t *dict) + dict_t *dict, 
dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -2126,10 +2119,10 @@ pump_xattrop (call_frame_t *frame, FIRST_CHILD(this)->fops->xattrop, loc, flags, - dict); + dict, xdata); return 0; } - afr_xattrop (frame, this, loc, flags, dict); + afr_xattrop (frame, this, loc, flags, dict, xdata); return 0; } @@ -2139,7 +2132,7 @@ pump_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, gf_xattrop_flags_t flags, - dict_t *dict) + dict_t *dict, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -2150,10 +2143,10 @@ pump_fxattrop (call_frame_t *frame, FIRST_CHILD(this)->fops->fxattrop, fd, flags, - dict); + dict, xdata); return 0; } - afr_fxattrop (frame, this, fd, flags, dict); + afr_fxattrop (frame, this, fd, flags, dict, xdata); return 0; } @@ -2163,9 +2156,17 @@ static int32_t pump_removexattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - const char *name) + const char *name, dict_t *xdata) { - afr_private_t *priv = NULL; + afr_private_t *priv = NULL; + int op_errno = -1; + + VALIDATE_OR_GOTO (this, out); + + GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.pump*", + name, op_errno, out); + + op_errno = 0; priv = this->private; if (!priv->use_afr_in_pump) { STACK_WIND (frame, @@ -2173,10 +2174,14 @@ pump_removexattr (call_frame_t *frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->removexattr, loc, - name); + name, xdata); return 0; } - afr_removexattr (frame, this, loc, name); + afr_removexattr (frame, this, loc, name, xdata); + + out: + if (op_errno) + AFR_STACK_UNWIND (removexattr, frame, -1, op_errno, NULL); return 0; } @@ -2188,7 +2193,7 @@ pump_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t off) + off_t off, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -2197,21 +2202,18 @@ pump_readdir (call_frame_t *frame, default_readdir_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->readdir, - fd, size, off); + fd, size, off, xdata); return 0; } - afr_readdir (frame, this, fd, 
size, off); + afr_readdir (frame, this, fd, size, off, xdata); return 0; } static int32_t -pump_readdirp (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - size_t size, - off_t off) +pump_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, + size_t size, off_t off, dict_t *dict) { afr_private_t *priv = NULL; priv = this->private; @@ -2220,10 +2222,10 @@ pump_readdirp (call_frame_t *frame, default_readdirp_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->readdirp, - fd, size, off); + fd, size, off, dict); return 0; } - afr_readdirp (frame, this, fd, size, off); + afr_readdirp (frame, this, fd, size, off, dict); return 0; } @@ -2254,13 +2256,24 @@ pump_release (xlator_t *this, } +static int32_t +pump_forget (xlator_t *this, inode_t *inode) +{ + afr_private_t *priv = NULL; + + priv = this->private; + if (priv->use_afr_in_pump) + afr_forget (this, inode); + + return 0; +} static int32_t pump_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf, - int32_t valid) + int32_t valid, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -2269,10 +2282,10 @@ pump_setattr (call_frame_t *frame, default_setattr_cbk, FIRST_CHILD (this), FIRST_CHILD (this)->fops->setattr, - loc, stbuf, valid); + loc, stbuf, valid, xdata); return 0; } - afr_setattr (frame, this, loc, stbuf, valid); + afr_setattr (frame, this, loc, stbuf, valid, xdata); return 0; } @@ -2283,7 +2296,7 @@ pump_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf, - int32_t valid) + int32_t valid, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -2292,10 +2305,10 @@ pump_fsetattr (call_frame_t *frame, default_fsetattr_cbk, FIRST_CHILD (this), FIRST_CHILD (this)->fops->fsetattr, - fd, stbuf, valid); + fd, stbuf, valid, xdata); return 0; } - afr_fsetattr (frame, this, fd, stbuf, valid); + afr_fsetattr (frame, this, fd, stbuf, valid, xdata); return 0; } @@ -2344,7 +2357,7 @@ notify (xlator_t *this, int32_t event, child_xl = 
(xlator_t *) data; - ret = afr_notify (this, event, data); + ret = afr_notify (this, event, data, NULL); switch (event) { case GF_EVENT_CHILD_DOWN: @@ -2395,9 +2408,26 @@ init (xlator_t *this) "Volume is dangling."); } - ALLOC_OR_GOTO (this->private, afr_private_t, out); + this->private = GF_CALLOC (1, sizeof (afr_private_t), + gf_afr_mt_afr_private_t); + if (!this->private) + goto out; priv = this->private; + LOCK_INIT (&priv->lock); + LOCK_INIT (&priv->read_child_lock); + //lock recovery is not done in afr + pthread_mutex_init (&priv->mutex, NULL); + INIT_LIST_HEAD (&priv->saved_fds); + + child_count = xlator_subvolume_count (this); + if (child_count != 2) { + gf_log (this->name, GF_LOG_ERROR, + "There should be exactly 2 children - one source " + "and one sink"); + return -1; + } + priv->child_count = child_count; priv->read_child = source_child; priv->favorite_child = source_child; @@ -2407,14 +2437,13 @@ init (xlator_t *this) priv->metadata_self_heal = 1; priv->entry_self_heal = 1; - priv->data_self_heal_algorithm = ""; - priv->data_self_heal_window_size = 16; priv->data_change_log = 1; priv->metadata_change_log = 1; priv->entry_change_log = 1; priv->use_afr_in_pump = 1; + priv->sh_readdir_size = 65536; /* Locking options */ @@ -2423,31 +2452,9 @@ init (xlator_t *this) and the sink. 
*/ - priv->data_lock_server_count = 2; - priv->metadata_lock_server_count = 2; - priv->entry_lock_server_count = 2; - priv->strict_readdir = _gf_false; - trav = this->children; - while (trav) { - child_count++; - trav = trav->next; - } - priv->wait_count = 1; - - if (child_count != 2) { - gf_log (this->name, GF_LOG_ERROR, - "There should be exactly 2 children - one source " - "and one sink"); - return -1; - } - priv->child_count = child_count; - - LOCK_INIT (&priv->lock); - LOCK_INIT (&priv->read_child_lock); - priv->child_up = GF_CALLOC (sizeof (unsigned char), child_count, gf_afr_mt_char); if (!priv->child_up) { @@ -2494,6 +2501,12 @@ init (xlator_t *this) i++; } + ret = gf_asprintf (&priv->sh_domain, "%s-self-heal", this->name); + if (-1 == ret) { + op_errno = ENOMEM; + goto out; + } + priv->first_lookup = 1; priv->root_inode = NULL; @@ -2525,7 +2538,7 @@ init (xlator_t *this) goto out; } - pump_priv->env = syncenv_new (0); + pump_priv->env = this->ctx->env; if (!pump_priv->env) { gf_log (this->name, GF_LOG_ERROR, "Could not create new sync-environment"); @@ -2533,10 +2546,16 @@ init (xlator_t *this) goto out; } - priv->pump_private = pump_priv; + /* keep more local here as we may need them for self-heal etc */ + this->local_pool = mem_pool_new (afr_local_t, 128); + if (!this->local_pool) { + ret = -1; + gf_log (this->name, GF_LOG_ERROR, + "failed to create local_t's memory pool"); + goto out; + } - pthread_mutex_init (&priv->mutex, NULL); - INIT_LIST_HEAD (&priv->saved_fds); + priv->pump_private = pump_priv; pump_change_state (this, PUMP_STATE_ABORT); @@ -2548,6 +2567,25 @@ out: int fini (xlator_t *this) { + afr_private_t * priv = NULL; + pump_private_t *pump_priv = NULL; + + priv = this->private; + this->private = NULL; + if (!priv) + goto out; + + pump_priv = priv->pump_private; + if (!pump_priv) + goto afr_priv; + + GF_FREE (pump_priv->resume_path); + LOCK_DESTROY (&pump_priv->resume_path_lock); + LOCK_DESTROY (&pump_priv->pump_state_lock); + GF_FREE 
(pump_priv); +afr_priv: + afr_priv_destroy (priv); +out: return 0; } @@ -2595,6 +2633,7 @@ struct xlator_dumpops dumpops = { struct xlator_cbks cbks = { .release = pump_release, .releasedir = pump_releasedir, + .forget = pump_forget, }; struct volume_options options[] = { diff --git a/xlators/cluster/afr/src/pump.h b/xlators/cluster/afr/src/pump.h index 02eede49c..bc4c31a78 100644 --- a/xlators/cluster/afr/src/pump.h +++ b/xlators/cluster/afr/src/pump.h @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ #ifndef __PUMP_H__ @@ -26,12 +17,6 @@ #define CLIENT_CMD_CONNECT "trusted.glusterfs.client-connect" #define CLIENT_CMD_DISCONNECT "trusted.glusterfs.client-disconnect" -#define PUMP_CMD_START "trusted.glusterfs.pump.start" -#define PUMP_CMD_COMMIT "trusted.glusterfs.pump.commit" -#define PUMP_CMD_ABORT "trusted.glusterfs.pump.abort" -#define PUMP_CMD_PAUSE "trusted.glusterfs.pump.pause" -#define PUMP_CMD_STATUS "trusted.glusterfs.pump.status" - #define PUMP_SOURCE_COMPLETE "trusted.glusterfs.pump-source-complete" #define PUMP_SINK_COMPLETE "trusted.glusterfs.pump-sink-complete" @@ -50,7 +35,7 @@ typedef enum { typedef struct _pump_private { struct syncenv *env; /* The env pointer to the pump synctask */ - const char *resume_path; /* path to resume from the last pause */ + char *resume_path; /* path to resume from the last pause */ gf_lock_t resume_path_lock; /* Synchronize resume_path changes */ gf_lock_t pump_state_lock; /* Synchronize pump_state changes */ pump_state_t pump_state; /* State of pump */ diff --git a/xlators/cluster/dht/src/Makefile.am b/xlators/cluster/dht/src/Makefile.am index e35058d65..174bea841 100644 --- a/xlators/cluster/dht/src/Makefile.am +++ b/xlators/cluster/dht/src/Makefile.am @@ -4,7 +4,7 @@ xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster dht_common_source = dht-layout.c dht-helper.c dht-linkfile.c dht-rebalance.c \ dht-selfheal.c dht-rename.c dht-hashfn.c dht-diskusage.c \ - dht-common.c dht-inode-write.c dht-inode-read.c \ + dht-common.c dht-inode-write.c dht-inode-read.c dht-shared.c \ $(top_builddir)/xlators/lib/src/libxlator.c dht_la_SOURCES = $(dht_common_source) dht.c @@ -12,22 +12,23 @@ dht_la_SOURCES = $(dht_common_source) dht.c nufa_la_SOURCES = $(dht_common_source) nufa.c switch_la_SOURCES = $(dht_common_source) switch.c -dht_la_LDFLAGS = -module -avoidversion +dht_la_LDFLAGS = -module -avoid-version dht_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -nufa_la_LDFLAGS = -module -avoidversion 
+nufa_la_LDFLAGS = -module -avoid-version nufa_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -switch_la_LDFLAGS = -module -avoidversion +switch_la_LDFLAGS = -module -avoid-version switch_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la noinst_HEADERS = dht-common.h dht-mem-types.h \ $(top_builddir)/xlators/lib/src/libxlator.h -AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ - -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) \ +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ -I$(top_srcdir)/xlators/lib/src +AM_CFLAGS = -Wall $(GF_CFLAGS) + CLEANFILES = uninstall-local: diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c index faf575df9..8f61339e6 100644 --- a/xlators/cluster/dht/src/dht-common.c +++ b/xlators/cluster/dht/src/dht-common.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2009-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ @@ -31,17 +22,18 @@ #include "dht-common.h" #include "defaults.h" #include "byte-order.h" +#include "glusterfs-acl.h" #include <sys/time.h> #include <libgen.h> -void +int dht_aggregate (dict_t *this, char *key, data_t *value, void *data) { dict_t *dst = NULL; int64_t *ptr = 0, *size = NULL; int32_t ret = -1; - data_pair_t *data_pair = NULL; + data_t *dict_data = NULL; dst = data; @@ -53,32 +45,37 @@ dht_aggregate (dict_t *this, char *key, data_t *value, void *data) if (size == NULL) { gf_log ("dht", GF_LOG_WARNING, "memory allocation failed"); - return; + return -1; } ret = dict_set_bin (dst, key, size, sizeof (int64_t)); if (ret < 0) { gf_log ("dht", GF_LOG_WARNING, "dht aggregate dict set failed"); GF_FREE (size); - return; + return -1; } } ptr = data_to_bin (value); if (ptr == NULL) { gf_log ("dht", GF_LOG_WARNING, "data to bin failed"); - return; + return -1; } *size = hton64 (ntoh64 (*size) + ntoh64 (*ptr)); + + } else if (fnmatch (GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0) { + ret = gf_get_min_stime (THIS, dst, key, value); + if (ret < 0) + return ret; } else { /* compare user xattrs only */ if (!strncmp (key, "user.", strlen ("user."))) { - ret = dict_lookup (dst, key, &data_pair); - if (!ret && data) { - ret = is_data_equal (data_pair->value, value); + ret = dict_lookup (dst, key, &dict_data); + if (!ret && dict_data && value) { + ret = is_data_equal (dict_data, value); if (!ret) - gf_log ("dht", GF_LOG_WARNING, + gf_log ("dht", GF_LOG_DEBUG, "xattr mismatch for %s", key); } } @@ -87,7 +84,7 @@ dht_aggregate (dict_t *this, char *key, data_t *value, void *data) gf_log ("dht", GF_LOG_WARNING, "xattr dict set failed"); } - return; + return 0; } @@ -114,7 +111,7 @@ out: int dht_lookup_selfheal_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno) + int op_ret, int op_errno, dict_t *xdata) { dht_local_t *local = NULL; dht_layout_t *layout = NULL; @@ -134,7 +131,10 @@ dht_lookup_selfheal_cbk (call_frame_t *frame, void 
*cookie, ret = dht_layout_set (this, local->inode, layout); } - WIPE (&local->postparent); + if (local->loc.parent) { + dht_inode_ctx_time_update (local->loc.parent, this, + &local->postparent, 1); + } DHT_STRIP_PHASE1_FLAGS (&local->stbuf); @@ -147,6 +147,256 @@ out: int +dht_discover_complete (xlator_t *this, call_frame_t *discover_frame) +{ + dht_local_t *local = NULL; + call_frame_t *main_frame = NULL; + int op_errno = 0; + int ret = -1; + dht_layout_t *layout = NULL; + dht_conf_t *conf = NULL; + + local = discover_frame->local; + layout = local->layout; + conf = this->private; + + LOCK(&discover_frame->lock); + { + main_frame = local->main_frame; + local->main_frame = NULL; + } + UNLOCK(&discover_frame->lock); + + if (!main_frame) + return 0; + + if (local->file_count && local->dir_count) { + gf_log (this->name, GF_LOG_ERROR, + "path %s exists as a file on one subvolume " + "and directory on another. " + "Please fix it manually", + local->loc.path); + op_errno = EIO; + goto out; + } + + if (local->cached_subvol) { + ret = dht_layout_preset (this, local->cached_subvol, + local->inode); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "failed to set layout for subvolume %s", + local->cached_subvol ? local->cached_subvol->name : "<nil>"); + op_errno = EINVAL; + goto out; + } + } else { + ret = dht_layout_normalize (this, &local->loc, layout); + if ((ret < 0) || ((ret > 0) && (local->op_ret != 0))) { + /* either the layout is incorrect or the directory is + * not found even in one subvolume. + */ + gf_log (this->name, GF_LOG_DEBUG, + "normalizing failed on %s " + "(overlaps/holes present: %s, " + "ENOENT errors: %d)", local->loc.path, + (ret < 0) ? "yes" : "no", (ret > 0) ? 
ret : 0); + if ((ret > 0) && (ret == conf->subvolume_cnt)) { + op_errno = ESTALE; + goto out; + } + } + + if (local->inode) + dht_layout_set (this, local->inode, layout); + } + + DHT_STACK_UNWIND (lookup, main_frame, local->op_ret, local->op_errno, + local->inode, &local->stbuf, local->xattr, + &local->postparent); + return 0; +out: + DHT_STACK_UNWIND (lookup, main_frame, -1, op_errno, NULL, NULL, NULL, + NULL); + + return ret; +} + + +int +dht_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + inode_t *inode, struct iatt *stbuf, dict_t *xattr, + struct iatt *postparent) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + call_frame_t *prev = NULL; + dht_layout_t *layout = NULL; + int ret = -1; + int is_dir = 0; + int is_linkfile = 0; + int attempt_unwind = 0; + dht_conf_t *conf = 0; + + GF_VALIDATE_OR_GOTO ("dht", frame, out); + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO ("dht", frame->local, out); + GF_VALIDATE_OR_GOTO ("dht", this->private, out); + GF_VALIDATE_OR_GOTO ("dht", cookie, out); + + local = frame->local; + prev = cookie; + conf = this->private; + + layout = local->layout; + + /* Check if the gfid is different for file from other node */ + if (!op_ret && uuid_compare (local->gfid, stbuf->ia_gfid)) { + gf_log (this->name, GF_LOG_WARNING, + "%s: gfid different on %s", + local->loc.path, prev->this->name); + } + + + LOCK (&frame->lock); + { + /* TODO: assert equal mode on stbuf->st_mode and + local->stbuf->st_mode + + else mkdir/chmod/chown and fix + */ + ret = dht_layout_merge (this, layout, prev->this, + op_ret, op_errno, xattr); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to merge layouts", local->loc.path); + + if (op_ret == -1) { + local->op_errno = op_errno; + gf_log (this->name, GF_LOG_DEBUG, + "lookup of %s on %s returned error (%s)", + local->loc.path, prev->this->name, + strerror (op_errno)); + + goto unlock; + } + + is_linkfile = check_is_linkfile (inode, 
stbuf, xattr, + conf->link_xattr_name); + is_dir = check_is_dir (inode, stbuf, xattr); + + if (is_dir) { + local->dir_count ++; + } else { + local->file_count ++; + + if (!is_linkfile) { + /* real file */ + local->cached_subvol = prev->this; + attempt_unwind = 1; + } else { + goto unlock; + } + } + + local->op_ret = 0; + + if (local->xattr == NULL) { + local->xattr = dict_ref (xattr); + } else { + dht_aggregate_xattr (local->xattr, xattr); + } + + if (local->inode == NULL) + local->inode = inode_ref (inode); + + dht_iatt_merge (this, &local->stbuf, stbuf, prev->this); + dht_iatt_merge (this, &local->postparent, postparent, + prev->this); + } +unlock: + UNLOCK (&frame->lock); +out: + this_call_cnt = dht_frame_return (frame); + + if (is_last_call (this_call_cnt) || attempt_unwind) { + dht_discover_complete (this, frame); + } + + if (is_last_call (this_call_cnt)) + DHT_STACK_DESTROY (frame); + + return 0; +} + + +int +dht_discover (call_frame_t *frame, xlator_t *this, loc_t *loc) +{ + int ret; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int call_cnt = 0; + int op_errno = EINVAL; + int i = 0; + call_frame_t *discover_frame = NULL; + + conf = this->private; + local = frame->local; + + ret = dict_set_uint32 (local->xattr_req, conf->xattr_name, 4 * 4); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to set '%s' key", + loc->path, conf->xattr_name); + + ret = dict_set_uint32 (local->xattr_req, conf->link_xattr_name, 256); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to set '%s' key", + loc->path, conf->link_xattr_name); + + call_cnt = conf->subvolume_cnt; + local->call_cnt = call_cnt; + + local->layout = dht_layout_new (this, conf->subvolume_cnt); + + if (!local->layout) { + op_errno = ENOMEM; + goto err; + } + + uuid_copy (local->gfid, loc->gfid); + + discover_frame = copy_frame (frame); + if (!discover_frame) { + op_errno = ENOMEM; + goto err; + } + + discover_frame->local = local; + frame->local = NULL; + local->main_frame 
= frame; + + for (i = 0; i < call_cnt; i++) { + STACK_WIND (discover_frame, dht_discover_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->lookup, + &local->loc, local->xattr_req); + } + + return 0; + +err: + DHT_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, + NULL); + + return 0; +} + + +int dht_lookup_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, dict_t *xattr, @@ -191,7 +441,7 @@ dht_lookup_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, op_ret, op_errno, xattr); if (op_ret == -1) { - local->op_errno = ENOENT; + local->op_errno = op_errno; gf_log (this->name, GF_LOG_DEBUG, "lookup of %s on %s returned error (%s)", local->loc.path, prev->this->name, @@ -250,6 +500,11 @@ unlock: dht_layout_set (this, local->inode, layout); } + if (local->loc.parent) { + dht_inode_ctx_time_update (local->loc.parent, this, + &local->postparent, 1); + } + DHT_STRIP_PHASE1_FLAGS (&local->stbuf); DHT_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, local->inode, &local->stbuf, local->xattr, @@ -260,6 +515,7 @@ unlock: selfheal: FRAME_SU_DO (frame, dht_local_t); + uuid_copy (local->loc.gfid, local->gfid); ret = dht_selfheal_directory (frame, dht_lookup_selfheal_cbk, &local->loc, layout); out: @@ -280,6 +536,8 @@ dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int ret = -1; int is_dir = 0; int is_linkfile = 0; + call_frame_t *copy = NULL; + dht_local_t *copy_local = NULL; GF_VALIDATE_OR_GOTO ("dht", frame, err); GF_VALIDATE_OR_GOTO ("dht", this, err); @@ -306,7 +564,7 @@ dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, strerror (op_errno)); } if (op_errno == ESTALE) { - /* propogate the ESTALE to parent. + /* propagate the ESTALE to parent. * setting local->return_estale would send * ESTALE to parent. 
*/ local->return_estale = 1; @@ -338,7 +596,8 @@ dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, layout = local->layout; is_dir = check_is_dir (inode, stbuf, xattr); - is_linkfile = check_is_linkfile (inode, stbuf, xattr); + is_linkfile = check_is_linkfile (inode, stbuf, xattr, + conf->link_xattr_name); if (is_linkfile) { gf_log (this->name, GF_LOG_INFO, @@ -350,6 +609,23 @@ dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } if (is_dir) { + ret = dht_dir_has_layout (xattr, conf->xattr_name); + if (ret >= 0) { + if (is_greater_time(local->stbuf.ia_ctime, + local->stbuf.ia_ctime_nsec, + stbuf->ia_ctime, + stbuf->ia_ctime_nsec)) { + local->prebuf.ia_gid = stbuf->ia_gid; + local->prebuf.ia_uid = stbuf->ia_uid; + } + } + if (local->stbuf.ia_type != IA_INVAL) + { + if ((local->stbuf.ia_gid != stbuf->ia_gid) || + (local->stbuf.ia_uid != stbuf->ia_uid)) { + local->need_selfheal = 1; + } + } ret = dht_layout_dir_mismatch (this, layout, prev->this, &local->loc, xattr); @@ -388,7 +664,28 @@ out: && (conf && conf->unhashed_sticky_bit)) { local->stbuf.ia_prot.sticky = 1; } - if (local->layout_mismatch) { + if (local->need_selfheal) { + local->need_selfheal = 0; + uuid_copy (local->gfid, local->stbuf.ia_gfid); + local->stbuf.ia_gid = local->prebuf.ia_gid; + local->stbuf.ia_uid = local->prebuf.ia_uid; + copy = create_frame (this, this->ctx->pool); + if (copy) { + copy_local = dht_local_init (copy, &local->loc, + NULL, 0); + if (!copy_local) + goto cont; + copy_local->stbuf = local->stbuf; + copy->local = copy_local; + FRAME_SU_DO (copy, dht_local_t); + ret = synctask_new (this->ctx->env, + dht_dir_attr_heal, + dht_dir_attr_heal_done, + copy, copy); + } + } +cont: + if (local->layout_mismatch) { /* Found layout mismatch in the directory, need to fix this in the inode context */ dht_layout_unref (this, local->layout); @@ -414,7 +711,10 @@ out: local->op_errno = ESTALE; } - WIPE (&local->postparent); + if (local->loc.parent) { + 
dht_inode_ctx_time_update (local->loc.parent, this, + &local->postparent, 1); + } DHT_STRIP_PHASE1_FLAGS (&local->stbuf); DHT_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, @@ -432,7 +732,8 @@ dht_lookup_linkfile_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent) + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { dht_local_t *local = NULL; xlator_t *cached_subvol = NULL; @@ -465,8 +766,14 @@ dht_lookup_linkfile_create_cbk (call_frame_t *frame, void *cookie, local->stbuf.ia_prot.sticky = 1; } + if (local->loc.parent) { + dht_inode_ctx_time_update (local->loc.parent, this, + postparent, 1); + } + unwind: - WIPE (&local->postparent); + if (local->linked == _gf_true) + dht_linkfile_attr_heal (frame, this); DHT_STRIP_PHASE1_FLAGS (&local->stbuf); DHT_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, @@ -539,7 +846,10 @@ dht_lookup_everywhere_done (call_frame_t *frame, xlator_t *this) "<nil>")); } - WIPE (&local->postparent); + if (local->loc.parent) { + dht_inode_ctx_time_update (local->loc.parent, this, + &local->postparent, 1); + } DHT_STRIP_PHASE1_FLAGS (&local->stbuf); DHT_STACK_UNWIND (lookup, frame, local->op_ret, @@ -569,7 +879,10 @@ dht_lookup_everywhere_done (call_frame_t *frame, xlator_t *this) local->op_errno = EINVAL; } - WIPE (&local->postparent); + if (local->loc.parent) { + dht_inode_ctx_time_update (local->loc.parent, this, + &local->postparent, 1); + } DHT_STRIP_PHASE1_FLAGS (&local->stbuf); DHT_STACK_UNWIND (lookup, frame, local->op_ret, @@ -585,7 +898,7 @@ dht_lookup_everywhere_done (call_frame_t *frame, xlator_t *this) hashed_subvol->name); ret = dht_linkfile_create (frame, - dht_lookup_linkfile_create_cbk, + dht_lookup_linkfile_create_cbk, this, cached_subvol, hashed_subvol, &local->loc); return ret; @@ -595,7 +908,8 @@ dht_lookup_everywhere_done (call_frame_t *frame, 
xlator_t *this) int dht_lookup_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, - struct iatt *preparent, struct iatt *postparent) + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { int this_call_cnt = 0; @@ -622,8 +936,9 @@ dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this, xlator_t *subvol = NULL; loc_t *loc = NULL; xlator_t *link_subvol = NULL; - int ret = -1; - int32_t fd_count = 0; + int ret = -1; + int32_t fd_count = 0; + dht_conf_t *conf = NULL; GF_VALIDATE_OR_GOTO ("dht", frame, out); GF_VALIDATE_OR_GOTO ("dht", this, out); @@ -633,6 +948,7 @@ dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local = frame->local; loc = &local->loc; + conf = this->private; prev = cookie; subvol = prev->this; @@ -654,7 +970,8 @@ dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this, loc->path, prev->this->name); } - is_linkfile = check_is_linkfile (inode, buf, xattr); + is_linkfile = check_is_linkfile (inode, buf, xattr, + conf->link_xattr_name); is_dir = check_is_dir (inode, buf, xattr); if (is_linkfile) { @@ -714,7 +1031,7 @@ unlock: "deleting stale linkfile %s on %s", loc->path, subvol->name); STACK_WIND (frame, dht_lookup_unlink_cbk, - subvol, subvol->fops->unlink, loc); + subvol, subvol->fops->unlink, loc, 0, NULL); return 0; } } @@ -796,7 +1113,16 @@ dht_lookup_linkfile_cbk (call_frame_t *frame, void *cookie, gf_log (this->name, GF_LOG_INFO, "lookup of %s on %s (following linkfile) failed (%s)", local->loc.path, subvol->name, strerror (op_errno)); - goto err; + + /* If cached subvol returned ENOTCONN, do not do + lookup_everywhere. We need to make sure linkfile does not get + removed, which can take away the namespace, and subvol is + anyways down. 
*/ + + if (op_errno != ENOTCONN) + goto err; + else + goto unwind; } if (check_is_dir (inode, stbuf, xattr)) { @@ -806,7 +1132,7 @@ dht_lookup_linkfile_cbk (call_frame_t *frame, void *cookie, goto err; } - if (check_is_linkfile (inode, stbuf, xattr)) { + if (check_is_linkfile (inode, stbuf, xattr, conf->link_xattr_name)) { gf_log (this->name, GF_LOG_INFO, "lookup of %s on %s (following linkfile) reached link", local->loc.path, subvol->name); @@ -834,9 +1160,12 @@ dht_lookup_linkfile_cbk (call_frame_t *frame, void *cookie, op_errno = EINVAL; } -unwind: - WIPE (postparent); + if (local->loc.parent) { + dht_inode_ctx_time_update (local->loc.parent, this, + postparent, 1); + } +unwind: DHT_STRIP_PHASE1_FLAGS (stbuf); DHT_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, stbuf, xattr, postparent); @@ -918,7 +1247,6 @@ dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, loc_t *loc = NULL; call_frame_t *prev = NULL; int ret = 0; - uint64_t tmp_layout = 0; dht_layout_t *parent_layout = NULL; GF_VALIDATE_OR_GOTO ("dht", frame, err); @@ -949,8 +1277,10 @@ dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } if ((conf->search_unhashed == GF_DHT_LOOKUP_UNHASHED_AUTO) && (loc->parent)) { - ret = inode_ctx_get (loc->parent, this, &tmp_layout); - parent_layout = (dht_layout_t *)(long)tmp_layout; + ret = dht_inode_ctx_layout_get (loc->parent, this, + &parent_layout); + if (ret || !parent_layout) + goto out; if (parent_layout->search_unhashed) { local->op_errno = ENOENT; dht_lookup_everywhere (frame, this, loc); @@ -979,7 +1309,8 @@ dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto out; } - is_linkfile = check_is_linkfile (inode, stbuf, xattr); + is_linkfile = check_is_linkfile (inode, stbuf, xattr, + conf->link_xattr_name); if (!is_linkfile) { /* non-directory and not a linkfile */ @@ -1019,7 +1350,10 @@ out: * from each of the subvolume. See dht_iatt_merge for reference. 
*/ - WIPE (postparent); + if (!op_ret && local->loc.parent) { + dht_inode_ctx_time_update (local->loc.parent, this, + postparent, 1); + } DHT_STRIP_PHASE1_FLAGS (stbuf); DHT_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, stbuf, xattr, @@ -1028,6 +1362,39 @@ err: return 0; } +/* For directories, check if acl xattrs have been requested (by the acl xlator), + * if not, request for them. These xattrs are needed for dht dir self-heal to + * perform proper self-healing of dirs + */ +void +dht_check_and_set_acl_xattr_req (inode_t *inode, dict_t *xattr_req) +{ + int ret = 0; + + GF_ASSERT (inode); + GF_ASSERT (xattr_req); + + if (inode->ia_type != IA_IFDIR) + return; + + if (!dict_get (xattr_req, POSIX_ACL_ACCESS_XATTR)) { + ret = dict_set_int8 (xattr_req, POSIX_ACL_ACCESS_XATTR, 0); + if (ret) + gf_log (THIS->name, GF_LOG_WARNING, + "failed to set key %s", + POSIX_ACL_ACCESS_XATTR); + } + + if (!dict_get (xattr_req, POSIX_ACL_DEFAULT_XATTR)) { + ret = dict_set_int8 (xattr_req, POSIX_ACL_DEFAULT_XATTR, 0); + if (ret) + gf_log (THIS->name, GF_LOG_WARNING, + "failed to set key %s", + POSIX_ACL_DEFAULT_XATTR); + } + + return; +} int dht_lookup (call_frame_t *frame, xlator_t *this, @@ -1048,7 +1415,6 @@ dht_lookup (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (loc, err); VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); conf = this->private; if (!conf) @@ -1085,6 +1451,12 @@ dht_lookup (call_frame_t *frame, xlator_t *this, local->xattr_req = dict_new (); } + if (uuid_is_null (loc->pargfid) && !uuid_is_null (loc->gfid) && + !__is_root_gfid (loc->inode->gfid)) { + local->cached_subvol = NULL; + dht_discover (frame, this, loc); + return 0; + } if (!hashed_subvol) hashed_subvol = dht_subvol_get_hashed (this, loc); @@ -1117,7 +1489,7 @@ dht_lookup (call_frame_t *frame, xlator_t *this, * revalidates directly go to the cached-subvolume. 
*/ ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht", 4 * 4); + conf->xattr_name, 4 * 4); if (IA_ISDIR (local->inode->ia_type)) { local->call_cnt = call_cnt = conf->subvolume_cnt; @@ -1137,30 +1509,34 @@ dht_lookup (call_frame_t *frame, xlator_t *this, ret = dict_set_uint32 (local->xattr_req, GLUSTERFS_OPEN_FD_COUNT, 4); - for (i = 0; i < layout->cnt; i++) { + /* need it for dir self-heal */ + dht_check_and_set_acl_xattr_req (loc->inode, local->xattr_req); + + for (i = 0; i < call_cnt; i++) { subvol = layout->list[i].xlator; STACK_WIND (frame, dht_revalidate_cbk, subvol, subvol->fops->lookup, &local->loc, local->xattr_req); - if (!--call_cnt) - break; } } else { do_fresh_lookup: /* TODO: remove the hard-coding */ ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht", 4 * 4); + conf->xattr_name, 4 * 4); ret = dict_set_uint32 (local->xattr_req, - DHT_LINKFILE_KEY, 256); + conf->link_xattr_name, 256); /* need it for self-healing linkfiles which is 'in-migration' state */ ret = dict_set_uint32 (local->xattr_req, GLUSTERFS_OPEN_FD_COUNT, 4); + /* need it for dir self-heal */ + dht_check_and_set_acl_xattr_req (loc->inode, local->xattr_req); + if (!hashed_subvol) { gf_log (this->name, GF_LOG_DEBUG, "no subvolume in layout for path=%s, " @@ -1194,7 +1570,8 @@ dht_lookup (call_frame_t *frame, xlator_t *this, err: op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); + DHT_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, + NULL); return 0; } @@ -1202,7 +1579,7 @@ err: int dht_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { dht_local_t *local = NULL; call_frame_t *prev = NULL; @@ -1226,14 +1603,18 @@ dht_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->postparent = *postparent; local->preparent = *preparent; - WIPE (&local->postparent); - WIPE (&local->preparent); + if (local->loc.parent) { + dht_inode_ctx_time_update (local->loc.parent, this, + &local->preparent, 0); + dht_inode_ctx_time_update (local->loc.parent, this, + &local->postparent, 1); + } } unlock: UNLOCK (&frame->lock); DHT_STACK_UNWIND (unlink, frame, local->op_ret, local->op_errno, - &local->preparent, &local->postparent); + &local->preparent, &local->postparent, NULL); return 0; } @@ -1242,7 +1623,7 @@ unlock: int dht_unlink_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { dht_local_t *local = NULL; call_frame_t *prev = NULL; @@ -1254,7 +1635,8 @@ dht_unlink_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, LOCK (&frame->lock); { - if (op_ret == -1) { + if ((op_ret == -1) && !((op_errno == ENOENT) || + (op_errno == ENOTCONN))) { local->op_errno = op_errno; gf_log (this->name, GF_LOG_DEBUG, "subvolume %s returned -1 (%s)", @@ -1267,7 +1649,7 @@ dht_unlink_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, unlock: UNLOCK (&frame->lock); - if (op_ret == -1) + if (local->op_ret == -1) goto err; cached_subvol = dht_subvol_get_cached (this, local->loc.inode); @@ -1281,53 +1663,19 @@ unlock: STACK_WIND (frame, dht_unlink_cbk, cached_subvol, 
cached_subvol->fops->unlink, - &local->loc); + &local->loc, local->flags, NULL); return 0; err: DHT_STACK_UNWIND (unlink, frame, -1, local->op_errno, - NULL, NULL); + NULL, NULL, NULL); return 0; } -static int -dht_ufo_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno) -{ - dht_local_t *local = NULL; - int this_call_cnt = 0; - call_frame_t *prev = NULL; - - local = frame->local; - prev = cookie; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - local->op_ret = -1; - local->op_errno = op_errno; - gf_log (this->name, GF_LOG_DEBUG, - "subvolume %s returned -1 (%s)", - prev->this->name, strerror (op_errno)); - goto unlock; - } - } -unlock: - UNLOCK (&frame->lock); - - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) { - DHT_STACK_UNWIND (setxattr, frame, local->op_ret, local->op_errno); - } - - return 0; -} - - int dht_err_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno) + int op_ret, int op_errno, dict_t *xdata) { dht_local_t *local = NULL; int this_call_cnt = 0; @@ -1353,7 +1701,8 @@ unlock: this_call_cnt = dht_frame_return (frame); if (is_last_call (this_call_cnt)) { - DHT_STACK_UNWIND (setxattr, frame, local->op_ret, local->op_errno); + DHT_STACK_UNWIND (setxattr, frame, local->op_ret, + local->op_errno, NULL); } return 0; @@ -1376,114 +1725,249 @@ fill_layout_info (dht_layout_t *layout, char *buf) } } +void +dht_fill_pathinfo_xattr (xlator_t *this, dht_local_t *local, + char *xattr_buf, int32_t alloc_len, + int flag, char *layout_buf) +{ + if (flag && local->xattr_val) + snprintf (xattr_buf, alloc_len, + "((<"DHT_PATHINFO_HEADER"%s> %s) (%s-layout %s))", + this->name, local->xattr_val, this->name, + layout_buf); + else if (local->xattr_val) + snprintf (xattr_buf, alloc_len, + "(<"DHT_PATHINFO_HEADER"%s> %s)", + this->name, local->xattr_val); + else if (flag) + snprintf (xattr_buf, alloc_len, "(%s-layout %s)", + this->name, layout_buf); +} + int 
-dht_pathinfo_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, dict_t *xattr) +dht_vgetxattr_alloc_and_fill (dht_local_t *local, dict_t *xattr, xlator_t *this, + int op_errno) +{ + int ret = -1; + char *value = NULL; + int32_t plen = 0; + + ret = dict_get_str (xattr, local->xsel, &value); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Subvolume %s returned -1 (%s)", this->name, + strerror (op_errno)); + local->op_ret = -1; + local->op_errno = op_errno; + goto out; + } + + local->alloc_len += strlen(value); + + if (!local->xattr_val) { + local->alloc_len += (strlen (DHT_PATHINFO_HEADER) + 10); + local->xattr_val = GF_CALLOC (local->alloc_len, sizeof (char), + gf_common_mt_char); + if (!local->xattr_val) { + ret = -1; + goto out; + } + } + + if (local->xattr_val) { + plen = strlen (local->xattr_val); + if (plen) { + /* extra byte(s) for \0 to be safe */ + local->alloc_len += (plen + 2); + local->xattr_val = GF_REALLOC (local->xattr_val, + local->alloc_len); + if (!local->xattr_val) { + ret = -1; + goto out; + } + } + + (void) strcat (local->xattr_val, value); + (void) strcat (local->xattr_val, " "); + local->op_ret = 0; + } + + ret = 0; + + out: + return ret; +} + +int +dht_vgetxattr_fill_and_set (dht_local_t *local, dict_t **dict, xlator_t *this, + gf_boolean_t flag) +{ + int ret = -1; + char *xattr_buf = NULL; + char layout_buf[8192] = {0,}; + + if (flag) + fill_layout_info (local->layout, layout_buf); + + *dict = dict_new (); + if (!*dict) + goto out; + + local->xattr_val[strlen (local->xattr_val) - 1] = '\0'; + + /* we would need max this many bytes to create xattr string + * extra 40 bytes is just an estimated amount of additional + * space required as we include translator name and some + * spaces, brackets etc. when forming the pathinfo string. 
+ * + * For node-uuid we just don't have all the pretty formatting, + * but since this is a generic routine for pathinfo & node-uuid + * we dont have conditional space allocation and try to be + * generic + */ + local->alloc_len += (2 * strlen (this->name)) + + strlen (layout_buf) + + 40; + xattr_buf = GF_CALLOC (local->alloc_len, sizeof (char), + gf_common_mt_char); + if (!xattr_buf) + goto out; + + if (XATTR_IS_PATHINFO (local->xsel)) { + (void) dht_fill_pathinfo_xattr (this, local, xattr_buf, + local->alloc_len, flag, + layout_buf); + } else if (XATTR_IS_NODE_UUID (local->xsel)) { + (void) snprintf (xattr_buf, local->alloc_len, "%s", + local->xattr_val); + } else { + gf_log (this->name, GF_LOG_WARNING, + "Unknown local->xsel (%s)", local->xsel); + goto out; + } + + ret = dict_set_dynstr (*dict, local->xsel, xattr_buf); + GF_FREE (local->xattr_val); + + out: + return ret; +} + +int +dht_vgetxattr_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) { - dht_local_t *local = NULL; int ret = 0; - int flag = 0; + dht_local_t *local = NULL; int this_call_cnt = 0; - char *value_got = NULL; - char layout_buf[8192] = {0,}; - char *xattr_buf = NULL; dict_t *dict = NULL; - int32_t alloc_len = 0; - int32_t plen = 0; - local = frame->local; + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (frame->local, out); - if (op_ret != -1) { - ret = dict_get_str (xattr, GF_XATTR_PATHINFO_KEY, &value_got); - if (!ret) { - alloc_len = strlen (value_got); + local = frame->local; - /** - * allocate the buffer:- we allocate 10 bytes extra in case we need to - * append ' Link: ' in the buffer for another STACK_WIND - */ - if (!local->pathinfo) { - alloc_len += (strlen (DHT_PATHINFO_HEADER) + 10); - local->pathinfo = GF_CALLOC (alloc_len, sizeof (char), gf_common_mt_char); + LOCK (&frame->lock); + { + this_call_cnt = --local->call_cnt; + if (op_ret < 0) { + if (op_errno != ENOTCONN) { + gf_log (this->name, GF_LOG_ERROR, + 
"getxattr err (%s) for dir", + strerror (op_errno)); + local->op_ret = -1; + local->op_errno = op_errno; } - if (local->pathinfo) { - plen = strlen (local->pathinfo); - if (plen) { - /* extra byte(s) for \0 to be safe */ - alloc_len += (plen + 2); - local->pathinfo = GF_REALLOC (local->pathinfo, - alloc_len); - if (!local->pathinfo) - goto out; - } - - strcat (local->pathinfo, value_got); - } + goto unlock; } + + ret = dht_vgetxattr_alloc_and_fill (local, xattr, this, + op_errno); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "alloc or fill failure"); } + unlock: + UNLOCK (&frame->lock); - out: - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) { - if (local->layout->cnt > 1) { - /* Set it for directory */ - fill_layout_info (local->layout, layout_buf); - flag = 1; - } + if (!is_last_call (this_call_cnt)) + goto out; - dict = dict_new (); + /* -- last call: do patch ups -- */ - /* we would need max-to-max this many bytes to create pathinfo string */ - alloc_len += (2 * strlen (this->name)) + strlen (layout_buf) + 40; - xattr_buf = GF_CALLOC (alloc_len, sizeof (char), gf_common_mt_char); + if (local->op_ret == -1) { + goto unwind; + } - if (flag && local->pathinfo) - snprintf (xattr_buf, alloc_len, "((<"DHT_PATHINFO_HEADER"%s> %s) (%s-layout %s))", - this->name, local->pathinfo, this->name, - layout_buf); - else if (local->pathinfo) - snprintf (xattr_buf, alloc_len, "(<"DHT_PATHINFO_HEADER"%s> %s)", - this->name, local->pathinfo); - else if (flag) - snprintf (xattr_buf, alloc_len, "(%s-layout %s)", - this->name, layout_buf); + ret = dht_vgetxattr_fill_and_set (local, &dict, this, _gf_true); + if (ret) + goto unwind; - ret = dict_set_dynstr (dict, GF_XATTR_PATHINFO_KEY, - xattr_buf); + DHT_STACK_UNWIND (getxattr, frame, 0, 0, dict, xdata); + goto cleanup; - if (local->pathinfo) - GF_FREE (local->pathinfo); + unwind: + DHT_STACK_UNWIND (getxattr, frame, -1, local->op_errno, NULL, NULL); + cleanup: + if (dict) + dict_unref (dict); + out: 
+ return 0; +} - DHT_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict); +int +dht_vgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) +{ + dht_local_t *local = NULL; + int ret = 0; + dict_t *dict = NULL; + call_frame_t *prev = NULL; + gf_boolean_t flag = _gf_true; - if (dict) - dict_unref (dict); + local = frame->local; + prev = cookie; - return 0; + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = op_errno; + gf_log (this->name, GF_LOG_ERROR, "Subvolume %s returned -1 " + "(%s)", prev->this->name, strerror (op_errno)); + goto unwind; } - if (local->pathinfo) - strcat (local->pathinfo, " Link: "); - if (local->hashed_subvol) { - /* This will happen if there pending */ - STACK_WIND (frame, dht_pathinfo_getxattr_cbk, local->hashed_subvol, - local->hashed_subvol->fops->getxattr, - &local->loc, local->key); - - return 0; + ret = dht_vgetxattr_alloc_and_fill (local, xattr, this, + op_errno); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "alloc or fill failure"); + goto unwind; } - gf_log ("this->name", GF_LOG_ERROR, "Unable to find hashed_subvol for path" - " %s", local->pathinfo); + flag = (local->layout->cnt > 1) ? 
_gf_true : _gf_false; + + ret = dht_vgetxattr_fill_and_set (local, &dict, this, flag); + if (ret) + goto unwind; + + DHT_STACK_UNWIND (getxattr, frame, 0, 0, dict, xdata); + goto cleanup; + + unwind: + DHT_STACK_UNWIND (getxattr, frame, -1, local->op_errno, + NULL, NULL); + cleanup: + if (dict) + dict_unref (dict); - DHT_STACK_UNWIND (getxattr, frame, -1, op_errno, dict); return 0; } int dht_linkinfo_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, dict_t *xattr) + int op_ret, int op_errno, dict_t *xattr, + dict_t *xdata) { int ret = 0; char *value = NULL; @@ -1498,21 +1982,24 @@ dht_linkinfo_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } } - DHT_STACK_UNWIND (getxattr, frame, op_ret, op_errno, xattr); + DHT_STACK_UNWIND (getxattr, frame, op_ret, op_errno, xattr, xdata); return 0; } int dht_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, dict_t *xattr) + int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) { int this_call_cnt = 0; dht_local_t *local = NULL; + dht_conf_t *conf = NULL; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (frame->local, out); + VALIDATE_OR_GOTO (this->private, out); + conf = this->private; local = frame->local; this_call_cnt = dht_frame_return (frame); @@ -1520,8 +2007,8 @@ dht_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (!xattr || (op_ret == -1)) goto out; - if (dict_get (xattr, "trusted.glusterfs.dht")) { - dict_del (xattr, "trusted.glusterfs.dht"); + if (dict_get (xattr, conf->xattr_name)) { + dict_del (xattr, conf->xattr_name); } local->op_ret = 0; @@ -1538,24 +2025,88 @@ dht_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } out: if (is_last_call (this_call_cnt)) { - DHT_STACK_UNWIND (getxattr, frame, local->op_ret, op_errno, local->xattr); + DHT_STACK_UNWIND (getxattr, frame, local->op_ret, op_errno, + local->xattr, NULL); } return 0; } int32_t dht_getxattr_unwind (call_frame_t 
*frame, - int op_ret, int op_errno, dict_t *dict) + int op_ret, int op_errno, dict_t *dict, dict_t *xdata) { - DHT_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict); + DHT_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, xdata); return 0; } int +dht_getxattr_get_real_filename_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, + dict_t *xattr, dict_t *xdata) +{ + int this_call_cnt = 0; + dht_local_t *local = NULL; + + + local = frame->local; + + if (op_ret != -1) { + if (local->xattr) + dict_unref (local->xattr); + local->xattr = dict_ref (xattr); + + if (local->xattr_req) + dict_unref (local->xattr_req); + local->xattr_req = dict_ref (xdata); + } + + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) { + DHT_STACK_UNWIND (getxattr, frame, local->op_ret, op_errno, + local->xattr, local->xattr_req); + } + + return 0; +} + + +int +dht_getxattr_get_real_filename (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *key, dict_t *xdata) +{ + dht_local_t *local = NULL; + int i = 0; + dht_layout_t *layout = NULL; + int cnt = 0; + xlator_t *subvol = NULL; + + + local = frame->local; + layout = local->layout; + + cnt = local->call_cnt = layout->cnt; + + local->op_ret = -1; + local->op_errno = ENODATA; + + for (i = 0; i < cnt; i++) { + subvol = layout->list[i].xlator; + STACK_WIND (frame, dht_getxattr_get_real_filename_cbk, + subvol, subvol->fops->getxattr, + loc, key, xdata); + } + + return 0; +} + + +int dht_getxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *key) + loc_t *loc, const char *key, dict_t *xdata) +#define DHT_IS_DIR(layout) (layout->cnt > 1) { + xlator_t *subvol = NULL; xlator_t *hashed_subvol = NULL; xlator_t *cached_subvol = NULL; @@ -1571,7 +2122,6 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (loc, err); VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); VALIDATE_OR_GOTO 
(this->private, err); conf = this->private; @@ -1599,24 +2149,67 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, } } - if (key && (strcmp (key, GF_XATTR_PATHINFO_KEY) == 0)) { - hashed_subvol = dht_subvol_get_hashed (this, loc); + if (key && + (strncmp (key, GF_XATTR_GET_REAL_FILENAME_KEY, + strlen (GF_XATTR_GET_REAL_FILENAME_KEY)) == 0) + && DHT_IS_DIR(layout)) { + dht_getxattr_get_real_filename (frame, this, loc, key, xdata); + return 0; + } + + /* for file use cached subvolume (obviously!): see if {} + * below + * for directory: + * wind to all subvolumes and exclude subvolumes which + * return ENOTCONN (in callback) + * + * NOTE: Don't trust inode here, as that may not be valid + * (until inode_link() happens) + */ + if (key && DHT_IS_DIR(layout) && + ((strcmp (key, GF_XATTR_PATHINFO_KEY) == 0) + || (strcmp (key, GF_XATTR_NODE_UUID_KEY) == 0))) { + (void) strncpy (local->xsel, key, 256); + cnt = local->call_cnt = layout->cnt; + for (i = 0; i < cnt; i++) { + subvol = layout->list[i].xlator; + STACK_WIND (frame, dht_vgetxattr_dir_cbk, + subvol, subvol->fops->getxattr, + loc, key, NULL); + } + return 0; + } + + /* node-uuid or pathinfo for files */ + if (key && ((strcmp (key, GF_XATTR_NODE_UUID_KEY) == 0) + || (strcmp (key, GF_XATTR_PATHINFO_KEY) == 0))) { cached_subvol = local->cached_subvol; + (void) strncpy (local->xsel, key, 256); local->call_cnt = 1; - if (hashed_subvol != cached_subvol) { - local->call_cnt = 2; - local->hashed_subvol = hashed_subvol; - } - - STACK_WIND (frame, dht_pathinfo_getxattr_cbk, cached_subvol, - cached_subvol->fops->getxattr, loc, key); + STACK_WIND (frame, dht_vgetxattr_cbk, cached_subvol, + cached_subvol->fops->getxattr, loc, key, NULL); return 0; } + if (key && (strcmp (key, GF_XATTR_LINKINFO_KEY) == 0)) { hashed_subvol = dht_subvol_get_hashed (this, loc); + if (!hashed_subvol) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get" + "hashed subvol for %s", loc->path); + op_errno = EINVAL; + goto err; + } + cached_subvol = 
dht_subvol_get_cached (this, loc->inode); + if (!cached_subvol) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get" + "cached subvol for %s", loc->path); + op_errno = EINVAL; + goto err; + } + if (hashed_subvol == cached_subvol) { op_errno = ENODATA; goto err; @@ -1624,7 +2217,7 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, if (hashed_subvol) { STACK_WIND (frame, dht_linkinfo_getxattr_cbk, hashed_subvol, hashed_subvol->fops->getxattr, loc, - GF_XATTR_PATHINFO_KEY); + GF_XATTR_PATHINFO_KEY, NULL); return 0; } op_errno = ENODATA; @@ -1632,13 +2225,13 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, } if (key && (!strcmp (GF_XATTR_MARKER_KEY, key)) - && (-1 == frame->root->pid)) { - - if (loc->inode-> ia_type == IA_IFDIR) { + && (GF_CLIENT_PID_GSYNCD == frame->root->pid)) { + if (DHT_IS_DIR(layout)) { cnt = layout->cnt; } else { cnt = 1; } + sub_volumes = alloca ( cnt * sizeof (xlator_t *)); for (i = 0; i < cnt; i++) *(sub_volumes + i) = layout->list[i].xlator; @@ -1646,7 +2239,8 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, if (cluster_getmarkerattr (frame, this, loc, key, local, dht_getxattr_unwind, sub_volumes, cnt, - MARKER_UUID_TYPE, conf->vol_uuid)) { + MARKER_UUID_TYPE, marker_uuid_default_gauge, + conf->vol_uuid)) { op_errno = EINVAL; goto err; } @@ -1656,8 +2250,8 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, if (key && *conf->vol_uuid) { if ((match_uuid_local (key, conf->vol_uuid) == 0) && - (-1 == frame->root->pid)) { - if (loc->inode-> ia_type == IA_IFDIR) { + (GF_CLIENT_PID_GSYNCD == frame->root->pid)) { + if (DHT_IS_DIR(layout)) { cnt = layout->cnt; } else { cnt = 1; @@ -1670,6 +2264,7 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, local, dht_getxattr_unwind, sub_volumes, cnt, MARKER_XTIME_TYPE, + marker_xtime_default_gauge, conf->vol_uuid)) { op_errno = EINVAL; goto err; @@ -1679,7 +2274,7 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, } } - if (loc->inode-> ia_type == IA_IFDIR) { + if (DHT_IS_DIR(layout)) { 
cnt = local->call_cnt = layout->cnt; } else { cnt = local->call_cnt = 1; @@ -1689,29 +2284,100 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, subvol = layout->list[i].xlator; STACK_WIND (frame, dht_getxattr_cbk, subvol, subvol->fops->getxattr, - loc, key); + loc, key, NULL); } return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (getxattr, frame, -1, op_errno, NULL); + DHT_STACK_UNWIND (getxattr, frame, -1, op_errno, NULL, NULL); + + return 0; +} +#undef DHT_IS_DIR + +int +dht_fgetxattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *key, dict_t *xdata) +{ + xlator_t *subvol = NULL; + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + int op_errno = -1; + int i = 0; + int cnt = 0; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + VALIDATE_OR_GOTO (this->private, err); + + local = dht_local_init (frame, NULL, fd, GF_FOP_FGETXATTR); + if (!local) { + op_errno = ENOMEM; + + goto err; + } + + layout = local->layout; + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "layout is NULL"); + op_errno = ENOENT; + goto err; + } + + if (key) { + local->key = gf_strdup (key); + if (!local->key) { + op_errno = ENOMEM; + goto err; + } + } + + if ((fd->inode->ia_type == IA_IFDIR) + && (strncmp (key, GF_XATTR_LOCKINFO_KEY, + strlen (GF_XATTR_LOCKINFO_KEY) != 0))) { + cnt = local->call_cnt = layout->cnt; + } else { + cnt = local->call_cnt = 1; + } + + for (i = 0; i < cnt; i++) { + subvol = layout->list[i].xlator; + STACK_WIND (frame, dht_getxattr_cbk, + subvol, subvol->fops->fgetxattr, + fd, key, NULL); + } + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND (fgetxattr, frame, -1, op_errno, NULL, NULL); return 0; } int dht_fsetxattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, dict_t *xattr, int flags) + fd_t *fd, dict_t *xattr, int flags, dict_t *xdata) { xlator_t *subvol = NULL; dht_local_t *local = NULL; int op_errno = EINVAL; + dht_conf_t *conf = NULL; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (fd, err); VALIDATE_OR_GOTO (fd->inode, err); + VALIDATE_OR_GOTO (this->private, err); + + conf = this->private; + + GF_IF_INTERNAL_XATTR_GOTO (conf->wild_xattr_name, xattr, + op_errno, err); local = dht_local_init (frame, NULL, fd, GF_FOP_FSETXATTR); if (!local) { @@ -1730,13 +2396,13 @@ dht_fsetxattr (call_frame_t *frame, xlator_t *this, local->call_cnt = 1; STACK_WIND (frame, dht_err_cbk, subvol, subvol->fops->fsetxattr, - fd, xattr, flags); + fd, xattr, flags, NULL); return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (fsetxattr, frame, -1, op_errno); + DHT_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL); return 0; } @@ -1744,16 +2410,18 @@ err: static int dht_common_setxattr_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *xdata) { - DHT_STACK_UNWIND (setxattr, frame, op_ret, op_errno); + DHT_STACK_UNWIND (setxattr, frame, op_ret, op_errno, xdata); return 0; } int dht_checking_pathinfo_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, dict_t *xattr) + int op_ret, int op_errno, dict_t *xattr, + dict_t *xdata) { int i = -1; int ret = -1; @@ -1785,7 +2453,7 @@ dht_checking_pathinfo_cbk (call_frame_t *frame, void *cookie, xlator_t *this, out: this_call_cnt = dht_frame_return (frame); if (is_last_call (this_call_cnt)) { - DHT_STACK_UNWIND (setxattr, frame, local->op_ret, ENOTSUP); + DHT_STACK_UNWIND (setxattr, frame, local->op_ret, ENOTSUP, NULL); } return 0; @@ -1793,7 +2461,7 
@@ out: int dht_setxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *xattr, int flags) + loc_t *loc, dict_t *xattr, int flags, dict_t *xdata) { xlator_t *subvol = NULL; dht_local_t *local = NULL; @@ -1805,16 +2473,19 @@ dht_setxattr (call_frame_t *frame, xlator_t *this, data_t *tmp = NULL; uint32_t dir_spread = 0; char value[4096] = {0,}; - int forced_rebalance = 0; - + gf_dht_migrate_data_type_t forced_rebalance = GF_DHT_MIGRATE_DATA; + int call_cnt = 0; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (loc, err); VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); conf = this->private; + + GF_IF_INTERNAL_XATTR_GOTO (conf->wild_xattr_name, xattr, + op_errno, err); + local = dht_local_init (frame, loc, NULL, GF_FOP_SETXATTR); if (!local) { op_errno = ENOMEM; @@ -1837,26 +2508,7 @@ dht_setxattr (call_frame_t *frame, xlator_t *this, goto err; } - local->call_cnt = layout->cnt; - - /* This key is sent by Unified File and Object storage - * to test xattr support in backend. 
- */ - tmp = dict_get (xattr, "user.ufo-test"); - if (tmp) { - if (IA_ISREG (loc->inode->ia_type)) { - op_errno = ENOTSUP; - goto err; - } - local->op_ret = 0; - for (i = 0; i < layout->cnt; i++) { - STACK_WIND (frame, dht_ufo_xattr_cbk, - layout->list[i].xlator, - layout->list[i].xlator->fops->setxattr, - loc, xattr, flags); - } - return 0; - } + local->call_cnt = call_cnt = layout->cnt; tmp = dict_get (xattr, "distribute.migrate-data"); if (tmp) { @@ -1869,9 +2521,20 @@ dht_setxattr (call_frame_t *frame, xlator_t *this, (ie, 'target' subvolume given there, etc) */ memcpy (value, tmp->data, tmp->len); if (strcmp (value, "force") == 0) - forced_rebalance = 1; + forced_rebalance = + GF_DHT_MIGRATE_DATA_EVEN_IF_LINK_EXISTS; + + if (conf->decommission_in_progress) + forced_rebalance = GF_DHT_MIGRATE_HARDLINK; local->rebalance.target_node = dht_subvol_get_hashed (this, loc); + if (!local->rebalance.target_node) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get " + "hashed subvol for %s", loc->path); + op_errno = EINVAL; + goto err; + } + local->rebalance.from_subvol = local->cached_subvol; if (local->rebalance.target_node == local->rebalance.from_subvol) { @@ -1911,7 +2574,7 @@ dht_setxattr (call_frame_t *frame, xlator_t *this, STACK_WIND (frame, dht_checking_pathinfo_cbk, conf->subvolumes[i], conf->subvolumes[i]->fops->getxattr, - loc, GF_XATTR_PATHINFO_KEY); + loc, GF_XATTR_PATHINFO_KEY, NULL); } return 0; } @@ -1921,9 +2584,13 @@ dht_setxattr (call_frame_t *frame, xlator_t *this, gf_log (this->name, GF_LOG_INFO, "fixing the layout of %s", loc->path); - dht_fix_directory_layout (frame, dht_common_setxattr_cbk, - layout); - return 0; + ret = dht_fix_directory_layout (frame, dht_common_setxattr_cbk, + layout); + if (ret) { + op_errno = ENOTCONN; + goto err; + } + return ret; } tmp = dict_get (xattr, "distribute.directory-spread-count"); @@ -1935,10 +2602,14 @@ dht_setxattr (call_frame_t *frame, xlator_t *this, (dir_spread > 0))) { layout->spread_cnt = dir_spread; - 
dht_fix_directory_layout (frame, - dht_common_setxattr_cbk, - layout); - return 0; + ret = dht_fix_directory_layout (frame, + dht_common_setxattr_cbk, + layout); + if (ret) { + op_errno = ENOTCONN; + goto err; + } + return ret; } gf_log (this->name, GF_LOG_ERROR, "wrong 'directory-spread-count' value (%s)", value); @@ -1946,18 +2617,18 @@ dht_setxattr (call_frame_t *frame, xlator_t *this, goto err; } - for (i = 0; i < layout->cnt; i++) { + for (i = 0; i < call_cnt; i++) { STACK_WIND (frame, dht_err_cbk, layout->list[i].xlator, layout->list[i].xlator->fops->setxattr, - loc, xattr, flags); + loc, xattr, flags, xdata); } return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (setxattr, frame, -1, op_errno); + DHT_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); return 0; } @@ -1965,7 +2636,7 @@ err: int dht_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno) + int op_ret, int op_errno, dict_t *xdata) { dht_local_t *local = NULL; int this_call_cnt = 0; @@ -1991,7 +2662,8 @@ unlock: this_call_cnt = dht_frame_return (frame); if (is_last_call (this_call_cnt)) { - DHT_STACK_UNWIND (removexattr, frame, local->op_ret, local->op_errno); + DHT_STACK_UNWIND (removexattr, frame, local->op_ret, + local->op_errno, NULL); } return 0; @@ -2000,20 +2672,27 @@ unlock: int dht_removexattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *key) + loc_t *loc, const char *key, dict_t *xdata) { xlator_t *subvol = NULL; int op_errno = -1; dht_local_t *local = NULL; dht_layout_t *layout = NULL; + int call_cnt = 0; + dht_conf_t *conf = NULL; int i; - VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (this->private, err); + + conf = this->private; + + GF_IF_NATIVE_XATTR_GOTO (conf->wild_xattr_name, key, op_errno, err); + + VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (loc, err); VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); local = dht_local_init 
(frame, loc, NULL, GF_FOP_REMOVEXATTR); if (!local) { @@ -2037,21 +2716,85 @@ dht_removexattr (call_frame_t *frame, xlator_t *this, goto err; } - local->call_cnt = layout->cnt; + local->call_cnt = call_cnt = layout->cnt; local->key = gf_strdup (key); - for (i = 0; i < layout->cnt; i++) { + for (i = 0; i < call_cnt; i++) { STACK_WIND (frame, dht_removexattr_cbk, layout->list[i].xlator, layout->list[i].xlator->fops->removexattr, - loc, key); + loc, key, NULL); } return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (removexattr, frame, -1, op_errno); + DHT_STACK_UNWIND (removexattr, frame, -1, op_errno, NULL); + + return 0; +} + +int +dht_fremovexattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *key, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + int call_cnt = 0; + dht_conf_t *conf = 0; + + int i; + + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (this->private, err); + + conf = this->private; + + GF_IF_NATIVE_XATTR_GOTO (conf->wild_xattr_name, key, op_errno, err); + + VALIDATE_OR_GOTO (frame, err); + + local = dht_local_init (frame, NULL, fd, GF_FOP_FREMOVEXATTR); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + subvol = local->cached_subvol; + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for inode=%s", + uuid_utoa (fd->inode->gfid)); + op_errno = EINVAL; + goto err; + } + + layout = local->layout; + if (!local->layout) { + gf_log (this->name, GF_LOG_DEBUG, + "no layout for inode=%s", uuid_utoa (fd->inode->gfid)); + op_errno = EINVAL; + goto err; + } + + local->call_cnt = call_cnt = layout->cnt; + local->key = gf_strdup (key); + + for (i = 0; i < call_cnt; i++) { + STACK_WIND (frame, dht_removexattr_cbk, + layout->list[i].xlator, + layout->list[i].xlator->fops->fremovexattr, + fd, key, NULL); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND (fremovexattr, frame, -1, op_errno, NULL); return 0; } @@ -2059,7 +2802,7 @@ err: int dht_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, fd_t *fd) + int op_ret, int op_errno, fd_t *fd, dict_t *xdata) { dht_local_t *local = NULL; int this_call_cnt = 0; @@ -2086,7 +2829,7 @@ unlock: this_call_cnt = dht_frame_return (frame); if (is_last_call (this_call_cnt)) DHT_STACK_UNWIND (open, frame, local->op_ret, local->op_errno, - local->fd); + local->fd, NULL); return 0; } @@ -2116,7 +2859,7 @@ dht_normalize_stats (struct statvfs *buf, unsigned long bsize, int dht_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct statvfs *statvfs) + int op_ret, int op_errno, struct statvfs *statvfs, dict_t *xdata) { dht_local_t *local = NULL; int this_call_cnt = 0; @@ -2162,14 +2905,14 @@ unlock: this_call_cnt = dht_frame_return (frame); if (is_last_call (this_call_cnt)) DHT_STACK_UNWIND (statfs, frame, local->op_ret, local->op_errno, - &local->statvfs); + &local->statvfs, xdata); return 0; } int -dht_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc) +dht_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { xlator_t *subvol = NULL; dht_local_t *local = NULL; @@ -2181,7 +2924,6 @@ dht_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc) VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (loc, err); VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); VALIDATE_OR_GOTO (this->private, err); conf = this->private; @@ -2198,7 +2940,8 @@ dht_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc) for (i = 0; i < conf->subvolume_cnt; i++) { STACK_WIND (frame, dht_statfs_cbk, conf->subvolumes[i], - conf->subvolumes[i]->fops->statfs, loc); + conf->subvolumes[i]->fops->statfs, loc, + xdata); } return 0; } @@ -2214,20 +2957,21 @@ dht_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc) local->call_cnt = 1; STACK_WIND (frame, 
dht_statfs_cbk, - subvol, subvol->fops->statfs, loc); + subvol, subvol->fops->statfs, loc, xdata); return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (statfs, frame, -1, op_errno, NULL); + DHT_STACK_UNWIND (statfs, frame, -1, op_errno, NULL, NULL); return 0; } int -dht_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd) +dht_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + dict_t *xdata) { dht_local_t *local = NULL; dht_conf_t *conf = NULL; @@ -2254,14 +2998,14 @@ dht_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd) STACK_WIND (frame, dht_fd_cbk, conf->subvolumes[i], conf->subvolumes[i]->fops->opendir, - loc, fd); + loc, fd, xdata); } return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (opendir, frame, -1, op_errno, NULL); + DHT_STACK_UNWIND (opendir, frame, -1, op_errno, NULL, NULL); return 0; } @@ -2269,7 +3013,7 @@ err: int dht_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, - int op_errno, gf_dirent_t *orig_entries) + int op_errno, gf_dirent_t *orig_entries, dict_t *xdata) { dht_local_t *local = NULL; gf_dirent_t entries; @@ -2282,6 +3026,7 @@ dht_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, dht_layout_t *layout = 0; dht_conf_t *conf = NULL; xlator_t *subvol = 0; + int ret = 0; INIT_LIST_HEAD (&entries.list); prev = cookie; @@ -2298,10 +3043,13 @@ dht_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, list_for_each_entry (orig_entry, (&orig_entries->list), list) { next_offset = orig_entry->d_off; - - if (check_is_linkfile_wo_dict (NULL, (&orig_entry->d_stat)) - || (check_is_dir (NULL, (&orig_entry->d_stat), NULL) - && (prev->this != dht_first_up_subvol (this)))) { + if (check_is_dir (NULL, (&orig_entry->d_stat), NULL) && + (prev->this != local->first_up_subvol)) { + continue; + } + if (check_is_linkfile (NULL, (&orig_entry->d_stat), + orig_entry->dict, + 
conf->link_xattr_name)) { continue; } @@ -2317,7 +3065,7 @@ dht_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, orig_entry->d_name); if (!subvol || (subvol != prev->this)) { /* TODO: Count the number of entries which need - linkfile to prove its existance in fs */ + linkfile to prove its existence in fs */ layout->search_unhashed++; } } @@ -2330,6 +3078,24 @@ dht_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, entry->d_type = orig_entry->d_type; entry->d_len = orig_entry->d_len; + if (orig_entry->dict) + entry->dict = dict_ref (orig_entry->dict); + + /* making sure we set the inode ctx right with layout, + currently possible only for non-directories, so for + directories don't set entry inodes */ + if (!IA_ISDIR(entry->d_stat.ia_type)) { + ret = dht_layout_preset (this, prev->this, + orig_entry->inode); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "failed to link the layout in inode"); + entry->inode = inode_ref (orig_entry->inode); + } else if (orig_entry->inode) { + dht_inode_ctx_time_update (orig_entry->inode, this, + &entry->d_stat, 1); + } + list_add_tail (&entry->list, &entries.list); count++; } @@ -2359,9 +3125,23 @@ done: goto unwind; } + if (conf->readdir_optimize == _gf_true) { + if (next_subvol != local->first_up_subvol) { + ret = dict_set_int32 (local->xattr, + GF_READDIR_SKIP_DIRS, 1); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "dict set failed"); + } else { + dict_del (local->xattr, + GF_READDIR_SKIP_DIRS); + } + } + STACK_WIND (frame, dht_readdirp_cbk, next_subvol, next_subvol->fops->readdirp, - local->fd, local->size, next_offset); + local->fd, local->size, next_offset, + local->xattr); return 0; } @@ -2369,7 +3149,7 @@ unwind: if (op_ret < 0) op_ret = 0; - DHT_STACK_UNWIND (readdirp, frame, op_ret, op_errno, &entries); + DHT_STACK_UNWIND (readdirp, frame, op_ret, op_errno, &entries, NULL); gf_dirent_free (&entries); @@ -2380,7 +3160,8 @@ unwind: int dht_readdir_cbk (call_frame_t 
*frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, gf_dirent_t *orig_entries) + int op_ret, int op_errno, gf_dirent_t *orig_entries, + dict_t *xdata) { dht_local_t *local = NULL; gf_dirent_t entries; @@ -2457,7 +3238,7 @@ done: STACK_WIND (frame, dht_readdir_cbk, next_subvol, next_subvol->fops->readdir, - local->fd, local->size, next_offset); + local->fd, local->size, next_offset, NULL); return 0; } @@ -2465,7 +3246,7 @@ unwind: if (op_ret < 0) op_ret = 0; - DHT_STACK_UNWIND (readdir, frame, op_ret, op_errno, &entries); + DHT_STACK_UNWIND (readdir, frame, op_ret, op_errno, &entries, NULL); gf_dirent_free (&entries); @@ -2475,17 +3256,21 @@ unwind: int dht_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t yoff, int whichop) + off_t yoff, int whichop, dict_t *dict) { dht_local_t *local = NULL; int op_errno = -1; xlator_t *xvol = NULL; off_t xoff = 0; - + int ret = 0; + dht_conf_t *conf = NULL; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (this->private, err); + + conf = this->private; local = dht_local_init (frame, NULL, NULL, whichop); if (!local) { @@ -2495,22 +3280,52 @@ dht_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, local->fd = fd_ref (fd); local->size = size; + local->xattr_req = (dict)? 
dict_ref (dict) : NULL; + local->first_up_subvol = dht_first_up_subvol (this); dht_deitransform (this, yoff, &xvol, (uint64_t *)&xoff); /* TODO: do proper readdir */ - if (whichop == GF_FOP_READDIR) - STACK_WIND (frame, dht_readdir_cbk, xvol, xvol->fops->readdir, - fd, size, xoff); - else + if (whichop == GF_FOP_READDIRP) { + if (dict) + local->xattr = dict_ref (dict); + else + local->xattr = dict_new (); + + if (local->xattr) { + ret = dict_set_uint32 (local->xattr, + conf->link_xattr_name, 256); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "failed to set '%s' key", + conf->link_xattr_name); + if (conf->readdir_optimize == _gf_true) { + if (xvol != local->first_up_subvol) { + ret = dict_set_int32 (local->xattr, + GF_READDIR_SKIP_DIRS, 1); + if (ret) + gf_log (this->name, + GF_LOG_ERROR, + "Dict set failed"); + } else { + dict_del (local->xattr, + GF_READDIR_SKIP_DIRS); + } + } + } + STACK_WIND (frame, dht_readdirp_cbk, xvol, xvol->fops->readdirp, - fd, size, xoff); + fd, size, xoff, local->xattr); + } else { + STACK_WIND (frame, dht_readdir_cbk, xvol, xvol->fops->readdir, + fd, size, xoff, local->xattr); + } return 0; err: op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (readdir, frame, -1, op_errno, NULL); + DHT_STACK_UNWIND (readdir, frame, -1, op_errno, NULL, NULL); return 0; } @@ -2518,7 +3333,7 @@ err: int dht_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t yoff) + off_t yoff, dict_t *xdata) { int op = GF_FOP_READDIR; dht_conf_t *conf = NULL; @@ -2539,15 +3354,15 @@ dht_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, op = GF_FOP_READDIRP; out: - dht_do_readdir (frame, this, fd, size, yoff, op); + dht_do_readdir (frame, this, fd, size, yoff, op, 0); return 0; } int dht_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t yoff) + off_t yoff, dict_t *dict) { - dht_do_readdir (frame, this, fd, size, yoff, GF_FOP_READDIRP); + dht_do_readdir (frame, this, fd, size, yoff, GF_FOP_READDIRP, dict); return 0; } @@ -2555,7 +3370,7 @@ dht_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, int dht_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno) + int op_ret, int op_errno, dict_t *xdata) { dht_local_t *local = NULL; int this_call_cnt = 0; @@ -2575,14 +3390,16 @@ dht_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, this_call_cnt = dht_frame_return (frame); if (is_last_call (this_call_cnt)) - DHT_STACK_UNWIND (fsyncdir, frame, local->op_ret, local->op_errno); + DHT_STACK_UNWIND (fsyncdir, frame, local->op_ret, + local->op_errno, xdata); return 0; } int -dht_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync) +dht_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, + int datasync, dict_t *xdata) { dht_local_t *local = NULL; dht_conf_t *conf = NULL; @@ -2609,14 +3426,14 @@ dht_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync) STACK_WIND (frame, dht_fsyncdir_cbk, conf->subvolumes[i], conf->subvolumes[i]->fops->fsyncdir, - fd, datasync); + fd, datasync, xdata); } return 0; err: op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (fsyncdir, frame, -1, op_errno); + DHT_STACK_UNWIND (fsyncdir, frame, -1, op_errno, NULL); return 0; } @@ -2626,9 +3443,9 @@ int dht_newfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { - call_frame_t *prev = NULL; + xlator_t *prev = NULL; int ret = -1; dht_local_t *local = NULL; @@ -2646,19 +3463,24 @@ dht_newfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, prev = cookie; if (local->loc.parent) { - WIPE (preparent); - WIPE (postparent); + + dht_inode_ctx_time_update (local->loc.parent, this, + preparent, 0); + dht_inode_ctx_time_update (local->loc.parent, this, + postparent, 1); } - ret = dht_layout_preset (this, prev->this, inode); + ret = dht_layout_preset (this, prev, inode); if (ret < 0) { gf_log (this->name, GF_LOG_DEBUG, "could not set pre-set layout for subvolume %s", - prev->this->name); + prev? prev->name: NULL); op_ret = -1; op_errno = EINVAL; goto out; } + if (local->linked == _gf_true) + dht_linkfile_attr_heal (frame, this); out: /* * FIXME: ia_size and st_blocks of preparent and postparent do not have @@ -2667,10 +3489,9 @@ out: * corresponding values from each of the subvolume. * See dht_iatt_merge for reference. 
*/ - DHT_STRIP_PHASE1_FLAGS (stbuf); - DHT_STACK_UNWIND (mknod, frame, op_ret, op_errno, inode, stbuf, preparent, - postparent); + DHT_STACK_UNWIND (mknod, frame, op_ret, op_errno, inode, stbuf, + preparent, postparent, xdata); return 0; } @@ -2679,7 +3500,8 @@ dht_mknod_linkfile_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent) + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { dht_local_t *local = NULL; xlator_t *cached_subvol = NULL; @@ -2688,22 +3510,28 @@ dht_mknod_linkfile_create_cbk (call_frame_t *frame, void *cookie, goto err; local = frame->local; + if (!local || !local->cached_subvol) { + op_errno = EINVAL; + goto err; + } + cached_subvol = local->cached_subvol; - STACK_WIND (frame, dht_newfile_cbk, - cached_subvol, cached_subvol->fops->mknod, - &local->loc, local->mode, local->rdev, - local->params); + STACK_WIND_COOKIE (frame, dht_newfile_cbk, (void *)cached_subvol, + cached_subvol, cached_subvol->fops->mknod, + &local->loc, local->mode, local->rdev, local->umask, + local->params); return 0; err: - DHT_STACK_UNWIND (mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL); + DHT_STACK_UNWIND (mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL); return 0; } int dht_mknod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dev_t rdev, dict_t *params) + loc_t *loc, mode_t mode, dev_t rdev, mode_t umask, dict_t *params) { xlator_t *subvol = NULL; int op_errno = -1; @@ -2735,11 +3563,13 @@ dht_mknod (call_frame_t *frame, xlator_t *this, gf_log (this->name, GF_LOG_TRACE, "creating %s on %s", loc->path, subvol->name); - STACK_WIND (frame, dht_newfile_cbk, - subvol, subvol->fops->mknod, - loc, mode, rdev, params); + STACK_WIND_COOKIE (frame, dht_newfile_cbk, (void *)subvol, + subvol, subvol->fops->mknod, loc, mode, + rdev, umask, params); } else { - avail_subvol = dht_free_disk_available_subvol 
(this, subvol); + + avail_subvol = dht_free_disk_available_subvol (this, subvol, + local); if (avail_subvol != subvol) { /* Choose the minimum filled volume, and create the files there */ @@ -2748,17 +3578,18 @@ dht_mknod (call_frame_t *frame, xlator_t *this, local->cached_subvol = avail_subvol; local->mode = mode; local->rdev = rdev; - + local->umask = umask; dht_linkfile_create (frame, dht_mknod_linkfile_create_cbk, - avail_subvol, subvol, loc); + this, avail_subvol, subvol, loc); } else { gf_log (this->name, GF_LOG_TRACE, "creating %s on %s", loc->path, subvol->name); - STACK_WIND (frame, dht_newfile_cbk, - subvol, subvol->fops->mknod, - loc, mode, rdev, params); + STACK_WIND_COOKIE (frame, dht_newfile_cbk, + (void *)subvol, subvol, + subvol->fops->mknod, loc, mode, + rdev, umask, params); } } @@ -2767,7 +3598,7 @@ dht_mknod (call_frame_t *frame, xlator_t *this, err: op_errno = (op_errno == -1) ? errno : op_errno; DHT_STACK_UNWIND (mknod, frame, -1, op_errno, - NULL, NULL, NULL, NULL); + NULL, NULL, NULL, NULL, NULL); return 0; } @@ -2775,7 +3606,7 @@ err: int dht_symlink (call_frame_t *frame, xlator_t *this, - const char *linkname, loc_t *loc, dict_t *params) + const char *linkname, loc_t *loc, mode_t umask, dict_t *params) { xlator_t *subvol = NULL; int op_errno = -1; @@ -2803,23 +3634,24 @@ dht_symlink (call_frame_t *frame, xlator_t *this, gf_log (this->name, GF_LOG_TRACE, "creating %s on %s", loc->path, subvol->name); - STACK_WIND (frame, dht_newfile_cbk, - subvol, subvol->fops->symlink, - linkname, loc, params); + STACK_WIND_COOKIE (frame, dht_newfile_cbk, (void *)subvol, subvol, + subvol->fops->symlink, linkname, loc, umask, + params); return 0; err: op_errno = (op_errno == -1) ? 
errno : op_errno; DHT_STACK_UNWIND (link, frame, -1, op_errno, - NULL, NULL, NULL, NULL); + NULL, NULL, NULL, NULL, NULL); return 0; } int -dht_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc) +dht_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, + dict_t *xdata) { xlator_t *cached_subvol = NULL; xlator_t *hashed_subvol = NULL; @@ -2837,7 +3669,7 @@ dht_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc) local->loc.path, cached_subvol->name, loc->path); STACK_WIND (frame, dht_unlink_cbk, cached_subvol, cached_subvol->fops->unlink, - &local->loc); + &local->loc, xflag, xdata); goto done; } @@ -2865,18 +3697,21 @@ dht_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc) goto err; } + local->flags = xflag; if (hashed_subvol != cached_subvol) { STACK_WIND (frame, dht_unlink_linkfile_cbk, - hashed_subvol, hashed_subvol->fops->unlink, loc); + hashed_subvol, hashed_subvol->fops->unlink, loc, + xflag, xdata); } else { STACK_WIND (frame, dht_unlink_cbk, - cached_subvol, cached_subvol->fops->unlink, loc); + cached_subvol, cached_subvol->fops->unlink, loc, + xflag, xdata); } done: return 0; err: op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL); + DHT_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL, NULL); return 0; } @@ -2886,13 +3721,16 @@ int dht_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { call_frame_t *prev = NULL; dht_layout_t *layout = NULL; + dht_local_t *local = NULL; prev = cookie; + local = frame->local; + if (op_ret == -1) goto out; @@ -2906,13 +3744,20 @@ dht_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto out; } - WIPE (preparent); - WIPE (postparent); - + if (local->loc.parent) { + dht_inode_ctx_time_update (local->loc.parent, this, + preparent, 0); + dht_inode_ctx_time_update (local->loc.parent, this, + postparent, 1); + } + if (local->linked == _gf_true) { + local->stbuf = *stbuf; + dht_linkfile_attr_heal (frame, this); + } out: DHT_STRIP_PHASE1_FLAGS (stbuf); DHT_STACK_UNWIND (link, frame, op_ret, op_errno, inode, stbuf, preparent, - postparent); + postparent, NULL); return 0; } @@ -2922,7 +3767,8 @@ int dht_link_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent) + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { dht_local_t *local = NULL; xlator_t *srcvol = NULL; @@ -2934,14 +3780,14 @@ dht_link_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, srcvol = local->linkfile.srcvol; STACK_WIND (frame, dht_link_cbk, srcvol, srcvol->fops->link, - &local->loc, &local->loc2); + &local->loc, &local->loc2, xdata); return 0; err: DHT_STRIP_PHASE1_FLAGS (stbuf); DHT_STACK_UNWIND (link, frame, op_ret, op_errno, inode, stbuf, preparent, - postparent); + postparent, NULL); return 0; } @@ -2949,7 +3795,7 @@ err: int dht_link (call_frame_t *frame, xlator_t *this, - loc_t 
*oldloc, loc_t *newloc) + loc_t *oldloc, loc_t *newloc, dict_t *xdata) { xlator_t *cached_subvol = NULL; xlator_t *hashed_subvol = NULL; @@ -2994,19 +3840,19 @@ dht_link (call_frame_t *frame, xlator_t *this, if (hashed_subvol != cached_subvol) { uuid_copy (local->gfid, oldloc->inode->gfid); - dht_linkfile_create (frame, dht_link_linkfile_cbk, + dht_linkfile_create (frame, dht_link_linkfile_cbk, this, cached_subvol, hashed_subvol, newloc); } else { STACK_WIND (frame, dht_link_cbk, cached_subvol, cached_subvol->fops->link, - oldloc, newloc); + oldloc, newloc, xdata); } return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (link, frame, -1, op_errno, NULL, NULL, NULL, NULL); + DHT_STACK_UNWIND (link, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); return 0; } @@ -3016,7 +3862,7 @@ int dht_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, fd_t *fd, inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent) + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { call_frame_t *prev = NULL; int ret = -1; @@ -3035,8 +3881,11 @@ dht_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, prev = cookie; if (local->loc.parent) { - WIPE (preparent); - WIPE (postparent); + dht_inode_ctx_time_update (local->loc.parent, this, + preparent, 0); + + dht_inode_ctx_time_update (local->loc.parent, this, + postparent, 1); } ret = dht_layout_preset (this, prev->this, inode); @@ -3048,11 +3897,14 @@ dht_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, op_errno = EINVAL; goto out; } - + if (local->linked == _gf_true) { + local->stbuf = *stbuf; + dht_linkfile_attr_heal (frame, this); + } out: DHT_STRIP_PHASE1_FLAGS (stbuf); DHT_STACK_UNWIND (create, frame, op_ret, op_errno, fd, inode, stbuf, preparent, - postparent); + postparent, NULL); return 0; } @@ -3062,7 +3914,8 @@ dht_create_linkfile_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t 
op_ret, int32_t op_errno, inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent) + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { dht_local_t *local = NULL; xlator_t *cached_subvol = NULL; @@ -3076,18 +3929,19 @@ dht_create_linkfile_create_cbk (call_frame_t *frame, void *cookie, STACK_WIND (frame, dht_create_cbk, cached_subvol, cached_subvol->fops->create, &local->loc, local->flags, local->mode, - local->fd, local->params); + local->umask, local->fd, local->params); return 0; err: - DHT_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); + DHT_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL, + NULL, NULL, NULL); return 0; } int dht_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, mode_t mode, - fd_t *fd, dict_t *params) + mode_t umask, fd_t *fd, dict_t *params) { int op_errno = -1; xlator_t *subvol = NULL; @@ -3113,7 +3967,7 @@ dht_create (call_frame_t *frame, xlator_t *this, local->loc.path, subvol->name, loc->path); STACK_WIND (frame, dht_create_cbk, subvol, subvol->fops->create, - &local->loc, flags, mode, fd, params); + &local->loc, flags, mode, umask, fd, params); goto done; } @@ -3131,38 +3985,38 @@ dht_create (call_frame_t *frame, xlator_t *this, "creating %s on %s", loc->path, subvol->name); STACK_WIND (frame, dht_create_cbk, subvol, subvol->fops->create, - loc, flags, mode, fd, params); + loc, flags, mode, umask, fd, params); goto done; } /* Choose the minimum filled volume, and create the files there */ - avail_subvol = dht_free_disk_available_subvol (this, subvol); + avail_subvol = dht_free_disk_available_subvol (this, subvol, local); if (avail_subvol != subvol) { local->params = dict_ref (params); local->flags = flags; local->mode = mode; - + local->umask = umask; local->cached_subvol = avail_subvol; local->hashed_subvol = subvol; gf_log (this->name, GF_LOG_TRACE, "creating %s on %s (link at %s)", loc->path, avail_subvol->name, subvol->name); - 
dht_linkfile_create (frame, - dht_create_linkfile_create_cbk, - avail_subvol, subvol, loc); + dht_linkfile_create (frame, dht_create_linkfile_create_cbk, + this, avail_subvol, subvol, loc); goto done; } gf_log (this->name, GF_LOG_TRACE, "creating %s on %s", loc->path, subvol->name); STACK_WIND (frame, dht_create_cbk, subvol, subvol->fops->create, - loc, flags, mode, fd, params); + loc, flags, mode, umask, fd, params); done: return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); + DHT_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL, + NULL, NULL, NULL); return 0; } @@ -3171,7 +4025,7 @@ err: int dht_mkdir_selfheal_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { dht_local_t *local = NULL; dht_layout_t *layout = NULL; @@ -3182,14 +4036,17 @@ dht_mkdir_selfheal_cbk (call_frame_t *frame, void *cookie, if (op_ret == 0) { dht_layout_set (this, local->inode, layout); if (local->loc.parent) { - WIPE (&local->preparent); - WIPE (&local->postparent); + dht_inode_ctx_time_update (local->loc.parent, this, + &local->preparent, 0); + + dht_inode_ctx_time_update (local->loc.parent, this, + &local->postparent, 1); } } DHT_STACK_UNWIND (mkdir, frame, op_ret, op_errno, local->inode, &local->stbuf, &local->preparent, - &local->postparent); + &local->postparent, NULL); return 0; } @@ -3197,12 +4054,12 @@ dht_mkdir_selfheal_cbk (call_frame_t *frame, void *cookie, int dht_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent) + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { dht_local_t *local = NULL; int this_call_cnt = 0; int ret = -1; - int subvol_filled = 0; + gf_boolean_t subvol_filled = _gf_false; call_frame_t *prev = NULL; dht_layout_t *layout = NULL; 
@@ -3218,6 +4075,15 @@ dht_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, ret = dht_layout_merge (this, layout, prev->this, -1, ENOSPC, NULL); } else { + if (op_ret == -1 && op_errno == EEXIST) + /* Very likely just a race between mkdir and + self-heal (from lookup of a concurrent mkdir + attempt). + Ignore error for now. layout setting will + anyways fail if this was a different (old) + pre-existing different directory. + */ + op_ret = 0; ret = dht_layout_merge (this, layout, prev->this, op_ret, op_errno, NULL); } @@ -3250,7 +4116,8 @@ int dht_mkdir_hashed_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent) + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { dht_local_t *local = NULL; int ret = -1; @@ -3296,6 +4163,8 @@ dht_mkdir_hashed_cbk (call_frame_t *frame, void *cookie, local->call_cnt = conf->subvolume_cnt - 1; + if (uuid_is_null (local->loc.gfid)) + uuid_copy (local->loc.gfid, stbuf->ia_gfid); if (local->call_cnt == 0) { dht_selfheal_directory (frame, dht_mkdir_selfheal_cbk, &local->loc, layout); @@ -3305,19 +4174,20 @@ dht_mkdir_hashed_cbk (call_frame_t *frame, void *cookie, continue; STACK_WIND (frame, dht_mkdir_cbk, conf->subvolumes[i], - conf->subvolumes[i]->fops->mkdir, - &local->loc, local->mode, local->params); + conf->subvolumes[i]->fops->mkdir, &local->loc, + local->mode, local->umask, local->params); } return 0; err: - DHT_STACK_UNWIND (mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL); + DHT_STACK_UNWIND (mkdir, frame, -1, op_errno, NULL, NULL, NULL, + NULL, NULL); return 0; } -int + int dht_mkdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dict_t *params) + loc_t *loc, mode_t mode, mode_t umask, dict_t *params) { dht_local_t *local = NULL; dht_conf_t *conf = NULL; @@ -3353,6 +4223,7 @@ dht_mkdir (call_frame_t *frame, xlator_t *this, local->hashed_subvol = hashed_subvol; 
local->mode = mode; + local->umask = umask; local->params = dict_ref (params); local->inode = inode_ref (loc->inode); @@ -3365,13 +4236,14 @@ dht_mkdir (call_frame_t *frame, xlator_t *this, STACK_WIND (frame, dht_mkdir_hashed_cbk, hashed_subvol, hashed_subvol->fops->mkdir, - loc, mode, params); + loc, mode, umask, params); return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL); + DHT_STACK_UNWIND (mkdir, frame, -1, op_errno, NULL, NULL, NULL, + NULL, NULL); return 0; } @@ -3379,14 +4251,87 @@ err: int dht_rmdir_selfheal_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno) + int op_ret, int op_errno, dict_t *xdata) { dht_local_t *local = NULL; local = frame->local; DHT_STACK_UNWIND (rmdir, frame, local->op_ret, local->op_errno, - &local->preparent, &local->postparent); + &local->preparent, &local->postparent, NULL); + + return 0; +} + + +int +dht_rmdir_hashed_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + call_frame_t *prev = NULL; + + local = frame->local; + prev = cookie; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + local->op_ret = -1; + if (op_errno != ENOENT && op_errno != EACCES) { + local->need_selfheal = 1; + } + + + gf_log (this->name, GF_LOG_DEBUG, + "rmdir on %s for %s failed (%s)", + prev->this->name, local->loc.path, + strerror (op_errno)); + goto unlock; + } + + dht_iatt_merge (this, &local->preparent, preparent, prev->this); + dht_iatt_merge (this, &local->postparent, postparent, + prev->this); + + } +unlock: + UNLOCK (&frame->lock); + + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) { + if (local->need_selfheal) { + local->layout = + dht_layout_get (this, local->loc.inode); + + /* TODO: neater interface 
needed below */ + local->stbuf.ia_type = local->loc.inode->ia_type; + + uuid_copy (local->gfid, local->loc.inode->gfid); + dht_selfheal_restore (frame, dht_rmdir_selfheal_cbk, + &local->loc, local->layout); + } else { + + if (local->loc.parent) { + dht_inode_ctx_time_update (local->loc.parent, + this, + &local->preparent, + 0); + + dht_inode_ctx_time_update (local->loc.parent, + this, + &local->postparent, + 1); + } + + DHT_STACK_UNWIND (rmdir, frame, local->op_ret, + local->op_errno, &local->preparent, + &local->postparent, NULL); + } + } return 0; } @@ -3395,11 +4340,12 @@ dht_rmdir_selfheal_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int dht_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { dht_local_t *local = NULL; int this_call_cnt = 0; call_frame_t *prev = NULL; + int done = 0; local = frame->local; prev = cookie; @@ -3410,8 +4356,9 @@ dht_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->op_errno = op_errno; local->op_ret = -1; - if (op_errno != ENOENT) + if (op_errno != ENOENT && op_errno != EACCES) { local->need_selfheal = 1; + } gf_log (this->name, GF_LOG_DEBUG, "rmdir on %s for %s failed (%s)", @@ -3420,6 +4367,8 @@ dht_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto unlock; } + /* Track if rmdir succeeded on atleast one subvol*/ + local->fop_succeeded = 1; dht_iatt_merge (this, &local->preparent, preparent, prev->this); dht_iatt_merge (this, &local->postparent, postparent, prev->this); @@ -3429,8 +4378,17 @@ unlock: this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) { - if (local->need_selfheal) { + + /* if local->hashed_subvol, we are yet to wind to hashed_subvol. 
*/ + if (local->hashed_subvol && (this_call_cnt == 1)) { + done = 1; + } else if (!local->hashed_subvol && !this_call_cnt) { + done = 1; + } + + + if (done) { + if (local->need_selfheal && local->fop_succeeded) { local->layout = dht_layout_get (this, local->loc.inode); @@ -3440,15 +4398,34 @@ unlock: uuid_copy (local->gfid, local->loc.inode->gfid); dht_selfheal_restore (frame, dht_rmdir_selfheal_cbk, &local->loc, local->layout); - } else { + } else if (this_call_cnt) { + /* If non-hashed subvol's have responded, proceed */ + + local->need_selfheal = 0; + STACK_WIND (frame, dht_rmdir_hashed_subvol_cbk, + local->hashed_subvol, + local->hashed_subvol->fops->rmdir, + &local->loc, local->flags, NULL); + } else if (!this_call_cnt) { + /* All subvol's have responded, proceed */ + if (local->loc.parent) { - WIPE (&local->preparent); - WIPE (&local->postparent); + + dht_inode_ctx_time_update (local->loc.parent, + this, + &local->preparent, + 0); + + dht_inode_ctx_time_update (local->loc.parent, + this, + &local->postparent, + 1); + } DHT_STACK_UNWIND (rmdir, frame, local->op_ret, local->op_errno, &local->preparent, - &local->postparent); + &local->postparent, NULL); } } @@ -3462,6 +4439,7 @@ dht_rmdir_do (call_frame_t *frame, xlator_t *this) dht_local_t *local = NULL; dht_conf_t *conf = NULL; int i = 0; + xlator_t *hashed_subvol = NULL; VALIDATE_OR_GOTO (this->private, err); @@ -3473,18 +4451,41 @@ dht_rmdir_do (call_frame_t *frame, xlator_t *this) local->call_cnt = conf->subvolume_cnt; + /* first remove from non-hashed_subvol */ + hashed_subvol = dht_subvol_get_hashed (this, &local->loc); + + if (!hashed_subvol) { + gf_log (this->name, GF_LOG_WARNING, "failed to get hashed " + "subvol for %s",local->loc.path); + } else { + local->hashed_subvol = hashed_subvol; + } + + /* When DHT has only 1 child */ + if (conf->subvolume_cnt == 1) { + STACK_WIND (frame, dht_rmdir_hashed_subvol_cbk, + conf->subvolumes[0], + conf->subvolumes[0]->fops->rmdir, + &local->loc, local->flags, 
NULL); + return 0; + } + for (i = 0; i < conf->subvolume_cnt; i++) { + if (hashed_subvol && + (hashed_subvol == conf->subvolumes[i])) + continue; + STACK_WIND (frame, dht_rmdir_cbk, conf->subvolumes[i], conf->subvolumes[i]->fops->rmdir, - &local->loc, local->flags); + &local->loc, local->flags, NULL); } return 0; err: DHT_STACK_UNWIND (rmdir, frame, local->op_ret, local->op_errno, - &local->preparent, &local->postparent); + &local->preparent, &local->postparent, NULL); return 0; } @@ -3492,7 +4493,7 @@ err: int dht_rmdir_linkfile_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { dht_local_t *local = NULL; call_frame_t *prev = NULL; @@ -3540,6 +4541,7 @@ dht_rmdir_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, call_frame_t *main_frame = NULL; dht_local_t *main_local = NULL; int this_call_cnt = 0; + dht_conf_t *conf = this->private; local = frame->local; prev = cookie; @@ -3551,7 +4553,7 @@ dht_rmdir_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (op_ret != 0) goto err; - if (check_is_linkfile (inode, stbuf, xattr) == 0) { + if (!check_is_linkfile (inode, stbuf, xattr, conf->link_xattr_name)) { main_local->op_ret = -1; main_local->op_errno = ENOTEMPTY; @@ -3562,7 +4564,7 @@ dht_rmdir_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } STACK_WIND (frame, dht_rmdir_linkfile_unlink_cbk, - src, src->fops->unlink, &local->loc); + src, src->fops->unlink, &local->loc, 0, NULL); return 0; err: @@ -3585,6 +4587,8 @@ dht_rmdir_is_subvol_empty (call_frame_t *frame, xlator_t *this, call_frame_t *lookup_frame = NULL; dht_local_t *lookup_local = NULL; dht_local_t *local = NULL; + dict_t *xattrs = NULL; + dht_conf_t *conf = this->private; local = frame->local; @@ -3593,7 +4597,8 @@ dht_rmdir_is_subvol_empty (call_frame_t *frame, xlator_t *this, continue; if (strcmp (trav->d_name, "..") == 0) continue; - 
if (check_is_linkfile (NULL, (&trav->d_stat), NULL) == 1) { + if (check_is_linkfile (NULL, (&trav->d_stat), trav->dict, + conf->link_xattr_name)) { ret++; continue; } @@ -3605,6 +4610,21 @@ dht_rmdir_is_subvol_empty (call_frame_t *frame, xlator_t *this, return 0; } + xattrs = dict_new (); + if (!xattrs) { + gf_log (this->name, GF_LOG_ERROR, "dict_new failed"); + return -1; + } + + ret = dict_set_uint32 (xattrs, conf->link_xattr_name, 256); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "failed to set linkto key" + " in dict"); + if (xattrs) + dict_unref (xattrs); + return -1; + } + list_for_each_entry (trav, &entries->list, list) { if (strcmp (trav->d_name, ".") == 0) continue; @@ -3621,8 +4641,7 @@ dht_rmdir_is_subvol_empty (call_frame_t *frame, xlator_t *this, goto err; } - lookup_local = GF_CALLOC (sizeof (*local), 1, - gf_dht_mt_dht_local_t); + lookup_local = mem_get0 (this->local_pool); if (!lookup_local) { goto err; } @@ -3635,6 +4654,8 @@ dht_rmdir_is_subvol_empty (call_frame_t *frame, xlator_t *this, if (build_ret != 0) goto err; + uuid_copy (lookup_local->loc.gfid, trav->d_stat.ia_gfid); + gf_log (this->name, GF_LOG_TRACE, "looking up %s on %s", lookup_local->loc.path, src->name); @@ -3647,12 +4668,18 @@ dht_rmdir_is_subvol_empty (call_frame_t *frame, xlator_t *this, STACK_WIND (lookup_frame, dht_rmdir_lookup_cbk, src, src->fops->lookup, - &lookup_local->loc, NULL); + &lookup_local->loc, xattrs); ret++; } + if (xattrs) + dict_unref (xattrs); + return ret; err: + if (xattrs) + dict_unref (xattrs); + DHT_STACK_DESTROY (lookup_frame); return 0; } @@ -3660,7 +4687,8 @@ err: int dht_rmdir_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, gf_dirent_t *entries) + int op_ret, int op_errno, gf_dirent_t *entries, + dict_t *xdata) { dht_local_t *local = NULL; int this_call_cnt = -1; @@ -3704,12 +4732,14 @@ dht_rmdir_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int dht_rmdir_opendir_cbk (call_frame_t 
*frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, fd_t *fd) + int op_ret, int op_errno, fd_t *fd, dict_t *xdata) { dht_local_t *local = NULL; int this_call_cnt = -1; call_frame_t *prev = NULL; - + dict_t *dict = NULL; + int ret = 0; + dht_conf_t *conf = this->private; local = frame->local; prev = cookie; @@ -3719,14 +4749,32 @@ dht_rmdir_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, "opendir on %s for %s failed (%s)", prev->this->name, local->loc.path, strerror (op_errno)); + if (op_errno != ENOENT) { + local->op_ret = -1; + local->op_errno = op_errno; + } + goto err; + } + + dict = dict_new (); + if (!dict) { local->op_ret = -1; - local->op_errno = op_errno; + local->op_errno = ENOMEM; goto err; } + ret = dict_set_uint32 (dict, conf->link_xattr_name, 256); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to set '%s' key", + local->loc.path, conf->link_xattr_name); + STACK_WIND (frame, dht_rmdir_readdirp_cbk, prev->this, prev->this->fops->readdirp, - local->fd, 4096, 0); + local->fd, 4096, 0, dict); + + if (dict) + dict_unref (dict); return 0; @@ -3742,7 +4790,8 @@ err: int -dht_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags) +dht_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + dict_t *xdata) { dht_local_t *local = NULL; dht_conf_t *conf = NULL; @@ -3766,6 +4815,7 @@ dht_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags) local->call_cnt = conf->subvolume_cnt; local->op_ret = 0; + local->fop_succeeded = 0; local->flags = flags; @@ -3780,7 +4830,7 @@ dht_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags) STACK_WIND (frame, dht_rmdir_opendir_cbk, conf->subvolumes[i], conf->subvolumes[i]->fops->opendir, - loc, local->fd); + loc, local->fd, NULL); } return 0; @@ -3788,17 +4838,17 @@ dht_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags) err: op_errno = (op_errno == -1) ? 
errno : op_errno; DHT_STACK_UNWIND (rmdir, frame, -1, op_errno, - NULL, NULL); + NULL, NULL, NULL); return 0; } int dht_entrylk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) + xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - DHT_STACK_UNWIND (entrylk, frame, op_ret, op_errno); + DHT_STACK_UNWIND (entrylk, frame, op_ret, op_errno, xdata); return 0; } @@ -3806,7 +4856,7 @@ dht_entrylk_cbk (call_frame_t *frame, void *cookie, int dht_entrylk (call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, const char *basename, - entrylk_cmd cmd, entrylk_type type) + entrylk_cmd cmd, entrylk_type type, dict_t *xdata) { xlator_t *subvol = NULL; int op_errno = -1; @@ -3816,7 +4866,6 @@ dht_entrylk (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (loc, err); VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); local = dht_local_init (frame, loc, NULL, GF_FOP_ENTRYLK); if (!local) { @@ -3836,13 +4885,13 @@ dht_entrylk (call_frame_t *frame, xlator_t *this, STACK_WIND (frame, dht_entrylk_cbk, subvol, subvol->fops->entrylk, - volume, loc, basename, cmd, type); + volume, loc, basename, cmd, type, xdata); return 0; err: op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (entrylk, frame, -1, op_errno); + DHT_STACK_UNWIND (entrylk, frame, -1, op_errno, NULL); return 0; } @@ -3850,10 +4899,10 @@ err: int dht_fentrylk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) + xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - DHT_STACK_UNWIND (fentrylk, frame, op_ret, op_errno); + DHT_STACK_UNWIND (fentrylk, frame, op_ret, op_errno, xdata); /* pass the reply dict up, as dht_entrylk_cbk does */ return 0; } @@ -3861,7 +4910,7 @@ dht_fentrylk_cbk (call_frame_t *frame, void *cookie, int dht_fentrylk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, const char *basename, - entrylk_cmd cmd, entrylk_type type) + entrylk_cmd cmd, entrylk_type type, dict_t *xdata) { xlator_t *subvol = NULL; int op_errno = -1; @@ -3880,13 +4929,13 @@ dht_fentrylk (call_frame_t *frame, xlator_t *this, STACK_WIND (frame, dht_fentrylk_cbk, subvol, subvol->fops->fentrylk, - volume, fd, basename, cmd, type); + volume, fd, basename, cmd, type, xdata); return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (fentrylk, frame, -1, op_errno); + DHT_STACK_UNWIND (fentrylk, frame, -1, op_errno, NULL); return 0; } @@ -3895,16 +4944,21 @@ err: int dht_forget (xlator_t *this, inode_t *inode) { - uint64_t tmp_layout = 0; + uint64_t ctx_int = 0; + dht_inode_ctx_t *ctx = NULL; dht_layout_t *layout = NULL; - inode_ctx_del (inode, this, &tmp_layout); + inode_ctx_del (inode, this, &ctx_int); - if (!tmp_layout) + if (!ctx_int) return 0; - layout = (dht_layout_t *)(long)tmp_layout; + ctx = (dht_inode_ctx_t *) (long) ctx_int; + + layout = ctx->layout; + ctx->layout = NULL; dht_layout_unref (this, layout); + GF_FREE (ctx); return 0; } @@ -3913,16 +4967,22 @@ dht_forget (xlator_t *this, inode_t *inode) int dht_notify (xlator_t *this, int event, void *data, ...)
{ - xlator_t *subvol = NULL; - int cnt = -1; - int i = -1; - dht_conf_t *conf = NULL; - int ret = -1; - int propagate = 0; + xlator_t *subvol = NULL; + int cnt = -1; + int i = -1; + dht_conf_t *conf = NULL; + int ret = -1; + int propagate = 0; + + int had_heard_from_all = 0; + int have_heard_from_all = 0; + struct timeval time = {0,}; + gf_defrag_info_t *defrag = NULL; + dict_t *dict = NULL; + gf_defrag_type cmd = 0; + dict_t *output = NULL; + va_list ap; - int had_heard_from_all = 0; - int have_heard_from_all = 0; - struct timeval time = {0,}; conf = this->private; if (!conf) @@ -3984,7 +5044,11 @@ dht_notify (xlator_t *this, int event, void *data, ...) if (conf->assert_no_child_down) { gf_log (this->name, GF_LOG_WARNING, "Received CHILD_DOWN. Exiting"); - exit(0); + if (conf->defrag) { + gf_defrag_stop (conf->defrag, NULL); + } else { + kill (getpid(), SIGTERM); + } } for (i = 0; i < conf->subvolume_cnt; i++) { @@ -4035,6 +5099,36 @@ dht_notify (xlator_t *this, int event, void *data, ...) UNLOCK (&conf->subvolume_lock); break; + case GF_EVENT_VOLUME_DEFRAG: + { + if (!conf->defrag) { + return ret; + } + defrag = conf->defrag; + + dict = data; + va_start (ap, data); + output = va_arg (ap, dict_t*); + va_end (ap); /* close the va_list before any of the return paths below */ + ret = dict_get_int32 (dict, "rebalance-command", + (int32_t*)&cmd); + if (ret) + return ret; + LOCK (&defrag->lock); + { + if (defrag->is_exiting) + goto unlock; + if (cmd == GF_DEFRAG_CMD_STATUS) + gf_defrag_status_get (defrag, output); + else if (cmd == GF_DEFRAG_CMD_STOP) + gf_defrag_stop (defrag, output); + } +unlock: + UNLOCK (&defrag->lock); + return 0; + break; + } + default: propagate = 1; break; @@ -4050,9 +5144,12 @@ dht_notify (xlator_t *this, int event, void *data, ...) /* if all subvols have reported status, no need to hide anything or wait for anything else.
Just propagate blindly */ - if (have_heard_from_all) + if (have_heard_from_all) { propagate = 1; + } + + if (!had_heard_from_all && have_heard_from_all) { /* This is the first event which completes aggregation of events from all subvolumes. If at least one subvol @@ -4071,6 +5168,19 @@ dht_notify (xlator_t *this, int event, void *data, ...) /* continue to check other events for CHILD_UP */ } } + + /* rebalance is started with assert_no_child_down. So we do + * not need to handle CHILD_DOWN event here. + */ + if (conf->defrag) { + ret = gf_thread_create (&conf->defrag->th, NULL, + gf_defrag_start, this); + if (ret) { + GF_FREE (conf->defrag); /* free first: clearing the pointer beforehand freed NULL and leaked the struct */ + conf->defrag = NULL; + kill (getpid(), SIGTERM); + } + } } ret = 0; @@ -4079,3 +5189,24 @@ dht_notify (xlator_t *this, int event, void *data, ...) return ret; } + +int +dht_inode_ctx_layout_get (inode_t *inode, xlator_t *this, dht_layout_t **layout) +{ + dht_inode_ctx_t *ctx = NULL; + int ret = -1; + + ret = dht_inode_ctx_get (inode, this, &ctx); + + if (!ret && ctx) { + if (ctx->layout) { + if (layout) + *layout = ctx->layout; + ret = 0; + } else { + ret = -1; + } + } + + return ret; +} diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h index d79ed9556..5ccd66799 100644 --- a/xlators/cluster/dht/src/dht-common.h +++ b/xlators/cluster/dht/src/dht-common.h @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef _CONFIG_H @@ -22,6 +13,8 @@ #include "config.h" #endif +#include <regex.h> + #include "dht-mem-types.h" #include "libxlator.h" #include "syncop.h" @@ -29,7 +22,7 @@ #ifndef _DHT_H #define _DHT_H -#define GF_XATTR_FIX_LAYOUT_KEY "trusted.distribute.fix.layout" +#define GF_XATTR_FIX_LAYOUT_KEY "distribute.fix.layout" #define GF_DHT_LOOKUP_UNHASHED_ON 1 #define GF_DHT_LOOKUP_UNHASHED_AUTO 2 #define DHT_PATHINFO_HEADER "DISTRIBUTE:" @@ -38,7 +31,8 @@ typedef int (*dht_selfheal_dir_cbk_t) (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno); + int32_t op_ret, int32_t op_errno, + dict_t *xdata); typedef int (*dht_defrag_cbk_fn_t) (xlator_t *this, call_frame_t *frame, int ret); @@ -61,20 +55,38 @@ struct dht_layout { uint32_t start; uint32_t stop; xlator_t *xlator; - } list[0]; + } list[]; }; typedef struct dht_layout dht_layout_t; +struct dht_stat_time { + uint32_t atime; + uint32_t atime_nsec; + uint32_t ctime; + uint32_t ctime_nsec; + uint32_t mtime; + uint32_t mtime_nsec; +}; + +typedef struct dht_stat_time dht_stat_time_t; + +struct dht_inode_ctx { + dht_layout_t *layout; + dht_stat_time_t time; +}; + +typedef struct dht_inode_ctx dht_inode_ctx_t; + typedef enum { DHT_HASH_TYPE_DM, + DHT_HASH_TYPE_DM_USER, } dht_hashfn_type_t; /* rebalance related */ struct dht_rebalance_ { xlator_t *from_subvol; xlator_t *target_node; - int32_t wbflags; off_t offset; size_t size; int32_t flags; @@ -83,6 +95,7 @@ struct dht_rebalance_ { struct iovec *vector; struct iatt stbuf; 
dht_defrag_cbk_fn_t target_op_fn; + dict_t *xdata; }; struct dht_local { @@ -117,6 +130,7 @@ struct dht_local { int file_count; int dir_count; call_frame_t *main_frame; + int fop_succeeded; struct { fop_mknod_cbk_t linkfile_cbk; struct iatt stbuf; @@ -128,7 +142,6 @@ struct dht_local { struct { uint32_t hole_cnt; uint32_t overlaps_cnt; - uint32_t missing; uint32_t down; uint32_t misc; dht_selfheal_dir_cbk_t dir_cbk; @@ -141,11 +154,16 @@ struct dht_local { int32_t flags; mode_t mode; dev_t rdev; + mode_t umask; /* need for file-info */ - char *pathinfo; + char *xattr_val; char *key; + /* which xattr request? */ + char xsel[256]; + int32_t alloc_len; + char *newpath; /* gfid related */ @@ -161,18 +179,77 @@ struct dht_local { glusterfs_fop_t fop; + gf_boolean_t linked; + xlator_t *link_subvol; + struct dht_rebalance_ rebalance; + xlator_t *first_up_subvol; + }; typedef struct dht_local dht_local_t; /* du - disk-usage */ struct dht_du { double avail_percent; + double avail_inodes; uint64_t avail_space; uint32_t log; }; typedef struct dht_du dht_du_t; +enum gf_defrag_type { + GF_DEFRAG_CMD_START = 1, + GF_DEFRAG_CMD_STOP = 1 + 1, + GF_DEFRAG_CMD_STATUS = 1 + 2, + GF_DEFRAG_CMD_START_LAYOUT_FIX = 1 + 3, + GF_DEFRAG_CMD_START_FORCE = 1 + 4, +}; +typedef enum gf_defrag_type gf_defrag_type; + +enum gf_defrag_status_t { + GF_DEFRAG_STATUS_NOT_STARTED, + GF_DEFRAG_STATUS_STARTED, + GF_DEFRAG_STATUS_STOPPED, + GF_DEFRAG_STATUS_COMPLETE, + GF_DEFRAG_STATUS_FAILED, + GF_DEFRAG_STATUS_LAYOUT_FIX_STARTED, + GF_DEFRAG_STATUS_LAYOUT_FIX_STOPPED, + GF_DEFRAG_STATUS_LAYOUT_FIX_COMPLETE, + GF_DEFRAG_STATUS_LAYOUT_FIX_FAILED, +}; +typedef enum gf_defrag_status_t gf_defrag_status_t; + +typedef struct gf_defrag_pattern_list gf_defrag_pattern_list_t; + +struct gf_defrag_pattern_list { + char path_pattern[256]; + uint64_t size; + gf_defrag_pattern_list_t *next; +}; + +struct gf_defrag_info_ { + uint64_t total_files; + uint64_t total_data; + uint64_t num_files_lookedup; + uint64_t 
total_failures; + uint64_t skipped; + gf_lock_t lock; + int cmd; + pthread_t th; + gf_defrag_status_t defrag_status; + struct rpc_clnt *rpc; + uint32_t connected; + uint32_t is_exiting; + pid_t pid; + inode_t *root_inode; + uuid_t node_uuid; + struct timeval start_time; + gf_boolean_t stats; + gf_defrag_pattern_list_t *defrag_pattern; +}; + +typedef struct gf_defrag_info_ gf_defrag_info_t; + struct dht_conf { gf_lock_t subvolume_lock; int subvolume_cnt; @@ -184,7 +261,8 @@ struct dht_conf { gf_boolean_t search_unhashed; int gen; dht_du_t *du_stats; - uint64_t min_free_disk; + double min_free_disk; + double min_free_inodes; char disk_unit; int32_t refresh_interval; gf_boolean_t unhashed_sticky_bit; @@ -201,10 +279,28 @@ struct dht_conf { /* Will be a global flag to control the layout spread count */ uint32_t dir_spread_cnt; - struct syncenv *env; /* The env pointer to the rebalance synctask */ - /* to keep track of nodes which are decomissioned */ xlator_t **decommissioned_bricks; + int decommission_in_progress; + int decommission_subvols_cnt; + + /* defrag related */ + gf_defrag_info_t *defrag; + + /* Request to filter directory entries in readdir request */ + + gf_boolean_t readdir_optimize; + + /* Support regex-based name reinterpretation. */ + regex_t rsync_regex; + gf_boolean_t rsync_regex_valid; + regex_t extra_regex; + gf_boolean_t extra_regex_valid; + + /* Support variable xattr names. 
*/ + char *xattr_name; + char *link_xattr_name; + char *wild_xattr_name; }; typedef struct dht_conf dht_conf_t; @@ -219,29 +315,28 @@ struct dht_disk_layout { }; typedef struct dht_disk_layout dht_disk_layout_t; +typedef enum { + GF_DHT_MIGRATE_DATA, + GF_DHT_MIGRATE_DATA_EVEN_IF_LINK_EXISTS, + GF_DHT_MIGRATE_HARDLINK, + GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS +} gf_dht_migrate_data_type_t; #define ENTRY_MISSING(op_ret, op_errno) (op_ret == -1 && op_errno == ENOENT) -#define is_fs_root(loc) (strcmp (loc->path, "/") == 0) - -#define is_revalidate(loc) (inode_ctx_get (loc->inode, this, NULL) == 0) +#define is_revalidate(loc) (dht_inode_ctx_layout_get (loc->inode, this, NULL) == 0) #define is_last_call(cnt) (cnt == 0) #define DHT_MIGRATION_IN_PROGRESS 1 #define DHT_MIGRATION_COMPLETED 2 -#define DHT_LINKFILE_KEY "trusted.glusterfs.dht.linkto" #define DHT_LINKFILE_MODE (S_ISVTX) -#define check_is_linkfile(i,s,x) ( \ +#define check_is_linkfile(i,s,x,n) ( \ ((st_mode_from_ia ((s)->ia_prot, (s)->ia_type) & ~S_IFMT) \ - == DHT_LINKFILE_MODE) && \ - dict_get (x, DHT_LINKFILE_KEY)) - -#define check_is_linkfile_wo_dict(i,s) ( \ - ((st_mode_from_ia ((s)->ia_prot, (s)->ia_type) & ~S_IFMT) \ - == DHT_LINKFILE_MODE)) + == DHT_LINKFILE_MODE) && \ + dict_get (x, n)) #define IS_DHT_MIGRATION_PHASE2(buf) ( \ IA_ISREG ((buf)->ia_type) && \ @@ -286,6 +381,25 @@ typedef struct dht_disk_layout dht_disk_layout_t; dht_local_wipe (__xl, __local); \ } while (0) +#define DHT_UPDATE_TIME(ctx_sec, ctx_nsec, new_sec, new_nsec, inode, post) do {\ + int32_t sec = 0; \ + sec = new_sec; \ + LOCK (&inode->lock); \ + { \ + new_sec = max(new_sec, ctx_sec); \ + if (sec < new_sec) \ + new_nsec = ctx_nsec; \ + if (sec == new_sec) \ + new_nsec = max (new_nsec, ctx_nsec); \ + if (post) { \ + ctx_sec = new_sec; \ + ctx_nsec = new_nsec; \ + } \ + } \ + UNLOCK (&inode->lock); \ + } while (0) + +#define is_greater_time(a, an, b, bn) (((a) < (b)) || (((a) == (b)) && ((an) < (bn)))) dht_layout_t *dht_layout_new 
(xlator_t *this, int cnt); dht_layout_t *dht_layout_get (xlator_t *this, inode_t *inode); dht_layout_t *dht_layout_for_subvol (xlator_t *this, xlator_t *subvol); @@ -295,7 +409,7 @@ int dht_layout_normalize (xlator_t *this, l int dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout, uint32_t *holes_p, uint32_t *overlaps_p, uint32_t *missing_p, uint32_t *down_p, - uint32_t *misc_p); + uint32_t *misc_p, uint32_t *no_space_p); int dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, loc_t *loc, dict_t *xattr); @@ -311,7 +425,7 @@ int dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, int dht_disk_layout_extract (xlator_t *this, dht_layout_t *layout, int pos, int32_t **disk_layout_p); int dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout, - int pos, void *disk_layout_raw); + int pos, void *disk_layout_raw, int disk_layout_len); int dht_frame_return (call_frame_t *frame); @@ -329,12 +443,14 @@ int dht_iatt_merge (xlator_t *this, struct iatt *to, struct iatt xlator_t *dht_subvol_get_hashed (xlator_t *this, loc_t *loc); xlator_t *dht_subvol_get_cached (xlator_t *this, inode_t *inode); xlator_t *dht_subvol_next (xlator_t *this, xlator_t *prev); +xlator_t *dht_subvol_next_available (xlator_t *this, xlator_t *prev); int dht_subvol_cnt (xlator_t *this, xlator_t *subvol); -int dht_hash_compute (int type, const char *name, uint32_t *hash_p); +int dht_hash_compute (xlator_t *this, int type, const char *name, uint32_t *hash_p); int dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk, - xlator_t *tovol, xlator_t *fromvol, loc_t *loc); + xlator_t *this, xlator_t *tovol, + xlator_t *fromvol, loc_t *loc); int dht_lookup_directory (call_frame_t *frame, xlator_t *this, loc_t *loc); int dht_lookup_everywhere (call_frame_t *frame, xlator_t *this, loc_t *loc); int @@ -349,17 +465,15 @@ dht_selfheal_restore (call_frame_t *frame, dht_selfheal_dir_cbk_t cbk, int dht_layout_sort_volname 
(dht_layout_t *layout); -int dht_rename (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc); - int dht_get_du_info (call_frame_t *frame, xlator_t *this, loc_t *loc); -int dht_is_subvol_filled (xlator_t *this, xlator_t *subvol); -xlator_t *dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol); +gf_boolean_t dht_is_subvol_filled (xlator_t *this, xlator_t *subvol); +xlator_t *dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol, + dht_local_t *local); int dht_get_du_info_for_subvol (xlator_t *this, int subvol_idx); int dht_layout_preset (xlator_t *this, xlator_t *subvol, inode_t *inode); int dht_layout_set (xlator_t *this, inode_t *inode, dht_layout_t *layout); void dht_layout_unref (xlator_t *this, dht_layout_t *layout); dht_layout_t *dht_layout_ref (xlator_t *this, dht_layout_t *layout); xlator_t *dht_first_up_subvol (xlator_t *this); @@ -374,7 +488,8 @@ int dht_rename_cleanup (call_frame_t *frame) int dht_rename_links_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent); + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata); int dht_fix_directory_layout (call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk, @@ -397,73 +512,73 @@ int32_t dht_lookup (call_frame_t *frame, int32_t dht_stat (call_frame_t *frame, xlator_t *this, - loc_t *loc); + loc_t *loc, dict_t *xdata); int32_t dht_fstat (call_frame_t *frame, xlator_t *this, - fd_t *fd); + fd_t *fd, dict_t *xdata); int32_t dht_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, - off_t offset); + off_t offset, dict_t *xdata); int32_t dht_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, - off_t offset); + off_t offset, dict_t *xdata); int32_t dht_access (call_frame_t *frame, xlator_t *this, loc_t *loc, - int32_t mask); + int32_t mask, dict_t *xdata);
int32_t dht_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc, - size_t size); + size_t size, dict_t *xdata); -int32_t dht_mknod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dev_t rdev, dict_t *params); +int32_t dht_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, + mode_t mode, dev_t rdev, mode_t umask, dict_t *xdata); int32_t dht_mkdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dict_t *params); + loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata); int32_t dht_unlink (call_frame_t *frame, xlator_t *this, - loc_t *loc); + loc_t *loc, int xflag, dict_t *xdata); int32_t dht_rmdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, int flags); + loc_t *loc, int flags, dict_t *xdata); int32_t dht_symlink (call_frame_t *frame, xlator_t *this, - const char *linkpath, loc_t *loc, dict_t *params); + const char *linkpath, loc_t *loc, mode_t umask, + dict_t *xdata); int32_t dht_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, - loc_t *newloc); + loc_t *newloc, dict_t *xdata); int32_t dht_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, - loc_t *newloc); + loc_t *newloc, dict_t *xdata); int32_t dht_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, mode_t mode, - fd_t *fd, dict_t *params); + mode_t umask, fd_t *fd, dict_t *params); int32_t dht_open (call_frame_t *frame, xlator_t *this, loc_t *loc, - int32_t flags, fd_t *fd, - int32_t wbflags); + int32_t flags, fd_t *fd, dict_t *xdata); int32_t dht_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset); + off_t offset, uint32_t flags, dict_t *xdata); int32_t dht_writev (call_frame_t *frame, xlator_t *this, @@ -471,107 +586,121 @@ int32_t dht_writev (call_frame_t *frame, struct iovec *vector, int32_t count, off_t offset, - struct iobref *iobref); + uint32_t flags, + struct iobref *iobref, dict_t *xdata); int32_t dht_flush (call_frame_t *frame, xlator_t *this, - fd_t *fd); + fd_t *fd, dict_t *xdata); 
int32_t dht_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t datasync); + int32_t datasync, dict_t *xdata); int32_t dht_opendir (call_frame_t *frame, xlator_t *this, - loc_t *loc, fd_t *fd); + loc_t *loc, fd_t *fd, dict_t *xdata); int32_t dht_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t datasync); + int32_t datasync, dict_t *xdata); int32_t dht_statfs (call_frame_t *frame, xlator_t *this, - loc_t *loc); + loc_t *loc, dict_t *xdata); int32_t dht_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, - int32_t flags); + int32_t flags, dict_t *xdata); int32_t dht_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - const char *name); + const char *name, dict_t *xdata); int32_t dht_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, - int32_t flags); + int32_t flags, dict_t *xdata); int32_t dht_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, - const char *name); + const char *name, dict_t *xdata); int32_t dht_removexattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - const char *name); + const char *name, dict_t *xdata); +int32_t dht_fremovexattr (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + const char *name, dict_t *xdata); int32_t dht_lk (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, - struct gf_flock *flock); + struct gf_flock *flock, dict_t *xdata); int32_t dht_inodelk (call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, int32_t cmd, - struct gf_flock *flock); + struct gf_flock *flock, dict_t *xdata); int32_t dht_finodelk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, int32_t cmd, - struct gf_flock *flock); + struct gf_flock *flock, dict_t *xdata); int32_t dht_entrylk (call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, const char *basename, - entrylk_cmd cmd, entrylk_type type); + entrylk_cmd cmd, entrylk_type type, dict_t *xdata); int32_t dht_fentrylk (call_frame_t *frame, xlator_t *this, 
const char *volume, fd_t *fd, const char *basename, - entrylk_cmd cmd, entrylk_type type); + entrylk_cmd cmd, entrylk_type type, dict_t *xdata); int32_t dht_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, - size_t size, off_t off); + size_t size, off_t off, dict_t *xdata); int32_t dht_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, - size_t size, off_t off); + size_t size, off_t off, dict_t *dict); int32_t dht_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, gf_xattrop_flags_t flags, - dict_t *dict); + dict_t *dict, dict_t *xdata); int32_t dht_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, gf_xattrop_flags_t flags, - dict_t *dict); + dict_t *dict, dict_t *xdata); int32_t dht_forget (xlator_t *this, inode_t *inode); int32_t dht_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - struct iatt *stbuf, int32_t valid); + struct iatt *stbuf, int32_t valid, dict_t *xdata); int32_t dht_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, - struct iatt *stbuf, int32_t valid); - + struct iatt *stbuf, int32_t valid, dict_t *xdata); +int32_t dht_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, + int32_t mode, off_t offset, size_t len, dict_t *xdata); +int32_t dht_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, + off_t offset, size_t len, dict_t *xdata); +int32_t dht_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, + off_t offset, size_t len, dict_t *xdata); + +int32_t dht_init (xlator_t *this); +void dht_fini (xlator_t *this); +int dht_reconfigure (xlator_t *this, dict_t *options); int32_t dht_notify (xlator_t *this, int32_t event, void *data, ...); /* definitions for nufa/switch */ @@ -594,12 +723,65 @@ int dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int dht_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, fd_t *fd, inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent); + struct iatt *preparent, struct iatt 
*postparent, + dict_t *xdata); int dht_newfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, struct iatt *preparent, - struct iatt *postparent); + struct iatt *postparent, dict_t *xdata); + +int +gf_defrag_status_get (gf_defrag_info_t *defrag, dict_t *dict); + +int +gf_defrag_stop (gf_defrag_info_t *defrag, dict_t *output); +void* +gf_defrag_start (void *this); +int32_t +gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, dict_t *xattrs, + struct iatt *stbuf); +int +dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, + int flag); +int +dht_inode_ctx_layout_get (inode_t *inode, xlator_t *this, + dht_layout_t **layout_int); +int +dht_inode_ctx_layout_set (inode_t *inode, xlator_t *this, + dht_layout_t* layout_int); +int +dht_inode_ctx_time_update (inode_t *inode, xlator_t *this, struct iatt *stat, + int32_t update_ctx); + +int dht_inode_ctx_get (inode_t *inode, xlator_t *this, dht_inode_ctx_t **ctx); +int dht_inode_ctx_set (inode_t *inode, xlator_t *this, dht_inode_ctx_t *ctx); +int +dht_dir_attr_heal (void *data); +int +dht_dir_attr_heal_done (int ret, call_frame_t *sync_frame, void *data); +int +dht_dir_has_layout (dict_t *xattr, char *name); +gf_boolean_t +dht_is_subvol_in_layout (dht_layout_t *layout, xlator_t *xlator); +xlator_t * +dht_subvol_with_free_space_inodes (xlator_t *this, xlator_t *subvol, + dht_layout_t *layout); +xlator_t * +dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol, + dht_layout_t *layout); +int +dht_linkfile_attr_heal (call_frame_t *frame, xlator_t *this); + +void +dht_layout_dump (dht_layout_t *layout, const char *prefix); +int32_t +dht_priv_dump (xlator_t *this); +int32_t +dht_inodectx_dump (xlator_t *this, inode_t *inode); + +int +dht_inode_ctx_get1 (xlator_t *this, inode_t *inode, xlator_t **subvol); #endif/* _DHT_H */ diff --git a/xlators/cluster/dht/src/dht-diskusage.c b/xlators/cluster/dht/src/dht-diskusage.c index 
0b8c116ca..fe3955ecb 100644 --- a/xlators/cluster/dht/src/dht-diskusage.c +++ b/xlators/cluster/dht/src/dht-diskusage.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ @@ -35,227 +26,389 @@ int dht_du_info_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct statvfs *statvfs) + int op_ret, int op_errno, struct statvfs *statvfs, + dict_t *xdata) { - dht_conf_t *conf = NULL; - call_frame_t *prev = NULL; - int this_call_cnt = 0; - int i = 0; - double percent = 0; - uint64_t bytes = 0; - - conf = this->private; - prev = cookie; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "failed to get disk info from %s", prev->this->name); - goto out; - } - - if (statvfs && statvfs->f_blocks) { - percent = (statvfs->f_bavail * 100) / statvfs->f_blocks; - bytes = (statvfs->f_bavail * statvfs->f_frsize); - } - - LOCK (&conf->subvolume_lock); - { - for (i = 0; i < conf->subvolume_cnt; i++) - if (prev->this == conf->subvolumes[i]) { - conf->du_stats[i].avail_percent = percent; - conf->du_stats[i].avail_space = bytes; - gf_log (this->name, GF_LOG_TRACE, - "on subvolume '%s': avail_percent is: " - "%.2f and avail_space is: %"PRIu64"", - prev->this->name, - conf->du_stats[i].avail_percent, - conf->du_stats[i].avail_space); - } - } - UNLOCK (&conf->subvolume_lock); + dht_conf_t *conf = NULL; + call_frame_t *prev = NULL; + int this_call_cnt = 0; + int i = 0; + double percent = 0; + double percent_inodes = 0; + uint64_t bytes = 0; + + conf = this->private; + prev = cookie; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "failed to get disk info from %s", prev->this->name); + goto out; + } + + if (statvfs && statvfs->f_blocks) { + percent = (statvfs->f_bavail * 100) / statvfs->f_blocks; + bytes = (statvfs->f_bavail * statvfs->f_frsize); + } + + if (statvfs && statvfs->f_files) { + percent_inodes = (statvfs->f_ffree * 100) / statvfs->f_files; + } else { + /* set percent inodes to 100 for dynamically allocated inode filesystems + this logic holds good so that, distribute has nothing to worry about + total inodes rather let the 'create()' to be scheduled on the hashed + subvol regardless 
of the total inodes. since we have no awareness on + loosing inodes this logic fits well + */ + percent_inodes = 100; + } + + LOCK (&conf->subvolume_lock); + { + for (i = 0; i < conf->subvolume_cnt; i++) + if (prev->this == conf->subvolumes[i]) { + conf->du_stats[i].avail_percent = percent; + conf->du_stats[i].avail_space = bytes; + conf->du_stats[i].avail_inodes = percent_inodes; + gf_log (this->name, GF_LOG_DEBUG, + "on subvolume '%s': avail_percent is: " + "%.2f and avail_space is: %"PRIu64" " + "and avail_inodes is: %.2f", + prev->this->name, + conf->du_stats[i].avail_percent, + conf->du_stats[i].avail_space, + conf->du_stats[i].avail_inodes); + } + } + UNLOCK (&conf->subvolume_lock); out: - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) - DHT_STACK_DESTROY (frame); + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) + DHT_STACK_DESTROY (frame); - return 0; + return 0; } int dht_get_du_info_for_subvol (xlator_t *this, int subvol_idx) { - dht_conf_t *conf = NULL; - call_frame_t *statfs_frame = NULL; - dht_local_t *statfs_local = NULL; - call_pool_t *pool = NULL; - - conf = this->private; - pool = this->ctx->pool; - - statfs_frame = create_frame (this, pool); - if (!statfs_frame) { - goto err; - } - - /* local->fop value is not used in this case */ - statfs_local = dht_local_init (statfs_frame, NULL, NULL, - GF_FOP_MAXVALUE); - if (!statfs_local) { - goto err; - } - - loc_t tmp_loc = { .inode = NULL, - .path = "/", - }; - - statfs_local->call_cnt = 1; - STACK_WIND (statfs_frame, dht_du_info_cbk, - conf->subvolumes[subvol_idx], - conf->subvolumes[subvol_idx]->fops->statfs, - &tmp_loc); - - return 0; + dht_conf_t *conf = NULL; + call_frame_t *statfs_frame = NULL; + dht_local_t *statfs_local = NULL; + call_pool_t *pool = NULL; + loc_t tmp_loc = {0,}; + + conf = this->private; + pool = this->ctx->pool; + + statfs_frame = create_frame (this, pool); + if (!statfs_frame) { + goto err; + } + + /* local->fop value 
is not used in this case */ + statfs_local = dht_local_init (statfs_frame, NULL, NULL, + GF_FOP_MAXVALUE); + if (!statfs_local) { + goto err; + } + + /* make it root gfid, should be enough to get the proper info back */ + tmp_loc.gfid[15] = 1; + + statfs_local->call_cnt = 1; + STACK_WIND (statfs_frame, dht_du_info_cbk, + conf->subvolumes[subvol_idx], + conf->subvolumes[subvol_idx]->fops->statfs, + &tmp_loc, NULL); + + return 0; err: - if (statfs_frame) - DHT_STACK_DESTROY (statfs_frame); + if (statfs_frame) + DHT_STACK_DESTROY (statfs_frame); - return -1; + return -1; } int dht_get_du_info (call_frame_t *frame, xlator_t *this, loc_t *loc) { - int i = 0; - dht_conf_t *conf = NULL; - call_frame_t *statfs_frame = NULL; - dht_local_t *statfs_local = NULL; - struct timeval tv = {0,}; + int i = 0; + dht_conf_t *conf = NULL; + call_frame_t *statfs_frame = NULL; + dht_local_t *statfs_local = NULL; + struct timeval tv = {0,}; + loc_t tmp_loc = {0,}; + + conf = this->private; + + gettimeofday (&tv, NULL); + + /* make it root gfid, should be enough to get the proper + info back */ + tmp_loc.gfid[15] = 1; + + if (tv.tv_sec > (conf->refresh_interval + + conf->last_stat_fetch.tv_sec)) { + + statfs_frame = copy_frame (frame); + if (!statfs_frame) { + goto err; + } + + /* In this case, 'local->fop' is not used */ + statfs_local = dht_local_init (statfs_frame, loc, NULL, + GF_FOP_MAXVALUE); + if (!statfs_local) { + goto err; + } + + statfs_local->call_cnt = conf->subvolume_cnt; + for (i = 0; i < conf->subvolume_cnt; i++) { + STACK_WIND (statfs_frame, dht_du_info_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->statfs, + &tmp_loc, NULL); + } + + conf->last_stat_fetch.tv_sec = tv.tv_sec; + } + return 0; +err: + if (statfs_frame) + DHT_STACK_DESTROY (statfs_frame); - conf = this->private; + return -1; +} - gettimeofday (&tv, NULL); - if (tv.tv_sec > (conf->refresh_interval - + conf->last_stat_fetch.tv_sec)) { - statfs_frame = copy_frame (frame); - if (!statfs_frame) { - goto 
err; - } +gf_boolean_t +dht_is_subvol_filled (xlator_t *this, xlator_t *subvol) +{ + int i = 0; + dht_conf_t *conf = NULL; + gf_boolean_t subvol_filled_inodes = _gf_false; + gf_boolean_t subvol_filled_space = _gf_false; + gf_boolean_t is_subvol_filled = _gf_false; + + conf = this->private; + + /* Check for values above specified percent or free disk */ + LOCK (&conf->subvolume_lock); + { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (subvol == conf->subvolumes[i]) { + if (conf->disk_unit == 'p') { + if (conf->du_stats[i].avail_percent < + conf->min_free_disk) { + subvol_filled_space = _gf_true; + break; + } + + } else { + if (conf->du_stats[i].avail_space < + conf->min_free_disk) { + subvol_filled_space = _gf_true; + break; + } + } + if (conf->du_stats[i].avail_inodes < + conf->min_free_inodes) { + subvol_filled_inodes = _gf_true; + break; + } + } + } + } + UNLOCK (&conf->subvolume_lock); + + if (subvol_filled_space && conf->subvolume_status[i]) { + if (!(conf->du_stats[i].log++ % (GF_UNIVERSAL_ANSWER * 10))) { + gf_log (this->name, GF_LOG_WARNING, + "disk space on subvolume '%s' is getting " + "full (%.2f %%), consider adding more nodes", + subvol->name, + (100 - conf->du_stats[i].avail_percent)); + } + } + + if (subvol_filled_inodes && conf->subvolume_status[i]) { + if (!(conf->du_stats[i].log++ % (GF_UNIVERSAL_ANSWER * 10))) { + gf_log (this->name, GF_LOG_CRITICAL, + "inodes on subvolume '%s' are at " + "(%.2f %%), consider adding more nodes", + subvol->name, + (100 - conf->du_stats[i].avail_inodes)); + } + } + + is_subvol_filled = (subvol_filled_space || subvol_filled_inodes); + + return is_subvol_filled; +} - /* In this case, 'local->fop' is not used */ - statfs_local = dht_local_init (statfs_frame, loc, NULL, - GF_FOP_MAXVALUE); - if (!statfs_local) { - goto err; - } - loc_t tmp_loc = { .inode = NULL, - .path = "/", - }; +/*Get the best subvolume to create the file in*/ +xlator_t * +dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol, + 
dht_local_t *local) +{ + xlator_t *avail_subvol = NULL; + dht_conf_t *conf = NULL; + dht_layout_t *layout = NULL; + loc_t *loc = NULL; - statfs_local->call_cnt = conf->subvolume_cnt; - for (i = 0; i < conf->subvolume_cnt; i++) { - STACK_WIND (statfs_frame, dht_du_info_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->statfs, - &tmp_loc); + conf = this->private; + if (!local) + goto out; + loc = &local->loc; + if (!local->layout) { + layout = dht_layout_get (this, loc->parent); + + if (!layout) { + gf_log (this->name, GF_LOG_DEBUG, + "layout missing path=%s parent=%s", + loc->path, uuid_utoa (loc->parent->gfid)); + goto out; } - - conf->last_stat_fetch.tv_sec = tv.tv_sec; + } else { + layout = dht_layout_ref (this, local->layout); } - return 0; -err: - if (statfs_frame) - DHT_STACK_DESTROY (statfs_frame); - return -1; + LOCK (&conf->subvolume_lock); + { + avail_subvol = dht_subvol_with_free_space_inodes(this, subvol, + layout); + if(!avail_subvol) + { + avail_subvol = dht_subvol_maxspace_nonzeroinode(this, + subvol, + layout); + } + + } + UNLOCK (&conf->subvolume_lock); +out: + if (!avail_subvol) { + gf_log (this->name, + GF_LOG_DEBUG, + "no subvolume has enough free space and/or inodes\ + to create"); + avail_subvol = subvol; + } + + if (layout) + dht_layout_unref (this, layout); + return avail_subvol; } +static inline +int32_t dht_subvol_has_err (xlator_t *this, dht_layout_t *layout) +{ + int ret = -1; + int i = 0; -int -dht_is_subvol_filled (xlator_t *this, xlator_t *subvol) + if (!this || !layout) + goto out; + + /* check if subvol has layout errors, before selecting it */ + for (i = 0; i < layout->cnt; i++) { + if (!strcmp (layout->list[i].xlator->name, this->name) && + (layout->list[i].err != 0)) { + ret = -1; + goto out; + } + } + ret = 0; +out: + return ret; +} + +/*Get subvolume which has both space and inodes more than the min criteria*/ +xlator_t * +dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol, + dht_layout_t *layout) { - int 
i = 0; - int subvol_filled = 0; + int i = 0; + double max = 0; + double max_inodes = 0; + int ignore_subvol = 0; + + xlator_t *avail_subvol = NULL; dht_conf_t *conf = NULL; conf = this->private; - /* Check for values above specified percent or free disk */ - LOCK (&conf->subvolume_lock); - { - for (i = 0; i < conf->subvolume_cnt; i++) { - if (subvol == conf->subvolumes[i]) { - if (conf->disk_unit == 'p') { - if (conf->du_stats[i].avail_percent < - conf->min_free_disk) { - subvol_filled = 1; - break; - } - } else { - if (conf->du_stats[i].avail_space < - conf->min_free_disk) { - subvol_filled = 1; - break; - } - } + for(i=0; i < conf->subvolume_cnt; i++) { + /* check if subvol has layout errors, before selecting it */ + ignore_subvol = dht_subvol_has_err (conf->subvolumes[i], + layout); + if (ignore_subvol) + continue; + + if ((conf->disk_unit == 'p') && + (conf->du_stats[i].avail_percent > conf->min_free_disk) && + (conf->du_stats[i].avail_inodes > conf->min_free_inodes)) { + if ((conf->du_stats[i].avail_inodes > max_inodes) || + (conf->du_stats[i].avail_percent > max)) { + max = conf->du_stats[i].avail_percent; + max_inodes = conf->du_stats[i].avail_inodes; + avail_subvol = conf->subvolumes[i]; } } - } - UNLOCK (&conf->subvolume_lock); - - if (subvol_filled && conf->subvolume_status[i]) { - if (!(conf->du_stats[i].log++ % (GF_UNIVERSAL_ANSWER * 10))) { - gf_log (this->name, GF_LOG_WARNING, - "disk space on subvolume '%s' is getting " - "full (%.2f %%), consider adding more nodes", - subvol->name, - (100 - conf->du_stats[i].avail_percent)); + + if ((conf->disk_unit != 'p') && + (conf->du_stats[i].avail_space > conf->min_free_disk) && + (conf->du_stats[i].avail_inodes > conf->min_free_inodes)) { + if ((conf->du_stats[i].avail_inodes > max_inodes) || + (conf->du_stats[i].avail_space > max)) { + max = conf->du_stats[i].avail_space; + max_inodes = conf->du_stats[i].avail_inodes; + avail_subvol = conf->subvolumes[i]; + } } } - return subvol_filled; + return 
avail_subvol; } + +/* Get subvol which has atleast one inode and maximum space */ xlator_t * -dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol) +dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol, + dht_layout_t *layout) { int i = 0; - double max= 0; + double max = 0; + int ignore_subvol = 0; + xlator_t *avail_subvol = NULL; dht_conf_t *conf = NULL; conf = this->private; - LOCK (&conf->subvolume_lock); - { - for (i = 0; i < conf->subvolume_cnt; i++) { - if (conf->disk_unit == 'p') { - if (conf->du_stats[i].avail_percent > max) { - max = conf->du_stats[i].avail_percent; - avail_subvol = conf->subvolumes[i]; - } - } else { - if (conf->du_stats[i].avail_space > max) { - max = conf->du_stats[i].avail_space; - avail_subvol = conf->subvolumes[i]; - } + for (i = 0; i < conf->subvolume_cnt; i++) { + /* check if subvol has layout errors, before selecting it */ + ignore_subvol = dht_subvol_has_err (conf->subvolumes[i], + layout); + if (ignore_subvol) + continue; + + if (conf->disk_unit == 'p') { + if ((conf->du_stats[i].avail_percent > max) + && (conf->du_stats[i].avail_inodes > 0 )) { + max = conf->du_stats[i].avail_percent; + avail_subvol = conf->subvolumes[i]; } - } - } - UNLOCK (&conf->subvolume_lock); - - if (!avail_subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no subvolume has enough free space to create"); + } else { + if ((conf->du_stats[i].avail_space > max) + && (conf->du_stats[i].avail_inodes > 0)) { + max = conf->du_stats[i].avail_space; + avail_subvol = conf->subvolumes[i]; + } + } } - if (max < conf->min_free_disk) - avail_subvol = subvol; - - if (!avail_subvol) - avail_subvol = subvol; - return avail_subvol; } diff --git a/xlators/cluster/dht/src/dht-hashfn.c b/xlators/cluster/dht/src/dht-hashfn.c index c8ae74172..656cf23a0 100644 --- a/xlators/cluster/dht/src/dht-hashfn.c +++ b/xlators/cluster/dht/src/dht-hashfn.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. 
<http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ #ifndef _CONFIG_H @@ -37,6 +28,7 @@ dht_hash_compute_internal (int type, const char *name, uint32_t *hash_p) switch (type) { case DHT_HASH_TYPE_DM: + case DHT_HASH_TYPE_DM_USER: hash = gf_dm_hashfn (name, strlen (name)); break; default: @@ -52,30 +44,68 @@ dht_hash_compute_internal (int type, const char *name, uint32_t *hash_p) } -#define MAKE_RSYNC_FRIENDLY_NAME(rsync_frndly_name, name) do { \ - rsync_frndly_name = (char *) name; \ - if (name[0] == '.') { \ - char *dot = 0; \ - int namelen = 0; \ - \ - dot = strrchr (name, '.'); \ - if (dot && dot > (name + 1) && *(dot + 1)) { \ - namelen = (dot - name); \ - rsync_frndly_name = alloca (namelen); \ - strncpy (rsync_frndly_name, name + 1, \ - namelen); \ - rsync_frndly_name[namelen - 1] = 0; \ - } \ - } \ - } while (0); +static inline +gf_boolean_t +dht_munge_name (const char *original, char *modified, size_t len, regex_t *re) +{ + regmatch_t matches[2]; + size_t new_len; + + if (regexec(re,original,2,matches,0) != REG_NOMATCH) { + if (matches[1].rm_so != -1) { + new_len = matches[1].rm_eo - matches[1].rm_so; + /* Equal would fail due to the NUL at the end. */ + if (new_len < len) { + memcpy (modified,original+matches[1].rm_so, + new_len); + modified[new_len] = '\0'; + return _gf_true; + } + } + } + /* This is guaranteed safe because of how the dest was allocated. */ + strcpy(modified,original); + return _gf_false; +} int -dht_hash_compute (int type, const char *name, uint32_t *hash_p) +dht_hash_compute (xlator_t *this, int type, const char *name, uint32_t *hash_p) { - char *rsync_friendly_name = NULL; + char *rsync_friendly_name = NULL; + dht_conf_t *priv = this->private; + size_t len = 0; + gf_boolean_t munged = _gf_false; + + /* + * It wouldn't be safe to use alloca in an inline function that doesn't + * actually get inlined, and it wouldn't be efficient to do a real + * allocation, so we use alloca here (if needed) and pass that to the + * inline. 
+ */ + + if (priv->extra_regex_valid) { + len = strlen(name) + 1; + rsync_friendly_name = alloca(len); + munged = dht_munge_name (name, rsync_friendly_name, len, + &priv->extra_regex); + } + + if (!munged && priv->rsync_regex_valid) { + len = strlen(name) + 1; + rsync_friendly_name = alloca(len); + gf_log (this->name, GF_LOG_TRACE, "trying regex for %s", name); + munged = dht_munge_name (name, rsync_friendly_name, len, + &priv->rsync_regex); + if (munged) { + gf_log (this->name, GF_LOG_DEBUG, + "munged down to %s", rsync_friendly_name); + } + } - MAKE_RSYNC_FRIENDLY_NAME (rsync_friendly_name, name); + if (!munged) { + rsync_friendly_name = (char *)name; + } return dht_hash_compute_internal (type, rsync_friendly_name, hash_p); } diff --git a/xlators/cluster/dht/src/dht-helper.c b/xlators/cluster/dht/src/dht-helper.c index d8138067e..311a48112 100644 --- a/xlators/cluster/dht/src/dht-helper.c +++ b/xlators/cluster/dht/src/dht-helper.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. 
+ This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef _CONFIG_H @@ -27,6 +18,28 @@ #include "xlator.h" #include "dht-common.h" +static inline int +dht_inode_ctx_set1 (xlator_t *this, inode_t *inode, xlator_t *subvol) +{ + uint64_t tmp_subvol = 0; + + tmp_subvol = (long)subvol; + return inode_ctx_set1 (inode, this, &tmp_subvol); +} + +int +dht_inode_ctx_get1 (xlator_t *this, inode_t *inode, xlator_t **subvol) +{ + int ret = -1; + uint64_t tmp_subvol = 0; + + ret = inode_ctx_get1 (inode, this, &tmp_subvol); + if (tmp_subvol && subvol) + *subvol = (xlator_t *)tmp_subvol; + + return ret; +} + int dht_frame_return (call_frame_t *frame) @@ -49,6 +62,43 @@ dht_frame_return (call_frame_t *frame) } +static uint64_t +dht_bits_for (uint64_t num) +{ + uint64_t bits = 0, ctrl = 1; + + while (ctrl < num) { + ctrl *= 2; + bits ++; + } + + return bits; +} + +/* + * A slightly "updated" version of the algorithm described in the commit log + * is used here. + * + * The only enhancement is that: + * + * - The number of bits used by the backend filesystem for HUGE d_off which + * is described as 63, and + * - The number of bits used by the d_off presented by the transformation + * upwards which is described as 64, are both made "configurable." 
+ */ + + +#define BACKEND_D_OFF_BITS 63 +#define PRESENT_D_OFF_BITS 63 + +#define ONE 1ULL +#define MASK (~0ULL) +#define PRESENT_MASK (MASK >> (64 - PRESENT_D_OFF_BITS)) +#define BACKEND_MASK (MASK >> (64 - BACKEND_D_OFF_BITS)) + +#define TOP_BIT (ONE << (PRESENT_D_OFF_BITS - 1)) +#define SHIFT_BITS (max (0, (BACKEND_D_OFF_BITS - PRESENT_D_OFF_BITS + 1))) + int dht_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y_p) { @@ -56,6 +106,9 @@ dht_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y_p) int cnt = 0; int max = 0; uint64_t y = 0; + uint64_t hi_mask = 0; + uint64_t off_mask = 0; + int max_bits = 0; if (x == ((uint64_t) -1)) { y = (uint64_t) -1; @@ -69,7 +122,23 @@ dht_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y_p) max = conf->subvolume_cnt; cnt = dht_subvol_cnt (this, subvol); - y = ((x * max) + cnt); + if (max == 1) { + y = x; + goto out; + } + + max_bits = dht_bits_for (max); + + hi_mask = ~(PRESENT_MASK >> (max_bits + 1)); + + if (x & hi_mask) { + /* HUGE d_off */ + off_mask = MASK << max_bits; + y = TOP_BIT | ((x >> SHIFT_BITS) & off_mask) | cnt; + } else { + /* small d_off */ + y = ((x * max) + cnt); + } out: if (y_p) @@ -89,7 +158,7 @@ dht_filter_loc_subvol_key (xlator_t *this, loc_t *loc, loc_t *new_loc, int ret = 0; /* not found */ /* Why do other tasks if first required 'char' itself is not there */ - if (loc->name && !strchr (loc->name, '@')) + if (!new_loc || !loc || !loc->name || !strchr (loc->name, '@')) goto out; trav = this->children; @@ -117,7 +186,6 @@ dht_filter_loc_subvol_key (xlator_t *this, loc_t *loc, loc_t *new_loc, new_loc->path = ((new_path) ? 
new_path: gf_strdup (loc->path)); new_loc->name = new_name; - new_loc->ino = loc->ino; new_loc->inode = inode_ref (loc->inode); new_loc->parent = inode_ref (loc->parent); } @@ -130,10 +198,8 @@ dht_filter_loc_subvol_key (xlator_t *this, loc_t *loc, loc_t *new_loc, out: if (!ret) { /* !success */ - if (new_path) - GF_FREE (new_path); - if (new_name) - GF_FREE (new_name); + GF_FREE (new_path); + GF_FREE (new_name); } return ret; } @@ -147,16 +213,38 @@ dht_deitransform (xlator_t *this, uint64_t y, xlator_t **subvol_p, int max = 0; uint64_t x = 0; xlator_t *subvol = 0; + int max_bits = 0; + uint64_t off_mask = 0; + uint64_t host_mask = 0; if (!this->private) - goto out; + return -1; conf = this->private; max = conf->subvolume_cnt; - cnt = y % max; - x = y / max; + if (max == 1) { + x = y; + cnt = 0; + goto out; + } + + if (y & TOP_BIT) { + /* HUGE d_off */ + max_bits = dht_bits_for (max); + off_mask = (MASK << max_bits); + host_mask = ~(off_mask); + + x = ((y & ~TOP_BIT) & off_mask) << SHIFT_BITS; + + cnt = y & host_mask; + } else { + /* small d_off */ + cnt = y % max; + x = y / max; + } +out: subvol = conf->subvolumes[cnt]; if (subvol_p) @@ -165,7 +253,6 @@ dht_deitransform (xlator_t *this, uint64_t y, xlator_t **subvol_p, if (x_p) *x_p = x; -out: return 0; } @@ -216,21 +303,16 @@ dht_local_wipe (xlator_t *this, dht_local_t *local) local->selfheal.layout = NULL; } - if (local->newpath) { - GF_FREE (local->newpath); - } + GF_FREE (local->newpath); - if (local->key) { - GF_FREE (local->key); - } + GF_FREE (local->key); - if (local->rebalance.vector) - GF_FREE (local->rebalance.vector); + GF_FREE (local->rebalance.vector); if (local->rebalance.iobref) iobref_unref (local->rebalance.iobref); - GF_FREE (local); + mem_put (local); } @@ -241,8 +323,7 @@ dht_local_init (call_frame_t *frame, loc_t *loc, fd_t *fd, glusterfs_fop_t fop) inode_t *inode = NULL; int ret = 0; - /* TODO: use mem-pool */ - local = GF_CALLOC (1, sizeof (*local), gf_dht_mt_dht_local_t); + local = 
mem_get0 (THIS->local_pool); if (!local) goto out; @@ -275,26 +356,12 @@ dht_local_init (call_frame_t *frame, loc_t *loc, fd_t *fd, glusterfs_fop_t fop) out: if (ret) { if (local) - GF_FREE (local); + mem_put (local); local = NULL; } return local; } - -char * -basestr (const char *str) -{ - char *basestr = NULL; - - basestr = strrchr (str, '/'); - if (basestr) - basestr ++; - - return basestr; -} - - xlator_t * dht_first_up_subvol (xlator_t *this) { @@ -359,17 +426,23 @@ dht_subvol_get_hashed (xlator_t *this, loc_t *loc) dht_layout_t *layout = NULL; xlator_t *subvol = NULL; - if (is_fs_root (loc)) { + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO (this->name, loc, out); + + if (__is_root_gfid (loc->gfid)) { subvol = dht_first_up_subvol (this); goto out; } + GF_VALIDATE_OR_GOTO (this->name, loc->parent, out); + GF_VALIDATE_OR_GOTO (this->name, loc->name, out); + layout = dht_layout_get (this, loc->parent); if (!layout) { gf_log (this->name, GF_LOG_DEBUG, - "layout missing path=%s parent=%"PRId64, - loc->path, loc->parent->ino); + "layout missing path=%s parent=%s", + loc->path, uuid_utoa (loc->parent->gfid)); goto out; } @@ -397,6 +470,8 @@ dht_subvol_get_cached (xlator_t *this, inode_t *inode) dht_layout_t *layout = NULL; xlator_t *subvol = NULL; + GF_VALIDATE_OR_GOTO (this->name, this, out); + GF_VALIDATE_OR_GOTO (this->name, inode, out); layout = dht_layout_get (this, inode); @@ -438,7 +513,36 @@ out: return next; } +/* This func wraps around, if prev is actually the last subvol. + */ +xlator_t * +dht_subvol_next_available (xlator_t *this, xlator_t *prev) +{ + dht_conf_t *conf = NULL; + int i = 0; + xlator_t *next = NULL; + + conf = this->private; + if (!conf) + goto out; + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvolumes[i] == prev) { + /* if prev is last in conf->subvolumes, then wrap + * around. 
+ */ + if ((i + 1) < conf->subvolume_cnt) { + next = conf->subvolumes[i + 1]; + } else { + next = conf->subvolumes[0]; + } + break; + } + } + +out: + return next; +} int dht_subvol_cnt (xlator_t *this, xlator_t *subvol) { @@ -467,6 +571,15 @@ out: (a) = (b); \ } while (0) + +#define set_if_greater_time(a, an, b, bn) do { \ + if (((a) < (b)) || (((a) == (b)) && ((an) < (bn)))){ \ + (a) = (b); \ + (an) = (bn); \ + } \ + } while (0) \ + + int dht_iatt_merge (xlator_t *this, struct iatt *to, struct iatt *from, xlator_t *subvol) @@ -490,9 +603,12 @@ dht_iatt_merge (xlator_t *this, struct iatt *to, set_if_greater (to->ia_uid, from->ia_uid); set_if_greater (to->ia_gid, from->ia_gid); - set_if_greater (to->ia_atime, from->ia_atime); - set_if_greater (to->ia_mtime, from->ia_mtime); - set_if_greater (to->ia_ctime, from->ia_ctime); + set_if_greater_time(to->ia_atime, to->ia_atime_nsec, + from->ia_atime, from->ia_atime_nsec); + set_if_greater_time (to->ia_mtime, to->ia_mtime_nsec, + from->ia_mtime, from->ia_mtime_nsec); + set_if_greater_time (to->ia_ctime, to->ia_ctime_nsec, + from->ia_ctime, from->ia_ctime_nsec); return 0; } @@ -618,20 +734,36 @@ dht_migration_complete_check_task (void *data) call_frame_t *frame = NULL; loc_t tmp_loc = {0,}; char *path = NULL; + dht_conf_t *conf = NULL; + inode_t *inode = NULL; + fd_t *iter_fd = NULL; + uint64_t tmp_subvol = 0; + int open_failed = 0; this = THIS; frame = data; local = frame->local; + conf = this->private; src_node = local->cached_subvol; - /* getxattr on cached_subvol for 'linkto' value */ - if (!local->loc.inode) + if (!local->loc.inode && !local->fd) + goto out; + + inode = (!local->fd) ? local->loc.inode : local->fd->inode; + + /* getxattr on cached_subvol for 'linkto' value. Do path based getxattr + * as root:root. 
If a fd is already open, access check wont be done*/ + + if (!local->loc.inode) { ret = syncop_fgetxattr (src_node, local->fd, &dict, - DHT_LINKFILE_KEY); - else + conf->link_xattr_name); + } else { + SYNCTASK_SETID (0, 0); ret = syncop_getxattr (src_node, &local->loc, &dict, - DHT_LINKFILE_KEY); + conf->link_xattr_name); + SYNCTASK_SETID (frame->root->uid, frame->root->gid); + } if (!ret) dst_node = dht_linkfile_subvol (this, NULL, NULL, dict); @@ -682,10 +814,7 @@ dht_migration_complete_check_task (void *data) /* update inode ctx (the layout) */ dht_layout_unref (this, local->layout); - if (!local->loc.inode) - ret = dht_layout_preset (this, dst_node, local->fd->inode); - else - ret = dht_layout_preset (this, dst_node, local->loc.inode); + ret = dht_layout_preset (this, dst_node, inode); if (ret != 0) { gf_log (this->name, GF_LOG_DEBUG, "%s: could not set preset layout for subvol %s", @@ -703,10 +832,7 @@ dht_migration_complete_check_task (void *data) goto out; } - if (!local->loc.inode) - ret = dht_layout_set (this, local->fd->inode, layout); - else - ret = dht_layout_set (this, local->loc.inode, layout); + ret = dht_layout_set (this, inode, layout); if (ret) { gf_log (this->name, GF_LOG_ERROR, "%s: failed to set the new layout", @@ -717,43 +843,46 @@ dht_migration_complete_check_task (void *data) local->cached_subvol = dst_node; ret = 0; - if (!local->fd) + /* once we detect the migration complete, the inode-ctx2 is no more + required.. delete the ctx and also, it means, open() already + done on all the fd of inode */ + ret = inode_ctx_reset1 (inode, this, &tmp_subvol); + if (tmp_subvol) goto out; - /* once we detect the migration complete, the fd-ctx is no more - required.. 
delete the ctx, and do one extra 'fd_unref' for open fd */ - ret = fd_ctx_del (local->fd, this, NULL); - if (!ret) { - fd_unref (local->fd); - ret = 0; + if (list_empty (&inode->fd_list)) goto out; - } - /* if 'local->fd' (ie, fd based operation), send a 'open()' on - destination if not already done */ - if (local->loc.inode) { - ret = syncop_open (dst_node, &local->loc, - local->fd->flags, local->fd); - } else { - tmp_loc.inode = local->fd->inode; - inode_path (local->fd->inode, NULL, &path); - if (path) - tmp_loc.path = path; - ret = syncop_open (dst_node, &tmp_loc, - local->fd->flags, local->fd); - if (path) - GF_FREE (path); + /* perform open as root:root. There is window between linkfile + * creation(root:root) and setattr with the correct uid/gid + */ + SYNCTASK_SETID(0, 0); + + /* perform 'open()' on all the fd's present on the inode */ + tmp_loc.inode = inode; + inode_path (inode, NULL, &path); + if (path) + tmp_loc.path = path; + list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { + if (fd_is_anonymous (iter_fd)) + continue; + ret = syncop_open (dst_node, &tmp_loc, + iter_fd->flags, iter_fd); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, "failed to open " + "the fd (%p, flags=0%o) on file %s @ %s", + iter_fd, iter_fd->flags, path, dst_node->name); + open_failed = 1; + } } - if (ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "%s: failed to send open() on target file at %s", - local->loc.path, dst_node->name); + GF_FREE (path); + + SYNCTASK_SETID (frame->root->uid, frame->root->gid); + if (open_failed) { + ret = -1; goto out; } - - /* need this unref for the fd on src_node */ - fd_unref (local->fd); ret = 0; out: @@ -764,11 +893,8 @@ int dht_rebalance_complete_check (xlator_t *this, call_frame_t *frame) { int ret = -1; - dht_conf_t *conf = NULL; - - conf = this->private; - ret = synctask_new (conf->env, dht_migration_complete_check_task, + ret = synctask_new (this->ctx->env, dht_migration_complete_check_task, 
dht_migration_complete_check_done, frame, frame); return ret; @@ -800,20 +926,34 @@ dht_rebalance_inprogress_task (void *data) char *path = NULL; struct iatt stbuf = {0,}; loc_t tmp_loc = {0,}; + dht_conf_t *conf = NULL; + inode_t *inode = NULL; + fd_t *iter_fd = NULL; + int open_failed = 0; this = THIS; frame = data; local = frame->local; + conf = this->private; src_node = local->cached_subvol; - /* getxattr on cached_subvol for 'linkto' value */ - if (local->loc.inode) + if (!local->loc.inode && !local->fd) + goto out; + + inode = (!local->fd) ? local->loc.inode : local->fd->inode; + + /* getxattr on cached_subvol for 'linkto' value. Do path based getxattr + * as root:root. If a fd is already open, access check wont be done*/ + if (local->loc.inode) { + SYNCTASK_SETID (0, 0); ret = syncop_getxattr (src_node, &local->loc, &dict, - DHT_LINKFILE_KEY); - else + conf->link_xattr_name); + SYNCTASK_SETID (frame->root->uid, frame->root->gid); + } else { ret = syncop_fgetxattr (src_node, local->fd, &dict, - DHT_LINKFILE_KEY); + conf->link_xattr_name); + } if (ret) { gf_log (this->name, GF_LOG_ERROR, @@ -855,34 +995,46 @@ dht_rebalance_inprogress_task (void *data) ret = 0; - if (!local->fd) - goto out; + if (list_empty (&inode->fd_list)) + goto done; + + /* perform open as root:root. 
There is window between linkfile + * creation(root:root) and setattr with the correct uid/gid + */ + SYNCTASK_SETID (0, 0); + + tmp_loc.inode = inode; + inode_path (inode, NULL, &path); + if (path) + tmp_loc.path = path; + + list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { + if (fd_is_anonymous (iter_fd)) + continue; - if (local->loc.inode) { - ret = syncop_open (dst_node, &local->loc, - local->fd->flags, local->fd); - } else { - tmp_loc.inode = local->fd->inode; - inode_path (local->fd->inode, NULL, &path); - if (path) - tmp_loc.path = path; ret = syncop_open (dst_node, &tmp_loc, - local->fd->flags, local->fd); - if (path) - GF_FREE (path); + iter_fd->flags, iter_fd); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, "failed to send open " + "the fd (%p, flags=0%o) on file %s @ %s", + iter_fd, iter_fd->flags, path, dst_node->name); + open_failed = 1; + } } + GF_FREE (path); - if (ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "%s: failed to send open() on target file at %s", - local->loc.path, dst_node->name); + SYNCTASK_SETID (frame->root->uid, frame->root->gid); + + if (open_failed) { + ret = -1; goto out; } - ret = fd_ctx_set (local->fd, this, (uint64_t)(long)dst_node); +done: + ret = dht_inode_ctx_set1 (this, inode, dst_node); if (ret) { gf_log (this->name, GF_LOG_ERROR, - "%s: failed to set fd-ctx target file at %s", + "%s: failed to set inode-ctx target file at %s", local->loc.path, dst_node->name); goto out; } @@ -897,12 +1049,99 @@ dht_rebalance_in_progress_check (xlator_t *this, call_frame_t *frame) { int ret = -1; - dht_conf_t *conf = NULL; - conf = this->private; - - ret = synctask_new (conf->env, dht_rebalance_inprogress_task, + ret = synctask_new (this->ctx->env, dht_rebalance_inprogress_task, dht_inprogress_check_done, frame, frame); return ret; } + +int +dht_inode_ctx_layout_set (inode_t *inode, xlator_t *this, + dht_layout_t *layout_int) +{ + dht_inode_ctx_t *ctx = NULL; + int ret = -1; + + ret = dht_inode_ctx_get (inode, this, 
&ctx); + if (!ret && ctx) { + ctx->layout = layout_int; + } else { + ctx = GF_CALLOC (1, sizeof (*ctx), gf_dht_mt_inode_ctx_t); + if (!ctx) + return ret; + ctx->layout = layout_int; + } + + ret = dht_inode_ctx_set (inode, this, ctx); + + return ret; +} + +int +dht_inode_ctx_time_update (inode_t *inode, xlator_t *this, struct iatt *stat, + int32_t post) +{ + dht_inode_ctx_t *ctx = NULL; + dht_stat_time_t *time = 0; + int ret = -1; + + GF_VALIDATE_OR_GOTO (this->name, stat, out); + GF_VALIDATE_OR_GOTO (this->name, inode, out); + + ret = dht_inode_ctx_get (inode, this, &ctx); + + if (ret) { + ctx = GF_CALLOC (1, sizeof (*ctx), gf_dht_mt_inode_ctx_t); + if (!ctx) + return -1; + } + + time = &ctx->time; + + DHT_UPDATE_TIME(time->mtime, time->mtime_nsec, + stat->ia_mtime, stat->ia_mtime_nsec, inode, post); + DHT_UPDATE_TIME(time->ctime, time->ctime_nsec, + stat->ia_ctime, stat->ia_ctime_nsec, inode, post); + DHT_UPDATE_TIME(time->atime, time->atime_nsec, + stat->ia_atime, stat->ia_atime_nsec, inode, post); + + ret = dht_inode_ctx_set (inode, this, ctx); +out: + return 0; +} + +int +dht_inode_ctx_get (inode_t *inode, xlator_t *this, dht_inode_ctx_t **ctx) +{ + int ret = -1; + uint64_t ctx_int = 0; + + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO (this->name, inode, out); + + ret = inode_ctx_get (inode, this, &ctx_int); + + if (ret) + return ret; + + if (ctx) + *ctx = (dht_inode_ctx_t *) ctx_int; +out: + return ret; +} + +int dht_inode_ctx_set (inode_t *inode, xlator_t *this, dht_inode_ctx_t *ctx) +{ + int ret = -1; + uint64_t ctx_int = 0; + + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO (this->name, inode, out); + GF_VALIDATE_OR_GOTO (this->name, ctx, out); + + ctx_int = (long)ctx; + ret = inode_ctx_set (inode, this, &ctx_int); +out: + return ret; +} diff --git a/xlators/cluster/dht/src/dht-inode-read.c b/xlators/cluster/dht/src/dht-inode-read.c index d6a2cf369..ece84151a 100644 --- a/xlators/cluster/dht/src/dht-inode-read.c +++ 
b/xlators/cluster/dht/src/dht-inode-read.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ #ifndef _CONFIG_H @@ -34,7 +25,7 @@ int dht_fsync2 (xlator_t *this, call_frame_t *frame, int ret); int dht_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, fd_t *fd) + int op_ret, int op_errno, fd_t *fd, dict_t *xdata) { dht_local_t *local = NULL; call_frame_t *prev = NULL; @@ -61,7 +52,7 @@ dht_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, return 0; out: - DHT_STACK_UNWIND (open, frame, op_ret, op_errno, local->fd); + DHT_STACK_UNWIND (open, frame, op_ret, op_errno, local->fd, xdata); return 0; } @@ -86,17 +77,17 @@ dht_open2 (xlator_t *this, call_frame_t *frame, int op_ret) STACK_WIND (frame, dht_open_cbk, subvol, subvol->fops->open, &local->loc, local->rebalance.flags, local->fd, - local->rebalance.wbflags); + NULL); return 0; out: - DHT_STACK_UNWIND (stat, frame, -1, op_errno, NULL); + DHT_STACK_UNWIND (stat, frame, -1, op_errno, NULL, NULL); return 0; } int dht_open (call_frame_t *frame, xlator_t *this, - loc_t *loc, int flags, fd_t *fd, int wbflags) + loc_t *loc, int flags, fd_t *fd, dict_t *xdata) { xlator_t *subvol = NULL; int op_errno = -1; @@ -120,30 +111,30 @@ dht_open (call_frame_t *frame, xlator_t *this, goto err; } - local->rebalance.wbflags = wbflags; local->rebalance.flags = flags; local->call_cnt = 1; STACK_WIND (frame, dht_open_cbk, subvol, subvol->fops->open, - loc, flags, fd, wbflags); + loc, flags, fd, xdata); return 0; err: op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (open, frame, -1, op_errno, NULL); + DHT_STACK_UNWIND (open, frame, -1, op_errno, NULL, NULL); return 0; } int dht_file_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct iatt *stbuf) + int op_ret, int op_errno, struct iatt *stbuf, dict_t *xdata) { - uint64_t tmp_subvol = 0; + xlator_t *subvol = 0; dht_local_t *local = NULL; call_frame_t *prev = NULL; int ret = -1; + inode_t *inode = NULL; GF_VALIDATE_OR_GOTO ("dht", frame, err); GF_VALIDATE_OR_GOTO ("dht", this, out); @@ -164,26 +155,28 @@ dht_file_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (local->call_cnt != 1) goto out; + local->op_errno = op_errno; /* Check if the rebalance phase2 is true */ if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (stbuf)) { - if (local->fd) - ret = fd_ctx_get (local->fd, this, &tmp_subvol); - if (ret) { + inode = (local->fd) ? local->fd->inode : local->loc.inode; + ret = dht_inode_ctx_get1 (this, inode, &subvol); + if (!subvol) { /* Phase 2 of migration */ local->rebalance.target_op_fn = dht_attr2; ret = dht_rebalance_complete_check (this, frame); + if (!ret) + return 0; } else { /* value is already set in fd_ctx, that means no need to check for whether its complete or not. 
*/ dht_attr2 (this, frame, 0); - } - if (!ret) return 0; + } } out: DHT_STRIP_PHASE1_FLAGS (stbuf); - DHT_STACK_UNWIND (stat, frame, op_ret, op_errno, stbuf); + DHT_STACK_UNWIND (stat, frame, op_ret, op_errno, stbuf, xdata); err: return 0; } @@ -208,21 +201,21 @@ dht_attr2 (xlator_t *this, call_frame_t *frame, int op_ret) if (local->fop == GF_FOP_FSTAT) { STACK_WIND (frame, dht_file_attr_cbk, subvol, - subvol->fops->fstat, local->fd); + subvol->fops->fstat, local->fd, NULL); } else { STACK_WIND (frame, dht_file_attr_cbk, subvol, - subvol->fops->stat, &local->loc); + subvol->fops->stat, &local->loc, NULL); } return 0; out: - DHT_STACK_UNWIND (stat, frame, -1, op_errno, NULL); + DHT_STACK_UNWIND (stat, frame, -1, op_errno, NULL, NULL); return 0; } int dht_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct iatt *stbuf) + int op_ret, int op_errno, struct iatt *stbuf, dict_t *xdata) { dht_local_t *local = NULL; int this_call_cnt = 0; @@ -257,20 +250,21 @@ out: this_call_cnt = dht_frame_return (frame); if (is_last_call (this_call_cnt)) { DHT_STACK_UNWIND (stat, frame, local->op_ret, local->op_errno, - &local->stbuf); + &local->stbuf, xdata); } err: return 0; } int -dht_stat (call_frame_t *frame, xlator_t *this, loc_t *loc) +dht_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { xlator_t *subvol = NULL; int op_errno = -1; dht_local_t *local = NULL; dht_layout_t *layout = NULL; int i = 0; + int call_cnt = 0; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); @@ -299,39 +293,40 @@ dht_stat (call_frame_t *frame, xlator_t *this, loc_t *loc) subvol = local->cached_subvol; STACK_WIND (frame, dht_file_attr_cbk, subvol, - subvol->fops->stat, loc); + subvol->fops->stat, loc, xdata); return 0; } - local->call_cnt = layout->cnt; + local->call_cnt = call_cnt = layout->cnt; - for (i = 0; i < layout->cnt; i++) { + for (i = 0; i < call_cnt; i++) { subvol = layout->list[i].xlator; STACK_WIND (frame, dht_attr_cbk, 
subvol, subvol->fops->stat, - loc); + loc, xdata); } return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (stat, frame, -1, op_errno, NULL); + DHT_STACK_UNWIND (stat, frame, -1, op_errno, NULL, NULL); return 0; } int -dht_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd) +dht_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { xlator_t *subvol = NULL; int op_errno = -1; dht_local_t *local = NULL; dht_layout_t *layout = NULL; int i = 0; + int call_cnt = 0; VALIDATE_OR_GOTO (frame, err); @@ -358,25 +353,25 @@ dht_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd) subvol = local->cached_subvol; STACK_WIND (frame, dht_file_attr_cbk, subvol, - subvol->fops->fstat, fd); + subvol->fops->fstat, fd, xdata); return 0; } - local->call_cnt = layout->cnt; + local->call_cnt = call_cnt = layout->cnt; - for (i = 0; i < layout->cnt; i++) { + for (i = 0; i < call_cnt; i++) { subvol = layout->list[i].xlator; STACK_WIND (frame, dht_attr_cbk, subvol, subvol->fops->fstat, - fd); + fd, xdata); } return 0; err: op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (fstat, frame, -1, op_errno, NULL); + DHT_STACK_UNWIND (fstat, frame, -1, op_errno, NULL, NULL); return 0; } @@ -385,10 +380,12 @@ int dht_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, struct iovec *vector, int count, struct iatt *stbuf, - struct iobref *iobref) + struct iobref *iobref, dict_t *xdata) { dht_local_t *local = NULL; int ret = 0; + inode_t *inode = NULL; + xlator_t *subvol = 0; local = frame->local; if (!local) { @@ -404,25 +401,27 @@ dht_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if ((op_ret == -1) && (op_errno != ENOENT)) goto out; + local->op_errno = op_errno; if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (stbuf)) { /* File would be migrated to other node */ - ret = fd_ctx_get (local->fd, this, NULL); - if (ret) { + ret = dht_inode_ctx_get1 (this, inode, &subvol); + if (!subvol) { local->rebalance.target_op_fn = dht_readv2; ret = dht_rebalance_complete_check (this, frame); + if (!ret) + return 0; } else { /* value is already set in fd_ctx, that means no need to check for whether its complete or not. 
*/ dht_readv2 (this, frame, 0); - } - if (!ret) return 0; + } } out: DHT_STRIP_PHASE1_FLAGS (stbuf); DHT_STACK_UNWIND (readv, frame, op_ret, op_errno, vector, count, stbuf, - iobref); + iobref, xdata); return 0; } @@ -446,18 +445,19 @@ dht_readv2 (xlator_t *this, call_frame_t *frame, int op_ret) subvol = local->cached_subvol; STACK_WIND (frame, dht_readv_cbk, subvol, subvol->fops->readv, - local->fd, local->rebalance.size, local->rebalance.offset); + local->fd, local->rebalance.size, local->rebalance.offset, + local->rebalance.flags, NULL); return 0; out: - DHT_STACK_UNWIND (readv, frame, -1, op_errno, NULL, 0, NULL, NULL); + DHT_STACK_UNWIND (readv, frame, -1, op_errno, NULL, 0, NULL, NULL, NULL); return 0; } int dht_readv (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t off) + fd_t *fd, size_t size, off_t off, uint32_t flags, dict_t *xdata) { xlator_t *subvol = NULL; int op_errno = -1; @@ -483,35 +483,57 @@ dht_readv (call_frame_t *frame, xlator_t *this, local->rebalance.offset = off; local->rebalance.size = size; + local->rebalance.flags = flags; local->call_cnt = 1; STACK_WIND (frame, dht_readv_cbk, subvol, subvol->fops->readv, - fd, size, off); + fd, size, off, flags, xdata); return 0; err: op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (readv, frame, -1, op_errno, NULL, 0, NULL, NULL); + DHT_STACK_UNWIND (readv, frame, -1, op_errno, NULL, 0, NULL, NULL, NULL); return 0; } int dht_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno) + int op_ret, int op_errno, dict_t *xdata) { int ret = -1; dht_local_t *local = NULL; + xlator_t *subvol = NULL; + call_frame_t *prev = NULL; local = frame->local; + prev = cookie; + if (!prev || !prev->this) + goto out; if (local->call_cnt != 1) goto out; + if ((op_ret == -1) && (op_errno == ENOTCONN) && + IA_ISDIR(local->loc.inode->ia_type)) { + + subvol = dht_subvol_next_available (this, prev->this); + if (!subvol) + goto out; + /* check if we are done with visiting every node */ + if (subvol == local->cached_subvol) { + goto out; + } + + STACK_WIND (frame, dht_access_cbk, subvol, subvol->fops->access, + &local->loc, local->rebalance.flags, NULL); + return 0; + } if ((op_ret == -1) && (op_errno == ENOENT)) { /* File would be migrated to other node */ + local->op_errno = op_errno; local->rebalance.target_op_fn = dht_access2; ret = dht_rebalance_complete_check (frame->this, frame); if (!ret) @@ -519,7 +541,7 @@ dht_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } out: - DHT_STACK_UNWIND (access, frame, op_ret, op_errno); + DHT_STACK_UNWIND (access, frame, op_ret, op_errno, xdata); return 0; } @@ -542,18 +564,19 @@ dht_access2 (xlator_t *this, call_frame_t *frame, int op_ret) subvol = local->cached_subvol; STACK_WIND (frame, dht_access_cbk, subvol, subvol->fops->access, - &local->loc, local->rebalance.flags); + &local->loc, local->rebalance.flags, NULL); return 0; out: - DHT_STACK_UNWIND (access, frame, -1, op_errno); + DHT_STACK_UNWIND (access, frame, -1, op_errno, NULL); return 0; } int -dht_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask) +dht_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask, + dict_t *xdata) { xlator_t *subvol = 
NULL; int op_errno = -1; @@ -582,13 +605,13 @@ dht_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask) } STACK_WIND (frame, dht_access_cbk, subvol, subvol->fops->access, - loc, mask); + loc, mask, xdata); return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (access, frame, -1, op_errno); + DHT_STACK_UNWIND (access, frame, -1, op_errno, NULL); return 0; } @@ -596,10 +619,11 @@ err: int dht_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno) + int op_ret, int op_errno, dict_t *xdata) { - dht_local_t *local = NULL; - int ret = -1; + dht_local_t *local = NULL; + inode_t *inode = NULL; + xlator_t *subvol = 0; local = frame->local; @@ -609,14 +633,14 @@ dht_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto out; /* If context is set, then send flush() it to the destination */ - ret = fd_ctx_get (local->fd, this, NULL); - if (!ret) { + dht_inode_ctx_get1 (this, inode, &subvol); + if (subvol) { dht_flush2 (this, frame, 0); return 0; } out: - DHT_STACK_UNWIND (flush, frame, op_ret, op_errno); + DHT_STACK_UNWIND (flush, frame, op_ret, op_errno, xdata); return 0; } @@ -626,14 +650,10 @@ dht_flush2 (xlator_t *this, call_frame_t *frame, int op_ret) { dht_local_t *local = NULL; xlator_t *subvol = NULL; - uint64_t tmp_subvol = 0; - int ret = -1; local = frame->local; - ret = fd_ctx_get (local->fd, this, &tmp_subvol); - if (!ret) - subvol = (xlator_t *)(long)tmp_subvol; + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); if (!subvol) subvol = local->cached_subvol; @@ -641,20 +661,19 @@ dht_flush2 (xlator_t *this, call_frame_t *frame, int op_ret) local->call_cnt = 2; /* This is the second attempt */ STACK_WIND (frame, dht_flush_cbk, - subvol, subvol->fops->flush, local->fd); + subvol, subvol->fops->flush, local->fd, NULL); return 0; } int -dht_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) +dht_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { 
xlator_t *subvol = NULL; int op_errno = -1; dht_local_t *local = NULL; - VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (fd, err); @@ -676,13 +695,13 @@ dht_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) local->call_cnt = 1; STACK_WIND (frame, dht_flush_cbk, - subvol, subvol->fops->flush, fd); + subvol, subvol->fops->flush, fd, xdata); return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (flush, frame, -1, op_errno); + DHT_STACK_UNWIND (flush, frame, -1, op_errno, NULL); return 0; } @@ -690,17 +709,20 @@ err: int dht_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, - int op_errno, struct iatt *prebuf, struct iatt *postbuf) + int op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) { dht_local_t *local = NULL; call_frame_t *prev = NULL; int ret = -1; + inode_t *inode = NULL; + xlator_t *subvol = 0; local = frame->local; prev = cookie; local->op_errno = op_errno; - if (op_ret == -1) { + if (op_ret == -1 && (op_errno != ENOENT)) { gf_log (this->name, GF_LOG_DEBUG, "subvolume %s returned -1 (%s)", prev->this->name, strerror (op_errno)); @@ -715,8 +737,9 @@ dht_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, goto out; } - ret = fd_ctx_get (local->fd, this, NULL); - if (ret) { + local->op_errno = op_errno; + dht_inode_ctx_get1 (this, inode, &subvol); + if (!subvol) { local->rebalance.target_op_fn = dht_fsync2; /* Check if the rebalance phase1 is true */ @@ -731,17 +754,18 @@ dht_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, if (IS_DHT_MIGRATION_PHASE2 (postbuf)) { ret = dht_rebalance_complete_check (this, frame); } + if (!ret) + return 0; } else { dht_fsync2 (this, frame, 0); - } - if (!ret) return 0; + } out: DHT_STRIP_PHASE1_FLAGS (postbuf); DHT_STRIP_PHASE1_FLAGS (prebuf); DHT_STACK_UNWIND (fsync, frame, op_ret, op_errno, - prebuf, postbuf); + prebuf, postbuf, xdata); return 0; } @@ -751,34 +775,29 @@ 
dht_fsync2 (xlator_t *this, call_frame_t *frame, int op_ret) { dht_local_t *local = NULL; xlator_t *subvol = NULL; - uint64_t tmp_subvol = 0; - int ret = -1; local = frame->local; - ret = fd_ctx_get (local->fd, this, &tmp_subvol); - if (!ret) - subvol = (xlator_t *)(long)tmp_subvol; - + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); if (!subvol) subvol = local->cached_subvol; local->call_cnt = 2; /* This is the second attempt */ STACK_WIND (frame, dht_fsync_cbk, subvol, subvol->fops->fsync, - local->fd, local->rebalance.flags); + local->fd, local->rebalance.flags, NULL); return 0; } int -dht_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync) +dht_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync, + dict_t *xdata) { xlator_t *subvol = NULL; int op_errno = -1; dht_local_t *local = NULL; - VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (fd, err); @@ -796,13 +815,13 @@ dht_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync) subvol = local->cached_subvol; STACK_WIND (frame, dht_fsync_cbk, subvol, subvol->fops->fsync, - fd, datasync); + fd, datasync, xdata); return 0; err: op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL); + DHT_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL, NULL); return 0; } @@ -813,9 +832,9 @@ err: phase 2 of migration */ int dht_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct gf_flock *flock) + int op_ret, int op_errno, struct gf_flock *flock, dict_t *xdata) { - DHT_STACK_UNWIND (lk, frame, op_ret, op_errno, flock); + DHT_STACK_UNWIND (lk, frame, op_ret, op_errno, flock, xdata); return 0; } @@ -823,7 +842,7 @@ dht_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int dht_lk (call_frame_t *frame, xlator_t *this, - fd_t *fd, int cmd, struct gf_flock *flock) + fd_t *fd, int cmd, struct gf_flock *flock, dict_t *xdata) { xlator_t *subvol = NULL; int op_errno = -1; @@ -843,13 +862,13 @@ dht_lk (call_frame_t *frame, xlator_t *this, /* TODO: for rebalance, we need to preserve the fop arguments */ STACK_WIND (frame, dht_lk_cbk, subvol, subvol->fops->lk, fd, - cmd, flock); + cmd, flock, xdata); return 0; err: op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (lk, frame, -1, op_errno, NULL); + DHT_STACK_UNWIND (lk, frame, -1, op_errno, NULL, NULL); return 0; } @@ -857,7 +876,8 @@ err: /* Symlinks are currently not migrated, so no need for any check here */ int dht_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, const char *path, struct iatt *stbuf) + int op_ret, int op_errno, const char *path, + struct iatt *stbuf, dict_t *xdata) { dht_local_t *local = NULL; @@ -872,14 +892,15 @@ dht_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, err: DHT_STRIP_PHASE1_FLAGS (stbuf); - DHT_STACK_UNWIND (readlink, frame, op_ret, op_errno, path, stbuf); + DHT_STACK_UNWIND (readlink, frame, op_ret, op_errno, path, stbuf, xdata); return 0; } int -dht_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size) +dht_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size, + dict_t *xdata) { xlator_t *subvol = NULL; int op_errno = -1; @@ -907,13 +928,13 @@ dht_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size) STACK_WIND (frame, dht_readlink_cbk, subvol, subvol->fops->readlink, - loc, size); + loc, size, xdata); return 0; err: op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (readlink, frame, -1, op_errno, NULL, NULL); + DHT_STACK_UNWIND (readlink, frame, -1, op_errno, NULL, NULL, NULL); return 0; } @@ -924,16 +945,16 @@ err: int dht_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *dict) + int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) { - DHT_STACK_UNWIND (xattrop, frame, op_ret, op_errno, dict); + DHT_STACK_UNWIND (xattrop, frame, op_ret, op_errno, dict, xdata); return 0; } int dht_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, - gf_xattrop_flags_t flags, dict_t *dict) + gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata) { xlator_t *subvol = NULL; int op_errno = -1; @@ -964,13 +985,13 @@ dht_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, STACK_WIND (frame, dht_xattrop_cbk, subvol, subvol->fops->xattrop, - loc, flags, dict); + loc, flags, dict, xdata); return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (xattrop, frame, -1, op_errno, NULL); + DHT_STACK_UNWIND (xattrop, frame, -1, op_errno, NULL, NULL); return 0; } @@ -978,16 +999,16 @@ err: int dht_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *dict) + int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) { - DHT_STACK_UNWIND (fxattrop, frame, op_ret, op_errno, dict); + DHT_STACK_UNWIND (fxattrop, frame, op_ret, op_errno, dict, xdata); return 0; } int dht_fxattrop (call_frame_t *frame, xlator_t *this, - fd_t *fd, gf_xattrop_flags_t flags, dict_t *dict) + fd_t *fd, gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata) { xlator_t *subvol = NULL; int op_errno = -1; @@ -1007,13 +1028,13 @@ dht_fxattrop (call_frame_t *frame, xlator_t *this, STACK_WIND (frame, dht_fxattrop_cbk, subvol, subvol->fops->fxattrop, - fd, flags, dict); + fd, flags, dict, xdata); return 0; err: op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (fxattrop, frame, -1, op_errno, NULL); + DHT_STACK_UNWIND (fxattrop, frame, -1, op_errno, NULL, NULL); return 0; } @@ -1021,17 +1042,17 @@ err: int dht_inodelk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) + xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - DHT_STACK_UNWIND (inodelk, frame, op_ret, op_errno); + DHT_STACK_UNWIND (inodelk, frame, op_ret, op_errno, xdata); return 0; } int32_t -dht_inodelk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, int32_t cmd, struct gf_flock *lock) +dht_inodelk (call_frame_t *frame, xlator_t *this, const char *volume, + loc_t *loc, int32_t cmd, struct gf_flock *lock, dict_t *xdata) { xlator_t *subvol = NULL; int op_errno = -1; @@ -1063,31 +1084,31 @@ dht_inodelk (call_frame_t *frame, xlator_t *this, STACK_WIND (frame, dht_inodelk_cbk, subvol, subvol->fops->inodelk, - volume, loc, cmd, lock); + volume, loc, cmd, lock, xdata); return 0; err: op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (inodelk, frame, -1, op_errno); + DHT_STACK_UNWIND (inodelk, frame, -1, op_errno, NULL); return 0; } int -dht_finodelk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) +dht_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - DHT_STACK_UNWIND (finodelk, frame, op_ret, op_errno); + DHT_STACK_UNWIND (finodelk, frame, op_ret, op_errno, xdata); return 0; } int -dht_finodelk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *lock) +dht_finodelk (call_frame_t *frame, xlator_t *this, const char *volume, + fd_t *fd, int32_t cmd, struct gf_flock *lock, dict_t *xdata) { xlator_t *subvol = NULL; int op_errno = -1; @@ -1105,16 +1126,14 @@ dht_finodelk (call_frame_t *frame, xlator_t *this, } - STACK_WIND (frame, - dht_finodelk_cbk, - subvol, subvol->fops->finodelk, - volume, fd, cmd, lock); + STACK_WIND (frame, dht_finodelk_cbk, subvol, subvol->fops->finodelk, + volume, fd, cmd, lock, xdata); return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (finodelk, frame, -1, op_errno); + DHT_STACK_UNWIND (finodelk, frame, -1, op_errno, NULL); return 0; } diff --git a/xlators/cluster/dht/src/dht-inode-write.c b/xlators/cluster/dht/src/dht-inode-write.c index f9dea75aa..4b3f3a049 100644 --- a/xlators/cluster/dht/src/dht-inode-write.c +++ b/xlators/cluster/dht/src/dht-inode-write.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. 
- - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ @@ -28,16 +19,20 @@ int dht_writev2 (xlator_t *this, call_frame_t *frame, int ret); int dht_truncate2 (xlator_t *this, call_frame_t *frame, int ret); int dht_setattr2 (xlator_t *this, call_frame_t *frame, int ret); +int dht_fallocate2(xlator_t *this, call_frame_t *frame, int op_ret); +int dht_discard2(xlator_t *this, call_frame_t *frame, int op_ret); +int dht_zerofill2(xlator_t *this, call_frame_t *frame, int op_ret); int dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, struct iatt *prebuf, - struct iatt *postbuf) + struct iatt *postbuf, dict_t *xdata) { dht_local_t *local = NULL; int ret = -1; + xlator_t *subvol = NULL; - if (op_ret == -1) { + if (op_ret == -1 && (op_errno != ENOENT)) { goto out; } @@ -59,6 +54,7 @@ dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->rebalance.target_op_fn = dht_writev2; + local->op_errno = op_errno; /* Phase 2 of migration */ if (IS_DHT_MIGRATION_PHASE2 (postbuf)) { ret = dht_rebalance_complete_check (this, frame); @@ -71,8 +67,8 @@ dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, dht_iatt_merge (this, &local->stbuf, postbuf, NULL); dht_iatt_merge (this, &local->prebuf, prebuf, NULL); - ret = fd_ctx_get (local->fd, this, NULL); - if (!ret) { + ret = dht_inode_ctx_get1 (this, local->fd->inode, &subvol); + if 
(subvol) { dht_writev2 (this, frame, 0); return 0; } @@ -85,7 +81,8 @@ out: DHT_STRIP_PHASE1_FLAGS (postbuf); DHT_STRIP_PHASE1_FLAGS (prebuf); - DHT_STACK_UNWIND (writev, frame, op_ret, op_errno, prebuf, postbuf); + DHT_STACK_UNWIND (writev, frame, op_ret, op_errno, prebuf, postbuf, + xdata); return 0; } @@ -95,14 +92,10 @@ dht_writev2 (xlator_t *this, call_frame_t *frame, int op_ret) { dht_local_t *local = NULL; xlator_t *subvol = NULL; - uint64_t tmp_subvol = 0; - int ret = -1; local = frame->local; - ret = fd_ctx_get (local->fd, this, &tmp_subvol); - if (!ret) - subvol = (xlator_t *)(long)tmp_subvol; + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); if (!subvol) subvol = local->cached_subvol; @@ -112,15 +105,16 @@ dht_writev2 (xlator_t *this, call_frame_t *frame, int op_ret) STACK_WIND (frame, dht_writev_cbk, subvol, subvol->fops->writev, local->fd, local->rebalance.vector, local->rebalance.count, - local->rebalance.offset, local->rebalance.iobref); + local->rebalance.offset, local->rebalance.flags, + local->rebalance.iobref, NULL); return 0; } int -dht_writev (call_frame_t *frame, xlator_t *this, - fd_t *fd, struct iovec *vector, int count, off_t off, - struct iobref *iobref) +dht_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int count, off_t off, uint32_t flags, + struct iobref *iobref, dict_t *xdata) { xlator_t *subvol = NULL; int op_errno = -1; @@ -149,18 +143,19 @@ dht_writev (call_frame_t *frame, xlator_t *this, local->rebalance.vector = iov_dup (vector, count); local->rebalance.offset = off; local->rebalance.count = count; + local->rebalance.flags = flags; local->rebalance.iobref = iobref_ref (iobref); local->call_cnt = 1; STACK_WIND (frame, dht_writev_cbk, subvol, subvol->fops->writev, - fd, vector, count, off, iobref); + fd, vector, count, off, flags, iobref, xdata); return 0; err: op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL); + DHT_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL); return 0; } @@ -170,11 +165,13 @@ err: int dht_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, struct iatt *prebuf, - struct iatt *postbuf) + struct iatt *postbuf, dict_t *xdata) { dht_local_t *local = NULL; call_frame_t *prev = NULL; int ret = -1; + xlator_t *subvol = NULL; + inode_t *inode = NULL; GF_VALIDATE_OR_GOTO ("dht", frame, err); GF_VALIDATE_OR_GOTO ("dht", this, out); @@ -204,6 +201,7 @@ dht_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->rebalance.target_op_fn = dht_truncate2; + local->op_errno = op_errno; /* Phase 2 of migration */ if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) { ret = dht_rebalance_complete_check (this, frame); @@ -215,8 +213,9 @@ dht_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (IS_DHT_MIGRATION_PHASE1 (postbuf)) { dht_iatt_merge (this, &local->stbuf, postbuf, NULL); dht_iatt_merge (this, &local->prebuf, prebuf, NULL); - ret = fd_ctx_get (local->fd, this, NULL); - if (!ret) { + inode = (local->fd) ? local->fd->inode : local->loc.inode; + dht_inode_ctx_get1 (this, inode, &subvol); + if (subvol) { dht_truncate2 (this, frame, 0); return 0; } @@ -229,7 +228,7 @@ out: DHT_STRIP_PHASE1_FLAGS (postbuf); DHT_STRIP_PHASE1_FLAGS (prebuf); DHT_STACK_UNWIND (truncate, frame, op_ret, op_errno, - prebuf, postbuf); + prebuf, postbuf, xdata); err: return 0; } @@ -240,16 +239,13 @@ dht_truncate2 (xlator_t *this, call_frame_t *frame, int op_ret) { dht_local_t *local = NULL; xlator_t *subvol = NULL; - uint64_t tmp_subvol = 0; - int ret = -1; + inode_t *inode = NULL; local = frame->local; - if (local->fd) - ret = fd_ctx_get (local->fd, this, &tmp_subvol); - if (!ret) - subvol = (xlator_t *)(long)tmp_subvol; + inode = local->fd ? 
local->fd->inode : local->loc.inode; + dht_inode_ctx_get1 (this, inode, &subvol); if (!subvol) subvol = local->cached_subvol; @@ -258,18 +254,19 @@ dht_truncate2 (xlator_t *this, call_frame_t *frame, int op_ret) if (local->fop == GF_FOP_TRUNCATE) { STACK_WIND (frame, dht_truncate_cbk, subvol, subvol->fops->truncate, &local->loc, - local->rebalance.offset); + local->rebalance.offset, NULL); } else { STACK_WIND (frame, dht_truncate_cbk, subvol, subvol->fops->ftruncate, local->fd, - local->rebalance.offset); + local->rebalance.offset, NULL); } return 0; } int -dht_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset) +dht_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) { xlator_t *subvol = NULL; int op_errno = -1; @@ -299,19 +296,20 @@ dht_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset) STACK_WIND (frame, dht_truncate_cbk, subvol, subvol->fops->truncate, - loc, offset); + loc, offset, xdata); return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL); + DHT_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL, NULL); return 0; } int -dht_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset) +dht_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) { xlator_t *subvol = NULL; int op_errno = -1; @@ -339,22 +337,423 @@ dht_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset) STACK_WIND (frame, dht_truncate_cbk, subvol, subvol->fops->ftruncate, - fd, offset); + fd, offset, xdata); return 0; err: op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL); + DHT_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL, NULL); return 0; } + +int +dht_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + int ret = -1; + xlator_t *subvol = NULL; + + GF_VALIDATE_OR_GOTO ("dht", frame, err); + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO ("dht", frame->local, out); + GF_VALIDATE_OR_GOTO ("dht", cookie, out); + + local = frame->local; + prev = cookie; + + if ((op_ret == -1) && (op_errno != ENOENT)) { + local->op_errno = op_errno; + local->op_ret = -1; + gf_log (this->name, GF_LOG_DEBUG, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + + goto out; + } + + if (local->call_cnt != 1) { + if (local->stbuf.ia_blocks) { + dht_iatt_merge (this, postbuf, &local->stbuf, NULL); + dht_iatt_merge (this, prebuf, &local->prebuf, NULL); + } + goto out; + } + local->rebalance.target_op_fn = dht_fallocate2; + + /* Phase 2 of migration */ + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) { + ret = dht_rebalance_complete_check (this, frame); + if (!ret) + return 0; + } + + /* Check if the rebalance phase1 is true */ + if (IS_DHT_MIGRATION_PHASE1 (postbuf)) { + dht_iatt_merge (this, &local->stbuf, postbuf, NULL); + dht_iatt_merge (this, &local->prebuf, prebuf, NULL); + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); + if (subvol) { + dht_fallocate2 (this, frame, 0); + return 0; + } + ret = dht_rebalance_in_progress_check (this, frame); + if (!ret) + return 0; + } + +out: + DHT_STRIP_PHASE1_FLAGS (postbuf); + DHT_STRIP_PHASE1_FLAGS (prebuf); + DHT_STACK_UNWIND (fallocate, frame, op_ret, op_errno, + prebuf, postbuf, xdata); +err: + return 0; +} + +int +dht_fallocate2(xlator_t *this, call_frame_t *frame, int op_ret) +{ + dht_local_t *local = 
NULL; + xlator_t *subvol = NULL; + + local = frame->local; + + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); + + if (!subvol) + subvol = local->cached_subvol; + + local->call_cnt = 2; /* This is the second attempt */ + + STACK_WIND(frame, dht_fallocate_cbk, subvol, subvol->fops->fallocate, + local->fd, local->rebalance.flags, local->rebalance.offset, + local->rebalance.size, NULL); + + return 0; +} + +int +dht_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + local = dht_local_init (frame, NULL, fd, GF_FOP_FALLOCATE); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->rebalance.flags = mode; + local->rebalance.offset = offset; + local->rebalance.size = len; + + local->call_cnt = 1; + subvol = local->cached_subvol; + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, dht_fallocate_cbk, + subvol, subvol->fops->fallocate, + fd, mode, offset, len, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} + + +int +dht_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + int ret = -1; + xlator_t *subvol = NULL; + + GF_VALIDATE_OR_GOTO ("dht", frame, err); + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO ("dht", frame->local, out); + GF_VALIDATE_OR_GOTO ("dht", cookie, out); + + local = frame->local; + prev = cookie; + + if ((op_ret == -1) && (op_errno != ENOENT)) { + local->op_errno = op_errno; + local->op_ret = -1; + gf_log (this->name, GF_LOG_DEBUG, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + + goto out; + } + + if (local->call_cnt != 1) { + if (local->stbuf.ia_blocks) { + dht_iatt_merge (this, postbuf, &local->stbuf, NULL); + dht_iatt_merge (this, prebuf, &local->prebuf, NULL); + } + goto out; + } + local->rebalance.target_op_fn = dht_discard2; + + /* Phase 2 of migration */ + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) { + ret = dht_rebalance_complete_check (this, frame); + if (!ret) + return 0; + } + + /* Check if the rebalance phase1 is true */ + if (IS_DHT_MIGRATION_PHASE1 (postbuf)) { + dht_iatt_merge (this, &local->stbuf, postbuf, NULL); + dht_iatt_merge (this, &local->prebuf, prebuf, NULL); + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); + if (subvol) { + dht_discard2 (this, frame, 0); + return 0; + } + ret = dht_rebalance_in_progress_check (this, frame); + if (!ret) + return 0; + } + +out: + DHT_STRIP_PHASE1_FLAGS (postbuf); + DHT_STRIP_PHASE1_FLAGS (prebuf); + DHT_STACK_UNWIND (discard, frame, op_ret, op_errno, + prebuf, postbuf, xdata); +err: + return 0; +} + +int +dht_discard2(xlator_t *this, call_frame_t *frame, int op_ret) +{ + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + + local = frame->local; + + 
dht_inode_ctx_get1 (this, local->fd->inode, &subvol); + + if (!subvol) + subvol = local->cached_subvol; + + local->call_cnt = 2; /* This is the second attempt */ + + STACK_WIND(frame, dht_discard_cbk, subvol, subvol->fops->discard, + local->fd, local->rebalance.offset, local->rebalance.size, + NULL); + + return 0; +} + +int +dht_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + local = dht_local_init (frame, NULL, fd, GF_FOP_DISCARD); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->rebalance.offset = offset; + local->rebalance.size = len; + + local->call_cnt = 1; + subvol = local->cached_subvol; + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, dht_discard_cbk, subvol, subvol->fops->discard, + fd, offset, len, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} + +int +dht_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + int ret = -1; + + GF_VALIDATE_OR_GOTO ("dht", frame, err); + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO ("dht", frame->local, out); + GF_VALIDATE_OR_GOTO ("dht", cookie, out); + + local = frame->local; + prev = cookie; + + if ((op_ret == -1) && (op_errno != ENOENT)) { + local->op_errno = op_errno; + local->op_ret = -1; + gf_log (this->name, GF_LOG_DEBUG, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + goto out; + } + + if (local->call_cnt != 1) { + if (local->stbuf.ia_blocks) { + dht_iatt_merge (this, postbuf, &local->stbuf, NULL); + dht_iatt_merge (this, prebuf, &local->prebuf, NULL); + } + goto out; + } + local->rebalance.target_op_fn = dht_zerofill2; + /* Phase 2 of migration */ + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) { + ret = dht_rebalance_complete_check (this, frame); + if (!ret) + return 0; + } + + /* Check if the rebalance phase1 is true */ + if (IS_DHT_MIGRATION_PHASE1 (postbuf)) { + dht_iatt_merge (this, &local->stbuf, postbuf, NULL); + dht_iatt_merge (this, &local->prebuf, prebuf, NULL); + ret = fd_ctx_get (local->fd, this, NULL); + if (!ret) { + dht_zerofill2 (this, frame, 0); + return 0; + } + ret = dht_rebalance_in_progress_check (this, frame); + if (!ret) + return 0; + } + +out: + DHT_STRIP_PHASE1_FLAGS (postbuf); + DHT_STRIP_PHASE1_FLAGS (prebuf); + DHT_STACK_UNWIND (zerofill, frame, op_ret, op_errno, + prebuf, postbuf, xdata); +err: + return 0; +} + +int +dht_zerofill2(xlator_t *this, call_frame_t *frame, int op_ret) +{ + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + uint64_t tmp_subvol = 0; + int ret = -1; + + local = frame->local; + + if 
(local->fd) + ret = fd_ctx_get (local->fd, this, &tmp_subvol); + if (!ret) + subvol = (xlator_t *)(long)tmp_subvol; + + if (!subvol) + subvol = local->cached_subvol; + + local->call_cnt = 2; /* This is the second attempt */ + + STACK_WIND(frame, dht_zerofill_cbk, subvol, subvol->fops->zerofill, + local->fd, local->rebalance.offset, local->rebalance.size, + NULL); + + return 0; +} + +int +dht_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + local = dht_local_init (frame, NULL, fd, GF_FOP_ZEROFILL); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->rebalance.offset = offset; + local->rebalance.size = len; + + local->call_cnt = 1; + subvol = local->cached_subvol; + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, dht_zerofill_cbk, subvol, subvol->fops->zerofill, + fd, offset, len, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND (zerofill, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} + + + /* handle cases of migration here for 'setattr()' calls */ int dht_file_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, struct iatt *prebuf, - struct iatt *postbuf) + struct iatt *postbuf, dict_t *xdata) { dht_local_t *local = NULL; call_frame_t *prev = NULL; @@ -391,7 +790,7 @@ out: DHT_STRIP_PHASE1_FLAGS (postbuf); DHT_STRIP_PHASE1_FLAGS (prebuf); DHT_STACK_UNWIND (setattr, frame, op_ret, op_errno, - prebuf, postbuf); + prebuf, postbuf, xdata); return 0; } @@ -401,15 +800,13 @@ dht_setattr2 (xlator_t *this, call_frame_t *frame, int op_ret) { dht_local_t *local = NULL; xlator_t *subvol = NULL; - uint64_t tmp_subvol = 0; - int ret = -1; + inode_t *inode = NULL; local = frame->local; - if (local->fd) - ret = fd_ctx_get (local->fd, this, &tmp_subvol); - if (!ret) - subvol = (xlator_t *)(long)tmp_subvol; + inode = (local->fd) ? local->fd->inode : local->loc.inode; + + dht_inode_ctx_get1 (this, inode, &subvol); if (!subvol) subvol = local->cached_subvol; @@ -419,11 +816,13 @@ dht_setattr2 (xlator_t *this, call_frame_t *frame, int op_ret) if (local->fop == GF_FOP_SETATTR) { STACK_WIND (frame, dht_file_setattr_cbk, subvol, subvol->fops->setattr, &local->loc, - &local->rebalance.stbuf, local->rebalance.flags); + &local->rebalance.stbuf, local->rebalance.flags, + NULL); } else { STACK_WIND (frame, dht_file_setattr_cbk, subvol, subvol->fops->fsetattr, local->fd, - &local->rebalance.stbuf, local->rebalance.flags); + &local->rebalance.stbuf, local->rebalance.flags, + NULL); } return 0; @@ -434,7 +833,7 @@ dht_setattr2 (xlator_t *this, call_frame_t *frame, int op_ret) int dht_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, struct iatt *statpre, - struct iatt *statpost) + struct iatt *statpost, dict_t *xdata) { dht_local_t *local = NULL; int this_call_cnt = 0; @@ -465,7 +864,7 @@ 
unlock: this_call_cnt = dht_frame_return (frame); if (is_last_call (this_call_cnt)) DHT_STACK_UNWIND (setattr, frame, local->op_ret, local->op_errno, - &local->prebuf, &local->stbuf); + &local->prebuf, &local->stbuf, xdata); return 0; } @@ -473,13 +872,14 @@ unlock: int dht_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - struct iatt *stbuf, int32_t valid) + struct iatt *stbuf, int32_t valid, dict_t *xdata) { xlator_t *subvol = NULL; dht_layout_t *layout = NULL; dht_local_t *local = NULL; int op_errno = -1; int i = -1; + int call_cnt = 0; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); @@ -518,25 +918,25 @@ dht_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, STACK_WIND (frame, dht_file_setattr_cbk, subvol, subvol->fops->setattr, - loc, stbuf, valid); + loc, stbuf, valid, xdata); return 0; } - local->call_cnt = layout->cnt; + local->call_cnt = call_cnt = layout->cnt; - for (i = 0; i < layout->cnt; i++) { + for (i = 0; i < call_cnt; i++) { STACK_WIND (frame, dht_setattr_cbk, layout->list[i].xlator, layout->list[i].xlator->fops->setattr, - loc, stbuf, valid); + loc, stbuf, valid, xdata); } return 0; err: op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (setattr, frame, -1, op_errno, NULL, NULL); + DHT_STACK_UNWIND (setattr, frame, -1, op_errno, NULL, NULL, NULL); return 0; } @@ -544,13 +944,14 @@ err: int dht_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf, - int32_t valid) + int32_t valid, dict_t *xdata) { xlator_t *subvol = NULL; dht_layout_t *layout = NULL; dht_local_t *local = NULL; int op_errno = -1; int i = -1; + int call_cnt = 0; VALIDATE_OR_GOTO (frame, err); @@ -588,25 +989,25 @@ dht_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf, STACK_WIND (frame, dht_file_setattr_cbk, subvol, subvol->fops->fsetattr, - fd, stbuf, valid); + fd, stbuf, valid, xdata); return 0; } - local->call_cnt = layout->cnt; + local->call_cnt = call_cnt = layout->cnt; - for (i = 0; i < layout->cnt; i++) { + for (i = 0; i < call_cnt; i++) { STACK_WIND (frame, dht_setattr_cbk, layout->list[i].xlator, layout->list[i].xlator->fops->fsetattr, - fd, stbuf, valid); + fd, stbuf, valid, xdata); } return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (fsetattr, frame, -1, op_errno, NULL, NULL); + DHT_STACK_UNWIND (fsetattr, frame, -1, op_errno, NULL, NULL, NULL); return 0; } diff --git a/xlators/cluster/dht/src/dht-layout.c b/xlators/cluster/dht/src/dht-layout.c index 7e1f7afda..38e9970a7 100644 --- a/xlators/cluster/dht/src/dht-layout.c +++ b/xlators/cluster/dht/src/dht-layout.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. 
- - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef _CONFIG_H @@ -68,9 +59,7 @@ dht_layout_t * dht_layout_get (xlator_t *this, inode_t *inode) { dht_conf_t *conf = NULL; - uint64_t layout_int = 0; dht_layout_t *layout = NULL; - int ret = -1; conf = this->private; if (!conf) @@ -78,9 +67,8 @@ dht_layout_get (xlator_t *this, inode_t *inode) LOCK (&conf->layout_lock); { - ret = inode_ctx_get (inode, this, &layout_int); - if (ret == 0) { - layout = (dht_layout_t *) (unsigned long) layout_int; + dht_inode_ctx_layout_get (inode, this, &layout); + if (layout) { layout->ref++; } } @@ -98,7 +86,6 @@ dht_layout_set (xlator_t *this, inode_t *inode, dht_layout_t *layout) int oldret = -1; int ret = 0; dht_layout_t *old_layout; - uint64_t old_layout_int; conf = this->private; if (!conf) @@ -106,16 +93,13 @@ dht_layout_set (xlator_t *this, inode_t *inode, dht_layout_t *layout) LOCK (&conf->layout_lock); { - oldret = inode_ctx_get (inode, this, &old_layout_int); - + oldret = dht_inode_ctx_layout_get (inode, this, &old_layout); layout->ref++; - ret = inode_ctx_put (inode, this, (uint64_t) (unsigned long) - layout); + dht_inode_ctx_layout_set (inode, this, layout); } UNLOCK (&conf->layout_lock); - if (oldret == 0) { - old_layout = (dht_layout_t *) (unsigned long) old_layout_int; + if (!oldret) { dht_layout_unref (this, old_layout); } @@ -130,7 +114,7 @@ dht_layout_unref (xlator_t *this, 
dht_layout_t *layout) dht_conf_t *conf = NULL; int ref = 0; - if (layout->preset || !this->private) + if (!layout || layout->preset || !this->private) return; conf = this->private; @@ -174,9 +158,9 @@ dht_layout_search (xlator_t *this, dht_layout_t *layout, const char *name) int ret = 0; - ret = dht_hash_compute (layout->type, name, &hash); + ret = dht_hash_compute (this, layout->type, name, &hash); if (ret != 0) { - gf_log (this->name, GF_LOG_INFO, + gf_log (this->name, GF_LOG_WARNING, "hash computation failed for type=%d name=%s", layout->type, name); goto out; @@ -191,7 +175,7 @@ dht_layout_search (xlator_t *this, dht_layout_t *layout, const char *name) } if (!subvol) { - gf_log (this->name, GF_LOG_INFO, + gf_log (this->name, GF_LOG_WARNING, "no subvolume for hash (value) = %u", hash); } @@ -280,6 +264,9 @@ dht_disk_layout_extract (xlator_t *this, dht_layout_t *layout, if (disk_layout_p) *disk_layout_p = disk_layout; + else + GF_FREE (disk_layout); + ret = 0; out: @@ -289,7 +276,7 @@ out: int dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout, - int pos, void *disk_layout_raw) + int pos, void *disk_layout_raw, int disk_layout_len) { int cnt = 0; int type = 0; @@ -297,19 +284,38 @@ dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout, int stop_off = 0; int disk_layout[4]; - /* TODO: assert disk_layout_ptr is of required length */ + if (!disk_layout_raw) { + gf_log (this->name, GF_LOG_CRITICAL, + "error no layout on disk for merge"); + return -1; + } - memcpy (disk_layout, disk_layout_raw, sizeof (disk_layout)); + GF_ASSERT (disk_layout_len == sizeof (disk_layout)); + + memcpy (disk_layout, disk_layout_raw, disk_layout_len); cnt = ntoh32 (disk_layout[0]); if (cnt != 1) { - gf_log (this->name, GF_LOG_INFO, + gf_log (this->name, GF_LOG_ERROR, "disk layout has invalid count %d", cnt); return -1; } - /* TODO: assert type is compatible */ - type = ntoh32 (disk_layout[1]); + type = ntoh32 (disk_layout[1]); + switch (type) { + case DHT_HASH_TYPE_DM_USER: 
+ gf_log (this->name, GF_LOG_DEBUG, "found user-set layout"); + layout->type = type; + /* Fall through. */ + case DHT_HASH_TYPE_DM: + break; + default: + gf_log (this->name, GF_LOG_CRITICAL, + "Catastrophic error layout with unknown type found %d", + disk_layout[1]); + return -1; + } + start_off = ntoh32 (disk_layout[2]); stop_off = ntoh32 (disk_layout[3]); @@ -329,11 +335,12 @@ int dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, int op_ret, int op_errno, dict_t *xattr) { - int i = 0; - int ret = -1; - int err = -1; - void *disk_layout_raw = NULL; - + int i = 0; + int ret = -1; + int err = -1; + void *disk_layout_raw = NULL; + int disk_layout_len = 0; + dht_conf_t *conf = this->private; if (op_ret != 0) { err = op_errno; @@ -354,12 +361,12 @@ dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, if (xattr) { /* during lookup and not mkdir */ - ret = dict_get_ptr (xattr, "trusted.glusterfs.dht", - &disk_layout_raw); + ret = dict_get_ptr_and_len (xattr, conf->xattr_name, + &disk_layout_raw, &disk_layout_len); } if (ret != 0) { - layout->list[i].err = -1; + layout->list[i].err = 0; gf_log (this->name, GF_LOG_TRACE, "missing disk layout on %s. 
err = %d", subvol->name, err); @@ -367,9 +374,10 @@ dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, goto out; } - ret = dht_disk_layout_merge (this, layout, i, disk_layout_raw); + ret = dht_disk_layout_merge (this, layout, i, disk_layout_raw, + disk_layout_len); if (ret != 0) { - gf_log (this->name, GF_LOG_DEBUG, + gf_log (this->name, GF_LOG_WARNING, "layout merge from subvolume %s failed", subvol->name); goto out; @@ -405,6 +413,22 @@ dht_layout_entry_swap (dht_layout_t *layout, int i, int j) layout->list[j].err = err_swap; } +void +dht_layout_range_swap (dht_layout_t *layout, int i, int j) +{ + uint32_t start_swap = 0; + uint32_t stop_swap = 0; + + start_swap = layout->list[i].start; + stop_swap = layout->list[i].stop; + + layout->list[i].start = layout->list[j].start; + layout->list[i].stop = layout->list[j].stop; + + layout->list[j].start = start_swap; + layout->list[j].stop = stop_swap; +} + int64_t dht_layout_entry_cmp_volname (dht_layout_t *layout, int i, int j) { @@ -412,17 +436,37 @@ dht_layout_entry_cmp_volname (dht_layout_t *layout, int i, int j) layout->list[j].xlator->name)); } + +gf_boolean_t +dht_is_subvol_in_layout (dht_layout_t *layout, xlator_t *xlator) +{ + int i = 0; + + for (i = 0; i < layout->cnt; i++) { + if (!strcmp (layout->list[i].xlator->name, xlator->name)) + return _gf_true; + } + return _gf_false; +} + int64_t dht_layout_entry_cmp (dht_layout_t *layout, int i, int j) { int64_t diff = 0; + /* swap zero'ed out layouts to front, if needed */ + if (!layout->list[j].start && !layout->list[j].stop) { + diff = (int64_t) layout->list[i].stop + - (int64_t) layout->list[j].stop; + goto out; + } if (layout->list[i].err || layout->list[j].err) diff = layout->list[i].err - layout->list[j].err; else diff = (int64_t) layout->list[i].start - (int64_t) layout->list[j].start; +out: return diff; } @@ -471,7 +515,8 @@ dht_layout_sort_volname (dht_layout_t *layout) int dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t 
*layout, uint32_t *holes_p, uint32_t *overlaps_p, - uint32_t *missing_p, uint32_t *down_p, uint32_t *misc_p) + uint32_t *missing_p, uint32_t *down_p, uint32_t *misc_p, + uint32_t *no_space_p) { uint32_t overlaps = 0; uint32_t missing = 0; @@ -484,30 +529,38 @@ dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout, uint32_t prev_stop = 0; uint32_t last_stop = 0; char is_virgin = 1; + uint32_t no_space = 0; - /* TODO: explain WTF is happening */ + /* TODO: explain what is happening */ last_stop = layout->list[0].start - 1; prev_stop = last_stop; for (i = 0; i < layout->cnt; i++) { - if (layout->list[i].err) { - switch (layout->list[i].err) { - case -1: - case ENOENT: - missing++; - break; - case ENOTCONN: - down++; - break; - case ENOSPC: - down++; - break; - default: - misc++; + switch (layout->list[i].err) { + case -1: + case ENOENT: + missing++; + continue; + case ENOTCONN: + down++; + continue; + case ENOSPC: + no_space++; + continue; + case 0: + /* if err == 0 and start == stop, then it is a non misc++; + * participating subvolume(spread-cnt). Then, do not + * check for anomalies. 
If start != stop, then treat it + * as misc err */ + if (layout->list[i].start == layout->list[i].stop) { + continue; } + break; + default: + misc++; continue; - } + } is_virgin = 0; @@ -540,6 +593,9 @@ dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout, if (misc_p) *misc_p = misc; + if (no_space_p) + *no_space_p = no_space; + return ret; } @@ -555,7 +611,6 @@ dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout) uint32_t down = 0; uint32_t misc = 0; - ret = dht_layout_sort (layout); if (ret == -1) { gf_log (this->name, GF_LOG_WARNING, @@ -565,7 +620,7 @@ dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout) ret = dht_layout_anomalies (this, loc, layout, &holes, &overlaps, - &missing, &down, &misc); + &missing, &down, &misc, NULL); if (ret == -1) { gf_log (this->name, GF_LOG_WARNING, "error while finding anomalies in %s -- not good news", @@ -583,43 +638,56 @@ dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout) "found anomalies in %s. holes=%d overlaps=%d", loc->path, holes, overlaps); } - ret = 1; + ret = -1; } for (i = 0; i < layout->cnt; i++) { - /* TODO During DHT selfheal rewrite (almost) find a better place to - * detect this - probably in dht_layout_anomalies() + /* TODO During DHT selfheal rewrite (almost) find a better place + * to detect this - probably in dht_layout_anomalies() */ if (layout->list[i].err > 0) { - gf_log (this->name, GF_LOG_DEBUG, - "path=%s err=%s on subvol=%s", - loc->path, strerror (layout->list[i].err), - (layout->list[i].xlator ? - layout->list[i].xlator->name : "<>")); - if (layout->list[i].err == ENOENT) - ret = 1; + gf_log_callingfn (this->name, GF_LOG_DEBUG, + "path=%s err=%s on subvol=%s", + loc->path, + strerror (layout->list[i].err), + (layout->list[i].xlator ? 
+ layout->list[i].xlator->name + : "<>")); + if ((layout->list[i].err == ENOENT) && (ret >= 0)) { + ret++; + } } } + out: return ret; } +int +dht_dir_has_layout (dict_t *xattr, char *name) +{ + + void *disk_layout_raw = NULL; + + return dict_get_ptr (xattr, name, &disk_layout_raw); +} int dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, loc_t *loc, dict_t *xattr) { - int idx = 0; - int pos = -1; - int ret = 0; - int err = 0; - int dict_ret = 0; - int32_t disk_layout[4]; - void *disk_layout_raw = NULL; - int32_t count = -1; - uint32_t start_off = -1; - uint32_t stop_off = -1; + int idx = 0; + int pos = -1; + int ret = 0; + int err = 0; + int dict_ret = 0; + int32_t disk_layout[4]; + void *disk_layout_raw = NULL; + int32_t count = -1; + uint32_t start_off = -1; + uint32_t stop_off = -1; + dht_conf_t *conf = this->private; for (idx = 0; idx < layout->cnt; idx++) { @@ -649,7 +717,7 @@ dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, goto out; } - dict_ret = dict_get_ptr (xattr, "trusted.glusterfs.dht", + dict_ret = dict_get_ptr (xattr, conf->xattr_name, &disk_layout_raw); if (dict_ret < 0) { @@ -665,7 +733,7 @@ dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, count = ntoh32 (disk_layout[0]); if (count != 1) { - gf_log (this->name, GF_LOG_INFO, + gf_log (this->name, GF_LOG_ERROR, "%s - disk layout has invalid count %d", loc->path, count); ret = -1; @@ -714,7 +782,7 @@ dht_layout_preset (xlator_t *this, xlator_t *subvol, inode_t *inode) LOCK (&conf->layout_lock); { - inode_ctx_put (inode, this, (uint64_t)(long)layout); + dht_inode_ctx_layout_set (inode, this, layout); } UNLOCK (&conf->layout_lock); diff --git a/xlators/cluster/dht/src/dht-linkfile.c b/xlators/cluster/dht/src/dht-linkfile.c index 2186b064a..dbc9d0b3c 100644 --- a/xlators/cluster/dht/src/dht-linkfile.c +++ b/xlators/cluster/dht/src/dht-linkfile.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. 
<http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ #ifndef _CONFIG_H @@ -28,37 +19,106 @@ #include "compat.h" #include "dht-common.h" +int +dht_linkfile_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + inode_t *inode, struct iatt *stbuf, dict_t *xattr, + struct iatt *postparent) +{ + char is_linkfile = 0; + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + local = frame->local; + prev = cookie; + conf = this->private; + + if (op_ret) + goto out; + + is_linkfile = check_is_linkfile (inode, stbuf, xattr, + conf->link_xattr_name); + if (!is_linkfile) + gf_log (this->name, GF_LOG_WARNING, "got non-linkfile %s:%s", + prev->this->name, local->loc.path); +out: + local->linkfile.linkfile_cbk (frame, cookie, this, op_ret, op_errno, + inode, stbuf, postparent, postparent, + xattr); + return 0; +} +#define is_equal(a, b) (a == b) int dht_linkfile_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { dht_local_t *local = NULL; + xlator_t *subvol = NULL; + call_frame_t *prev = NULL; + dict_t *xattrs = NULL; + dht_conf_t *conf = NULL; + int ret = -1; local = frame->local; + if (!op_ret) + local->linked = _gf_true; + + FRAME_SU_UNDO (frame, dht_local_t); + + if (op_ret && (op_errno == EEXIST)) { + conf = this->private; + prev = cookie; + subvol = prev->this; + if (!subvol) + goto out; + xattrs = dict_new (); + if (!xattrs) + goto out; + ret = dict_set_uint32 (xattrs, conf->link_xattr_name, 256); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to set linkto key"); + goto out; + } + + STACK_WIND (frame, dht_linkfile_lookup_cbk, subvol, + subvol->fops->lookup, &local->loc, xattrs); + if (xattrs) + dict_unref (xattrs); + return 0; + } +out: local->linkfile.linkfile_cbk (frame, cookie, this, op_ret, op_errno, - inode, stbuf, preparent, postparent); + inode, stbuf, preparent, 
postparent, + xdata); + if (xattrs) + dict_unref (xattrs); return 0; } int dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk, + xlator_t *this, xlator_t *tovol, xlator_t *fromvol, loc_t *loc) { dht_local_t *local = NULL; dict_t *dict = NULL; int need_unref = 0; int ret = 0; + dht_conf_t *conf = this->private; local = frame->local; local->linkfile.linkfile_cbk = linkfile_cbk; local->linkfile.srcvol = tovol; + local->linked = _gf_false; + dict = local->params; if (!dict) { dict = dict_new (); @@ -74,8 +134,12 @@ dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk, "%s: gfid set failed", loc->path); } - ret = dict_set_str (dict, "trusted.glusterfs.dht.linkto", - tovol->name); + ret = dict_set_str (dict, GLUSTERFS_INTERNAL_FOP_KEY, "yes"); + if (ret) + gf_log ("dht-linkfile", GF_LOG_INFO, + "%s: internal-fop set failed", loc->path); + + ret = dict_set_str (dict, conf->link_xattr_name, tovol->name); if (ret < 0) { gf_log (frame->this->name, GF_LOG_INFO, @@ -84,9 +148,13 @@ dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk, goto out; } + local->link_subvol = fromvol; + /* Always create as root:root. 
dht_linkfile_attr_heal fixes the + * ownsership */ + FRAME_SU_DO (frame, dht_local_t); STACK_WIND (frame, dht_linkfile_create_cbk, fromvol, fromvol->fops->mknod, loc, - S_IFREG | DHT_LINKFILE_MODE, 0, dict); + S_IFREG | DHT_LINKFILE_MODE, 0, 0, dict); if (need_unref && dict) dict_unref (dict); @@ -94,7 +162,7 @@ dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk, return 0; out: local->linkfile.linkfile_cbk (frame, NULL, frame->this, -1, ENOMEM, - loc->inode, NULL, NULL, NULL); + loc->inode, NULL, NULL, NULL, NULL); if (need_unref && dict) dict_unref (dict); @@ -106,7 +174,8 @@ out: int dht_linkfile_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iatt *preparent, struct iatt *postparent) + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { dht_local_t *local = NULL; call_frame_t *prev = NULL; @@ -150,7 +219,7 @@ dht_linkfile_unlink (call_frame_t *frame, xlator_t *this, STACK_WIND (unlink_frame, dht_linkfile_unlink_cbk, subvol, subvol->fops->unlink, - &unlink_local->loc); + &unlink_local->loc, 0, NULL); return 0; err: @@ -175,7 +244,7 @@ dht_linkfile_subvol (xlator_t *this, inode_t *inode, struct iatt *stbuf, if (!xattr) goto out; - ret = dict_get_ptr (xattr, "trusted.glusterfs.dht.linkto", &volname); + ret = dict_get_ptr (xattr, conf->link_xattr_name, &volname); if ((-1 == ret) || !volname) goto out; @@ -190,3 +259,70 @@ dht_linkfile_subvol (xlator_t *this, inode_t *inode, struct iatt *stbuf, out: return subvol; } + +int +dht_linkfile_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *statpre, + struct iatt *statpost, dict_t *xdata) +{ + dht_local_t *local = NULL; + loc_t *loc = NULL; + + local = frame->local; + loc = &local->loc; + + if (op_ret) + gf_log (this->name, GF_LOG_ERROR, "setattr of uid/gid on %s" + " :<gfid:%s> failed (%s)", + (loc->path? 
loc->path: "NULL"), + uuid_utoa(local->gfid), strerror(op_errno)); + + DHT_STACK_DESTROY (frame); + + return 0; +} + +int +dht_linkfile_attr_heal (call_frame_t *frame, xlator_t *this) +{ + int ret = -1; + call_frame_t *copy = NULL; + dht_local_t *local = NULL; + dht_local_t *copy_local = NULL; + xlator_t *subvol = NULL; + struct iatt stbuf = {0,}; + + local = frame->local; + + GF_VALIDATE_OR_GOTO ("dht", local, out); + GF_VALIDATE_OR_GOTO ("dht", local->link_subvol, out); + + if (local->stbuf.ia_type == IA_INVAL) + return 0; + + uuid_copy (local->loc.gfid, local->stbuf.ia_gfid); + + copy = copy_frame (frame); + + if (!copy) + goto out; + + copy_local = dht_local_init (copy, &local->loc, NULL, 0); + + if (!copy_local) + goto out; + + stbuf = local->stbuf; + subvol = local->link_subvol; + + copy->local = copy_local; + + FRAME_SU_DO (copy, dht_local_t); + + STACK_WIND (copy, dht_linkfile_setattr_cbk, subvol, + subvol->fops->setattr, &copy_local->loc, + &stbuf, (GF_SET_ATTR_UID | GF_SET_ATTR_GID), NULL); + ret = 0; +out: + return ret; +} diff --git a/xlators/cluster/dht/src/dht-mem-types.h b/xlators/cluster/dht/src/dht-mem-types.h index 21fb5a7ca..e893eb48f 100644 --- a/xlators/cluster/dht/src/dht-mem-types.h +++ b/xlators/cluster/dht/src/dht-mem-types.h @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details.
- - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ @@ -28,7 +19,6 @@ enum gf_dht_mem_types_ { gf_dht_mt_dht_conf_t, gf_dht_mt_char, gf_dht_mt_int32_t, - gf_dht_mt_dht_local_t, gf_dht_mt_xlator_t, gf_dht_mt_dht_layout_t, gf_switch_mt_dht_conf_t, @@ -37,6 +27,9 @@ enum gf_dht_mem_types_ { gf_switch_mt_switch_struct, gf_dht_mt_subvol_time, gf_dht_mt_loc_t, + gf_defrag_info_mt, + gf_dht_mt_inode_ctx_t, + gf_dht_mt_ctx_stat_time_t, gf_dht_mt_end }; #endif diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c index ab555720b..bcb19f23e 100644 --- a/xlators/cluster/dht/src/dht-rebalance.c +++ b/xlators/cluster/dht/src/dht-rebalance.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. 
+ This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ @@ -24,11 +15,12 @@ #endif #include "dht-common.h" +#include "xlator.h" +#include <fnmatch.h> #define GF_DISK_SECTOR_SIZE 512 #define DHT_REBALANCE_PID 4242 /* Change it if required */ #define DHT_REBALANCE_BLKSIZE (128 * 1024) -#define DHT_MIGRATE_EVEN_IF_LINK_EXISTS 1 static int dht_write_with_holes (xlator_t *to, fd_t *fd, struct iovec *vec, int count, @@ -60,7 +52,7 @@ dht_write_with_holes (xlator_t *to, fd_t *fd, struct iovec *vec, int count, ret = syncop_write (to, fd, (buf + tmp_offset), (start_idx - tmp_offset), (offset + tmp_offset), - iobref); + iobref, 0); /* 'path' will be logged in calling function */ if (ret < 0) { gf_log (THIS->name, GF_LOG_WARNING, @@ -78,7 +70,7 @@ dht_write_with_holes (xlator_t *to, fd_t *fd, struct iovec *vec, int count, /* This means, last chunk is not yet written.. 
write it */ ret = syncop_write (to, fd, (buf + tmp_offset), (buf_len - tmp_offset), - (offset + tmp_offset), iobref); + (offset + tmp_offset), iobref, 0); if (ret < 0) { /* 'path' will be logged in calling function */ gf_log (THIS->name, GF_LOG_WARNING, @@ -99,8 +91,118 @@ out: } +int32_t +gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, dict_t *xattrs, + struct iatt *stbuf) +{ + int32_t ret = -1; + xlator_t *cached_subvol = NULL; + xlator_t *hashed_subvol = NULL; + xlator_t *linkto_subvol = NULL; + data_t *data = NULL; + struct iatt iatt = {0,}; + int32_t op_errno = 0; + dht_conf_t *conf = NULL; + + GF_VALIDATE_OR_GOTO ("defrag", loc, out); + GF_VALIDATE_OR_GOTO ("defrag", loc->name, out); + GF_VALIDATE_OR_GOTO ("defrag", stbuf, out); + GF_VALIDATE_OR_GOTO ("defrag", this, out); + GF_VALIDATE_OR_GOTO ("defrag", xattrs, out); + GF_VALIDATE_OR_GOTO ("defrag", this->private, out); + + conf = this->private; + + if (uuid_is_null (loc->pargfid)) { + gf_log ("", GF_LOG_ERROR, "loc->pargfid is NULL for " + "%s", loc->path); + goto out; + } + + if (uuid_is_null (loc->gfid)) { + gf_log ("", GF_LOG_ERROR, "loc->gfid is NULL for " + "%s", loc->path); + goto out; + } + + cached_subvol = dht_subvol_get_cached (this, loc->inode); + if (!cached_subvol) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get cached subvol" + " for %s on %s", loc->name, this->name); + goto out; + } + + hashed_subvol = dht_subvol_get_hashed (this, loc); + if (!hashed_subvol) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get hashed subvol" + " for %s on %s", loc->name, this->name); + goto out; + } + + gf_log (this->name, GF_LOG_INFO, "Attempting to migrate hardlink %s " + "with gfid %s from %s -> %s", loc->name, uuid_utoa (loc->gfid), + cached_subvol->name, hashed_subvol->name); + data = dict_get (xattrs, conf->link_xattr_name); + /* set linkto on cached -> hashed if not present, else link it */ + if (!data) { + ret = dict_set_str (xattrs, conf->link_xattr_name, + hashed_subvol->name); + if 
(ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to set " + "linkto xattr in dict for %s", loc->name); + goto out; + } + + ret = syncop_setxattr (cached_subvol, loc, xattrs, 0); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Linkto setxattr " + "failed %s -> %s (%s)", cached_subvol->name, + loc->name, strerror (errno)); + goto out; + } + goto out; + } else { + linkto_subvol = dht_linkfile_subvol (this, NULL, NULL, xattrs); + if (!linkto_subvol) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get " + "linkto subvol for %s", loc->name); + } else { + hashed_subvol = linkto_subvol; + } + + ret = syncop_link (hashed_subvol, loc, loc); + if (ret) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, "link of %s -> %s" + " failed on subvol %s (%s)", loc->name, + uuid_utoa(loc->gfid), + hashed_subvol->name, strerror (op_errno)); + if (op_errno != EEXIST) + goto out; + } + } + ret = syncop_lookup (hashed_subvol, loc, NULL, &iatt, NULL, NULL); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed lookup %s on %s (%s)" + , loc->name, hashed_subvol->name, strerror (errno)); + goto out; + } + + if (iatt.ia_nlink == stbuf->ia_nlink) { + ret = dht_migrate_file (this, loc, cached_subvol, hashed_subvol, + GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS); + if (ret) + goto out; + } + ret = 0; +out: + return ret; +} + + static inline int -__is_file_migratable (xlator_t *this, loc_t *loc, struct iatt *stbuf) +__is_file_migratable (xlator_t *this, loc_t *loc, + struct iatt *stbuf, dict_t *xattrs, int flags) { int ret = -1; @@ -111,11 +213,25 @@ __is_file_migratable (xlator_t *this, loc_t *loc, struct iatt *stbuf) goto out; } + if (flags == GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS) { + ret = 0; + goto out; + } if (stbuf->ia_nlink > 1) { - /* TODO : support migrating hardlinks */ - gf_log (this->name, GF_LOG_WARNING, "%s: file has hardlinks", - loc->path); - ret = -ENOTSUP; + /* support for decomission */ + if (flags == GF_DHT_MIGRATE_HARDLINK) { + ret = gf_defrag_handle_hardlink (this, 
loc, + xattrs, stbuf); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to migrate file with link", + loc->path); + } + } else { + gf_log (this->name, GF_LOG_WARNING, + "%s: file has hardlinks", loc->path); + } + ret = ENOTSUP; goto out; } @@ -127,14 +243,16 @@ out: static inline int __dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struct iatt *stbuf, - dict_t *dict, fd_t **dst_fd) + dict_t *dict, fd_t **dst_fd, dict_t *xattr) { - xlator_t *this = NULL; - int ret = -1; - fd_t *fd = NULL; - struct iatt new_stbuf = {0,}; + xlator_t *this = NULL; + int ret = -1; + fd_t *fd = NULL; + struct iatt new_stbuf = {0,}; + dht_conf_t *conf = NULL; this = THIS; + conf = this->private; ret = dict_set_static_bin (dict, "gfid-req", stbuf->ia_gfid, 16); if (ret) { @@ -143,7 +261,7 @@ __dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struc goto out; } - ret = dict_set_str (dict, DHT_LINKFILE_KEY, from->name); + ret = dict_set_str (dict, conf->link_xattr_name, from->name); if (ret) { gf_log (this->name, GF_LOG_ERROR, "%s: failed to set gfid in dict for create", loc->path); @@ -181,7 +299,7 @@ __dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struc /* Create the destination with LINKFILE mode, and linkto xattr, if the linkfile already exists, it will just open the file */ ret = syncop_create (to, loc, O_RDWR, DHT_LINKFILE_MODE, fd, - dict); + dict, &new_stbuf); if (ret < 0) { gf_log (this->name, GF_LOG_ERROR, "failed to create %s on %s (%s)", @@ -189,6 +307,26 @@ __dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struc goto out; } + ret = syncop_fsetxattr (to, fd, xattr, 0); + if (ret == -1) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to set xattr on %s (%s)", + loc->path, to->name, strerror (errno)); + + ret = syncop_ftruncate (to, fd, stbuf->ia_size); + if (ret < 0) + gf_log (this->name, GF_LOG_ERROR, + "ftruncate failed for %s on %s (%s)", + loc->path, 
to->name, strerror (errno)); + + ret = syncop_fsetattr (to, fd, stbuf, + (GF_SET_ATTR_UID | GF_SET_ATTR_GID), + NULL, NULL); + if (ret < 0) + gf_log (this->name, GF_LOG_ERROR, + "chown failed for %s on %s (%s)", + loc->path, to->name, strerror (errno)); + if (dst_fd) *dst_fd = fd; @@ -201,13 +339,16 @@ out: static inline int __dht_check_free_space (xlator_t *to, xlator_t *from, loc_t *loc, - struct iatt *stbuf) + struct iatt *stbuf, int flag) { struct statvfs src_statfs = {0,}; struct statvfs dst_statfs = {0,}; int ret = -1; xlator_t *this = NULL; + uint64_t src_statfs_blocks = 1; + uint64_t dst_statfs_blocks = 1; + this = THIS; ret = syncop_statfs (from, loc, &src_statfs); @@ -225,18 +366,47 @@ __dht_check_free_space (xlator_t *to, xlator_t *from, loc_t *loc, loc->path, to->name, strerror (errno)); goto out; } - if (((dst_statfs.f_bavail * - dst_statfs.f_bsize) / GF_DISK_SECTOR_SIZE) > - (((src_statfs.f_bavail * src_statfs.f_bsize) / - GF_DISK_SECTOR_SIZE) - stbuf->ia_blocks)) { - gf_log (this->name, GF_LOG_WARNING, - "data movement attempted from node (%s) with" - " higher disk space to a node (%s) with " - "lesser disk space (%s)", from->name, - to->name, loc->path); - /* this is not a 'failure', but we don't want to - consider this as 'success' too :-/ */ + /* if force option is given, do not check for space @ dst. + * Check only if space is avail for the file */ + if (flag != GF_DHT_MIGRATE_DATA) + goto check_avail_space; + + /* Check: + During rebalance `migrate-data` - Destination subvol experiences + a `reduction` in 'blocks' of free space, at the same time source + subvol gains certain 'blocks' of free space. A valid check is + necessary here to avoid errorneous move to destination where + the space could be scantily available. 
+ */ + if (stbuf) { + dst_statfs_blocks = ((dst_statfs.f_bavail * + dst_statfs.f_bsize) / + GF_DISK_SECTOR_SIZE); + src_statfs_blocks = ((src_statfs.f_bavail * + src_statfs.f_bsize) / + GF_DISK_SECTOR_SIZE); + if ((dst_statfs_blocks - stbuf->ia_blocks) < + (src_statfs_blocks + stbuf->ia_blocks)) { + gf_log (this->name, GF_LOG_WARNING, + "data movement attempted from node (%s) with" + " higher disk space to a node (%s) with " + "lesser disk space (%s)", from->name, + to->name, loc->path); + + /* this is not a 'failure', but we don't want to + consider this as 'success' too :-/ */ + ret = 1; + goto out; + } + } +check_avail_space: + if (((dst_statfs.f_bavail * dst_statfs.f_bsize) / + GF_DISK_SECTOR_SIZE) < stbuf->ia_blocks) { + gf_log (this->name, GF_LOG_ERROR, + "data movement attempted from node (%s) with " + "to node (%s) which does not have required free space" + " for %s", from->name, to->name, loc->path); ret = 1; goto out; } @@ -247,7 +417,7 @@ out: } static inline int -__dht_rebalane_migrate_data (xlator_t *from, xlator_t *to, fd_t *src, fd_t *dst, +__dht_rebalance_migrate_data (xlator_t *from, xlator_t *to, fd_t *src, fd_t *dst, uint64_t ia_size, int hole_exists) { int ret = 0; @@ -263,7 +433,7 @@ __dht_rebalane_migrate_data (xlator_t *from, xlator_t *to, fd_t *src, fd_t *dst, read_size = (((ia_size - total) > DHT_REBALANCE_BLKSIZE) ? 
DHT_REBALANCE_BLKSIZE : (ia_size - total)); ret = syncop_readv (from, src, read_size, - offset, &vector, &count, &iobref); + offset, 0, &vector, &count, &iobref); if (!ret || (ret < 0)) { break; } @@ -273,15 +443,14 @@ __dht_rebalane_migrate_data (xlator_t *from, xlator_t *to, fd_t *src, fd_t *dst, ret, offset, iobref); else ret = syncop_writev (to, dst, vector, count, - offset, iobref); + offset, iobref, 0); if (ret < 0) { break; } offset += ret; total += ret; - if (vector) - GF_FREE (vector); + GF_FREE (vector); if (iobref) iobref_unref (iobref); iobref = NULL; @@ -289,8 +458,7 @@ __dht_rebalane_migrate_data (xlator_t *from, xlator_t *to, fd_t *src, fd_t *dst, } if (iobref) iobref_unref (iobref); - if (vector) - GF_FREE (vector); + GF_FREE (vector); if (ret >= 0) ret = 0; @@ -308,8 +476,10 @@ __dht_rebalance_open_src_file (xlator_t *from, xlator_t *to, loc_t *loc, dict_t *dict = NULL; xlator_t *this = NULL; struct iatt iatt = {0,}; + dht_conf_t *conf = NULL; this = THIS; + conf = this->private; fd = fd_create (loc->inode, DHT_REBALANCE_PID); if (!fd) { @@ -332,7 +502,7 @@ __dht_rebalance_open_src_file (xlator_t *from, xlator_t *to, loc_t *loc, if (!dict) goto out; - ret = dict_set_str (dict, DHT_LINKFILE_KEY, to->name); + ret = dict_set_str (dict, conf->link_xattr_name, to->name); if (ret) { gf_log (this->name, GF_LOG_ERROR, "failed to set xattr in dict for %s (linkto:%s)", @@ -385,12 +555,13 @@ migrate_special_files (xlator_t *this, xlator_t *from, xlator_t *to, loc_t *loc, dict_t *dict = NULL; char *link = NULL; struct iatt stbuf = {0,}; + dht_conf_t *conf = this->private; dict = dict_new (); if (!dict) goto out; - ret = dict_set_int32 (dict, DHT_LINKFILE_KEY, 256); + ret = dict_set_int32 (dict, conf->link_xattr_name, 256); if (ret) { gf_log (this->name, GF_LOG_ERROR, "%s: failed to set 'linkto' key in dict", loc->path); @@ -406,12 +577,13 @@ migrate_special_files (xlator_t *this, xlator_t *from, xlator_t *to, loc_t *loc, } /* we no more require this key */ - 
dict_del (dict, DHT_LINKFILE_KEY); + dict_del (dict, conf->link_xattr_name); /* file exists in target node, only if it is 'linkfile' its valid, otherwise, error out */ if (!ret) { - if (!check_is_linkfile (loc->inode, &stbuf, rsp_dict)) { + if (!check_is_linkfile (loc->inode, &stbuf, rsp_dict, + conf->link_xattr_name)) { gf_log (this->name, GF_LOG_WARNING, "%s: file exists in destination", loc->path); ret = -1; @@ -447,7 +619,7 @@ migrate_special_files (xlator_t *this, xlator_t *from, xlator_t *to, loc_t *loc, goto out; } - ret = syncop_symlink (to, loc, link, dict); + ret = syncop_symlink (to, loc, link, dict, 0); if (ret) { gf_log (this->name, GF_LOG_WARNING, "%s: creating symlink failed (%s)", @@ -461,7 +633,7 @@ migrate_special_files (xlator_t *this, xlator_t *from, xlator_t *to, loc_t *loc, ret = syncop_mknod (to, loc, st_mode_from_ia (buf->ia_prot, buf->ia_type), makedev (ia_major (buf->ia_rdev), - ia_minor (buf->ia_rdev)), dict); + ia_minor (buf->ia_rdev)), dict, 0); if (ret) { gf_log (this->name, GF_LOG_WARNING, "%s: mknod failed (%s)", loc->path, strerror (errno)); @@ -469,6 +641,15 @@ migrate_special_files (xlator_t *this, xlator_t *from, xlator_t *to, loc_t *loc, } done: + ret = syncop_setattr (to, loc, buf, + (GF_SET_ATTR_UID | GF_SET_ATTR_GID | + GF_SET_ATTR_MODE), NULL, NULL); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to perform setattr on %s (%s)", + loc->path, to->name, strerror (errno)); + } + ret = syncop_unlink (from, loc); if (ret) gf_log (this->name, GF_LOG_WARNING, "%s: unlink failed (%s)", @@ -504,7 +685,9 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, fd_t *dst_fd = NULL; dict_t *dict = NULL; dict_t *xattr = NULL; + dict_t *xattr_rsp = NULL; int file_has_holes = 0; + dht_conf_t *conf = this->private; gf_log (this->name, GF_LOG_INFO, "%s: attempting to move from %s to %s", loc->path, from->name, to->name); @@ -513,19 +696,29 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, 
xlator_t *to, if (!dict) goto out; + ret = dict_set_int32 (dict, conf->link_xattr_name, 256); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "%s: failed to set 'linkto' key in dict", loc->path); + goto out; + } + /* Phase 1 - Data migration is in progress from now on */ - ret = syncop_lookup (from, loc, NULL, &stbuf, NULL, NULL); + ret = syncop_lookup (from, loc, dict, &stbuf, &xattr_rsp, NULL); if (ret) { gf_log (this->name, GF_LOG_ERROR, "%s: lookup failed on %s (%s)", loc->path, from->name, strerror (errno)); goto out; } + /* we no more require this key */ + dict_del (dict, conf->link_xattr_name); + /* preserve source mode, so set the same to the destination */ src_ia_prot = stbuf.ia_prot; /* Check if file can be migrated */ - ret = __is_file_migratable (this, loc, &stbuf); + ret = __is_file_migratable (this, loc, &stbuf, xattr_rsp, flag); if (ret) goto out; @@ -536,18 +729,22 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, goto out; } + /* TODO: move all xattr related operations to fd based operations */ + ret = syncop_listxattr (from, loc, &xattr); + if (ret == -1) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to get xattr from %s (%s)", + loc->path, from->name, strerror (errno)); + /* create the destination, with required modes/xattr */ ret = __dht_rebalance_create_dst_file (to, from, loc, &stbuf, - dict, &dst_fd); + dict, &dst_fd, xattr); if (ret) goto out; - /* Should happen on all files when 'force' option is not given */ - if (flag != DHT_MIGRATE_EVEN_IF_LINK_EXISTS) { - ret = __dht_check_free_space (to, from, loc, &stbuf); - if (ret) { - goto out; - } + ret = __dht_check_free_space (to, from, loc, &stbuf, flag); + if (ret) { + goto out; } /* Open the source, and also update mode/xattr */ @@ -558,6 +755,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, goto out; } + ret = syncop_fstat (from, src_fd, &stbuf); if (ret) { gf_log (this->name, GF_LOG_ERROR, "failed to lookup %s on %s 
(%s)", @@ -570,8 +768,8 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, file_has_holes = 1; /* All I/O happens in this function */ - ret = __dht_rebalane_migrate_data (from, to, src_fd, dst_fd, - stbuf.ia_size, file_has_holes); + ret = __dht_rebalance_migrate_data (from, to, src_fd, dst_fd, + stbuf.ia_size, file_has_holes); if (ret) { gf_log (this->name, GF_LOG_ERROR, "%s: failed to migrate data", loc->path); @@ -587,22 +785,9 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, goto out; } - /* TODO: move all xattr related operations to fd based operations */ - ret = syncop_listxattr (from, loc, &xattr); - if (ret == -1) - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to get xattr from %s (%s)", - loc->path, from->name, strerror (errno)); - - ret = syncop_setxattr (to, loc, xattr, 0); - if (ret == -1) - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to set xattr on %s (%s)", - loc->path, to->name, strerror (errno)); - /* TODO: Sync the locks */ - ret = syncop_fsync (to, dst_fd); + ret = syncop_fsync (to, dst_fd, 0); if (ret) gf_log (this->name, GF_LOG_WARNING, "%s: failed to fsync on %s (%s)", @@ -639,6 +824,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, gf_log (this->name, GF_LOG_WARNING, "%s: failed to perform setattr on %s (%s)", loc->path, to->name, strerror (errno)); + goto out; } /* Because 'futimes' is not portable */ @@ -659,6 +845,24 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, gf_log (this->name, GF_LOG_WARNING, \ "%s: failed to perform setattr on %s (%s)", loc->path, from->name, strerror (errno)); + goto out; + } + + /* Free up the data blocks on the source node, as the whole + file is migrated */ + ret = syncop_ftruncate (from, src_fd, 0); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to perform truncate on %s (%s)", + loc->path, from->name, strerror (errno)); + } + + /* remove the 'linkto' xattr from 
the destination */ + ret = syncop_fremovexattr (to, dst_fd, conf->link_xattr_name); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to perform removexattr on %s (%s)", + loc->path, to->name, strerror (errno)); } /* Do a stat and check the gfid before unlink */ @@ -667,38 +871,23 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, gf_log (this->name, GF_LOG_WARNING, "%s: failed to do a stat on %s (%s)", loc->path, from->name, strerror (errno)); + goto out; } - if (uuid_compare (empty_iatt.ia_gfid, loc->inode->gfid) == 0) { + if (uuid_compare (empty_iatt.ia_gfid, loc->gfid) == 0) { /* take out the source from namespace */ ret = syncop_unlink (from, loc); if (ret) { gf_log (this->name, GF_LOG_WARNING, "%s: failed to perform unlink on %s (%s)", loc->path, from->name, strerror (errno)); + goto out; } } - /* Free up the data blocks on the source node, as the whole - file is migrated */ - ret = syncop_ftruncate (from, src_fd, 0); - if (ret) { - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to perform truncate on %s (%s)", - loc->path, from->name, strerror (errno)); - } - - /* remove the 'linkto' xattr from the destination */ - ret = syncop_removexattr (to, loc, DHT_LINKFILE_KEY); - if (ret) { - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to perform removexattr on %s (%s)", - loc->path, to->name, strerror (errno)); - } - ret = syncop_lookup (this, loc, NULL, NULL, NULL, NULL); if (ret) { - gf_log (this->name, GF_LOG_WARNING, + gf_log (this->name, GF_LOG_DEBUG, "%s: failed to lookup the file on subvolumes (%s)", loc->path, strerror (errno)); } @@ -714,6 +903,8 @@ out: if (xattr) dict_unref (xattr); + if (xattr_rsp) + dict_unref (xattr_rsp); if (dst_fd) syncop_close (dst_fd); @@ -788,7 +979,7 @@ rebalance_task_completion (int op_ret, call_frame_t *sync_frame, void *data) op_errno = EPERM; } - DHT_STACK_UNWIND (setxattr, sync_frame, op_ret, op_errno); + DHT_STACK_UNWIND (setxattr, sync_frame, op_ret, op_errno, NULL); 
return 0; } @@ -796,12 +987,829 @@ int dht_start_rebalance_task (xlator_t *this, call_frame_t *frame) { int ret = -1; - dht_conf_t *conf = NULL; - conf = this->private; - - ret = synctask_new (conf->env, rebalance_task, + ret = synctask_new (this->ctx->env, rebalance_task, rebalance_task_completion, frame, frame); return ret; } + +int +gf_listener_stop (xlator_t *this) +{ + glusterfs_ctx_t *ctx = NULL; + cmd_args_t *cmd_args = NULL; + int ret = 0; + + ctx = this->ctx; + GF_ASSERT (ctx); + cmd_args = &ctx->cmd_args; + if (cmd_args->sock_file) { + ret = unlink (cmd_args->sock_file); + if (ret && (ENOENT == errno)) { + ret = 0; + } + } + + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to unlink listener " + "socket %s, error: %s", cmd_args->sock_file, + strerror (errno)); + } + return ret; +} + +void +dht_build_root_inode (xlator_t *this, inode_t **inode) +{ + inode_table_t *itable = NULL; + uuid_t root_gfid = {0, }; + + itable = inode_table_new (0, this); + if (!itable) + return; + + root_gfid[15] = 1; + *inode = inode_find (itable, root_gfid); +} + +void +dht_build_root_loc (inode_t *inode, loc_t *loc) +{ + loc->path = "/"; + loc->inode = inode; + loc->inode->ia_type = IA_IFDIR; + memset (loc->gfid, 0, 16); + loc->gfid[15] = 1; +} + + +/* return values: 1 -> error, bug ignore and continue + 0 -> proceed + -1 -> error, handle it */ +int32_t +gf_defrag_handle_migrate_error (int32_t op_errno, gf_defrag_info_t *defrag) +{ + /* if errno is not ENOSPC or ENOTCONN, we can still continue + with rebalance process */ + if ((errno != ENOSPC) || (errno != ENOTCONN)) + return 1; + + if (errno == ENOTCONN) { + /* Most probably mount point went missing (mostly due + to a brick down), say rebalance failure to user, + let him restart it if everything is fine */ + defrag->defrag_status = GF_DEFRAG_STATUS_FAILED; + return -1; + } + + if (errno == ENOSPC) { + /* rebalance process itself failed, may be + remote brick went down, or write failed due to + disk full etc etc.. 
*/ + defrag->defrag_status = GF_DEFRAG_STATUS_FAILED; + return -1; + } + + return 0; +} + +static gf_boolean_t +gf_defrag_pattern_match (gf_defrag_info_t *defrag, char *name, uint64_t size) +{ + gf_defrag_pattern_list_t *trav = NULL; + gf_boolean_t match = _gf_false; + gf_boolean_t ret = _gf_false; + + GF_VALIDATE_OR_GOTO ("dht", defrag, out); + + trav = defrag->defrag_pattern; + while (trav) { + if (!fnmatch (trav->path_pattern, name, FNM_NOESCAPE)) { + match = _gf_true; + break; + } + trav = trav->next; + } + + if ((match == _gf_true) && (size >= trav->size)) + ret = _gf_true; + + out: + return ret; +} + +/* We do a depth first traversal of directories. But before we move into + * subdirs, we complete the data migration of those directories whose layouts + * have been fixed + */ + +int +gf_defrag_migrate_data (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, + dict_t *migrate_data) +{ + int ret = -1; + loc_t entry_loc = {0,}; + fd_t *fd = NULL; + gf_dirent_t entries; + gf_dirent_t *tmp = NULL; + gf_dirent_t *entry = NULL; + gf_boolean_t free_entries = _gf_false; + off_t offset = 0; + dict_t *dict = NULL; + struct iatt iatt = {0,}; + int32_t op_errno = 0; + char *uuid_str = NULL; + uuid_t node_uuid = {0,}; + int readdir_operrno = 0; + struct timeval dir_start = {0,}; + struct timeval end = {0,}; + double elapsed = {0,}; + struct timeval start = {0,}; + int32_t err = 0; + + gf_log (this->name, GF_LOG_INFO, "migrate data called on %s", + loc->path); + gettimeofday (&dir_start, NULL); + + fd = fd_create (loc->inode, defrag->pid); + if (!fd) { + gf_log (this->name, GF_LOG_ERROR, "Failed to create fd"); + goto out; + } + + ret = syncop_opendir (this, loc, fd); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to open dir %s", + loc->path); + goto out; + } + + INIT_LIST_HEAD (&entries.list); + + while ((ret = syncop_readdirp (this, fd, 131072, offset, NULL, + &entries)) != 0) { + + if (ret < 0) { + + gf_log (this->name, GF_LOG_ERROR, "Readdir returned %s." 
+ " Aborting migrate-data", + strerror(readdir_operrno)); + goto out; + } + + /* Need to keep track of ENOENT errno, that means, there is no + need to send more readdirp() */ + readdir_operrno = errno; + + if (list_empty (&entries.list)) + break; + + free_entries = _gf_true; + + list_for_each_entry_safe (entry, tmp, &entries.list, list) { + if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) { + ret = 1; + goto out; + } + + offset = entry->d_off; + + if (!strcmp (entry->d_name, ".") || + !strcmp (entry->d_name, "..")) + continue; + + if (IA_ISDIR (entry->d_stat.ia_type)) + continue; + + defrag->num_files_lookedup++; + if (defrag->stats == _gf_true) { + gettimeofday (&start, NULL); + } + if (defrag->defrag_pattern && + (gf_defrag_pattern_match (defrag, entry->d_name, + entry->d_stat.ia_size) + == _gf_false)) { + continue; + } + loc_wipe (&entry_loc); + ret =dht_build_child_loc (this, &entry_loc, loc, + entry->d_name); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Child loc" + " build failed"); + goto out; + } + + if (uuid_is_null (entry->d_stat.ia_gfid)) { + gf_log (this->name, GF_LOG_ERROR, "%s/%s" + " gfid not present", loc->path, + entry->d_name); + continue; + } + + uuid_copy (entry_loc.gfid, entry->d_stat.ia_gfid); + + if (uuid_is_null (loc->gfid)) { + gf_log (this->name, GF_LOG_ERROR, "%s/%s" + " gfid not present", loc->path, + entry->d_name); + continue; + } + + uuid_copy (entry_loc.pargfid, loc->gfid); + + entry_loc.inode->ia_type = entry->d_stat.ia_type; + + ret = syncop_lookup (this, &entry_loc, NULL, &iatt, + NULL, NULL); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "%s" + " lookup failed", entry_loc.path); + continue; + } + + ret = syncop_getxattr (this, &entry_loc, &dict, + GF_XATTR_NODE_UUID_KEY); + if(ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "Failed to " + "get node-uuid for %s", entry_loc.path); + continue; + } + + ret = dict_get_str (dict, GF_XATTR_NODE_UUID_KEY, + &uuid_str); + if(ret < 0) { + gf_log (this->name, GF_LOG_ERROR, 
"Failed to " + "get node-uuid from dict for %s", + entry_loc.path); + continue; + } + + if (uuid_parse (uuid_str, node_uuid)) { + gf_log (this->name, GF_LOG_ERROR, "uuid_parse " + "failed for %s", entry_loc.path); + continue; + } + + /* if file belongs to different node, skip migration + * the other node will take responsibility of migration + */ + if (uuid_compare (node_uuid, defrag->node_uuid)) { + gf_log (this->name, GF_LOG_TRACE, "%s does not" + "belong to this node", entry_loc.path); + continue; + } + + uuid_str = NULL; + + dict_del (dict, GF_XATTR_NODE_UUID_KEY); + + + /* if distribute is present, it will honor this key. + * -1 is returned if distribute is not present or file + * doesn't have a link-file. If file has link-file, the + * path of link-file will be the value, and also that + * guarantees that file has to be mostly migrated */ + + ret = syncop_getxattr (this, &entry_loc, &dict, + GF_XATTR_LINKINFO_KEY); + if (ret < 0) { + gf_log (this->name, GF_LOG_TRACE, "failed to " + "get link-to key for %s", + entry_loc.path); + continue; + } + + ret = syncop_setxattr (this, &entry_loc, migrate_data, + 0); + if (ret) { + err = op_errno; + /* errno is overloaded. 
See + * rebalance_task_completion () */ + if (err != ENOSPC) { + gf_log (this->name, GF_LOG_DEBUG, + "migrate-data skipped for %s" + " due to space constraints", + entry_loc.path); + defrag->skipped +=1; + } else{ + gf_log (this->name, GF_LOG_ERROR, + "migrate-data failed for %s", + entry_loc.path); + defrag->total_failures +=1; + } + } + + if (ret == -1) { + op_errno = errno; + ret = gf_defrag_handle_migrate_error (op_errno, + defrag); + + if (!ret) + gf_log (this->name, GF_LOG_DEBUG, + "migrate-data on %s failed: %s", + entry_loc.path, + strerror (op_errno)); + else if (ret == 1) + continue; + else if (ret == -1) + goto out; + } + + LOCK (&defrag->lock); + { + defrag->total_files += 1; + defrag->total_data += iatt.ia_size; + } + UNLOCK (&defrag->lock); + if (defrag->stats == _gf_true) { + gettimeofday (&end, NULL); + elapsed = (end.tv_sec - start.tv_sec) * 1e6 + + (end.tv_usec - start.tv_usec); + gf_log (this->name, GF_LOG_INFO, "Migration of " + "file:%s size:%"PRIu64" bytes took %.2f" + "secs", entry_loc.path, iatt.ia_size, + elapsed/1e6); + } + } + + gf_dirent_free (&entries); + free_entries = _gf_false; + INIT_LIST_HEAD (&entries.list); + + if (readdir_operrno == ENOENT) + break; + } + + gettimeofday (&end, NULL); + elapsed = (end.tv_sec - dir_start.tv_sec) * 1e6 + + (end.tv_usec - dir_start.tv_usec); + gf_log (this->name, GF_LOG_INFO, "Migration operation on dir %s took " + "%.2f secs", loc->path, elapsed/1e6); + ret = 0; +out: + if (free_entries) + gf_dirent_free (&entries); + + loc_wipe (&entry_loc); + + if (dict) + dict_unref(dict); + + if (fd) + fd_unref (fd); + return ret; + +} + + +int +gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, + dict_t *fix_layout, dict_t *migrate_data) +{ + int ret = -1; + loc_t entry_loc = {0,}; + fd_t *fd = NULL; + gf_dirent_t entries; + gf_dirent_t *tmp = NULL; + gf_dirent_t *entry = NULL; + gf_boolean_t free_entries = _gf_false; + dict_t *dict = NULL; + off_t offset = 0; + struct iatt iatt = 
{0,}; + int readdirp_errno = 0; + + ret = syncop_lookup (this, loc, NULL, &iatt, NULL, NULL); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Lookup failed on %s", + loc->path); + goto out; + } + + if (defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX) { + ret = gf_defrag_migrate_data (this, defrag, loc, migrate_data); + if (ret) + goto out; + } + + gf_log (this->name, GF_LOG_TRACE, "fix layout called on %s", loc->path); + + fd = fd_create (loc->inode, defrag->pid); + if (!fd) { + gf_log (this->name, GF_LOG_ERROR, "Failed to create fd"); + ret = -1; + goto out; + } + + ret = syncop_opendir (this, loc, fd); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to open dir %s", + loc->path); + ret = -1; + goto out; + } + + INIT_LIST_HEAD (&entries.list); + while ((ret = syncop_readdirp (this, fd, 131072, offset, NULL, + &entries)) != 0) + { + + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "Readdir returned %s" + ". Aborting fix-layout",strerror(errno)); + goto out; + } + + /* Need to keep track of ENOENT errno, that means, there is no + need to send more readdirp() */ + readdirp_errno = errno; + + if (list_empty (&entries.list)) + break; + + free_entries = _gf_true; + + list_for_each_entry_safe (entry, tmp, &entries.list, list) { + if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) { + ret = 1; + goto out; + } + + offset = entry->d_off; + + if (!strcmp (entry->d_name, ".") || + !strcmp (entry->d_name, "..")) + continue; + + if (!IA_ISDIR (entry->d_stat.ia_type)) + continue; + + loc_wipe (&entry_loc); + ret =dht_build_child_loc (this, &entry_loc, loc, + entry->d_name); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Child loc" + " build failed"); + goto out; + } + + if (uuid_is_null (entry->d_stat.ia_gfid)) { + gf_log (this->name, GF_LOG_ERROR, "%s/%s" + " gfid not present", loc->path, + entry->d_name); + continue; + } + + entry_loc.inode->ia_type = entry->d_stat.ia_type; + + uuid_copy (entry_loc.gfid, entry->d_stat.ia_gfid); + if (uuid_is_null 
(loc->gfid)) { + gf_log (this->name, GF_LOG_ERROR, "%s/%s" + " gfid not present", loc->path, + entry->d_name); + continue; + } + + uuid_copy (entry_loc.pargfid, loc->gfid); + + ret = syncop_lookup (this, &entry_loc, NULL, &iatt, + NULL, NULL); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "%s" + " lookup failed", entry_loc.path); + continue; + } + + ret = syncop_setxattr (this, &entry_loc, fix_layout, + 0); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Setxattr " + "failed for %s", entry_loc.path); + defrag->defrag_status = + GF_DEFRAG_STATUS_FAILED; + defrag->total_failures ++; + goto out; + } + ret = gf_defrag_fix_layout (this, defrag, &entry_loc, + fix_layout, migrate_data); + + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Fix layout " + "failed for %s", entry_loc.path); + defrag->total_failures++; + goto out; + } + + } + gf_dirent_free (&entries); + free_entries = _gf_false; + INIT_LIST_HEAD (&entries.list); + if (readdirp_errno == ENOENT) + break; + } + + ret = 0; +out: + if (free_entries) + gf_dirent_free (&entries); + + loc_wipe (&entry_loc); + + if (dict) + dict_unref(dict); + + if (fd) + fd_unref (fd); + + return ret; + +} + + +int +gf_defrag_start_crawl (void *data) +{ + xlator_t *this = NULL; + dht_conf_t *conf = NULL; + gf_defrag_info_t *defrag = NULL; + int ret = -1; + loc_t loc = {0,}; + struct iatt iatt = {0,}; + struct iatt parent = {0,}; + dict_t *fix_layout = NULL; + dict_t *migrate_data = NULL; + dict_t *status = NULL; + glusterfs_ctx_t *ctx = NULL; + + this = data; + if (!this) + goto out; + + ctx = this->ctx; + if (!ctx) + goto out; + + conf = this->private; + if (!conf) + goto out; + + defrag = conf->defrag; + if (!defrag) + goto out; + + gettimeofday (&defrag->start_time, NULL); + dht_build_root_inode (this, &defrag->root_inode); + if (!defrag->root_inode) + goto out; + + dht_build_root_loc (defrag->root_inode, &loc); + + /* fix-layout on '/' first */ + + ret = syncop_lookup (this, &loc, NULL, &iatt, NULL, &parent); + + if (ret) { + 
gf_log (this->name, GF_LOG_ERROR, "look up on / failed"); + goto out; + } + + fix_layout = dict_new (); + if (!fix_layout) { + ret = -1; + goto out; + } + + ret = dict_set_str (fix_layout, GF_XATTR_FIX_LAYOUT_KEY, "yes"); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to set dict str"); + goto out; + } + + ret = syncop_setxattr (this, &loc, fix_layout, 0); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "fix layout on %s failed", + loc.path); + defrag->total_failures++; + goto out; + } + + if (defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX) { + migrate_data = dict_new (); + if (!migrate_data) { + ret = -1; + goto out; + } + if (defrag->cmd == GF_DEFRAG_CMD_START_FORCE) + ret = dict_set_str (migrate_data, + "distribute.migrate-data", "force"); + else + ret = dict_set_str (migrate_data, + "distribute.migrate-data", + "non-force"); + if (ret) + goto out; + } + ret = gf_defrag_fix_layout (this, defrag, &loc, fix_layout, + migrate_data); + if ((defrag->defrag_status != GF_DEFRAG_STATUS_STOPPED) && + (defrag->defrag_status != GF_DEFRAG_STATUS_FAILED)) { + defrag->defrag_status = GF_DEFRAG_STATUS_COMPLETE; + } + + + +out: + LOCK (&defrag->lock); + { + status = dict_new (); + gf_defrag_status_get (defrag, status); + if (ctx->notify) + ctx->notify (GF_EN_DEFRAG_STATUS, status); + if (status) + dict_unref (status); + defrag->is_exiting = 1; + } + UNLOCK (&defrag->lock); + + if (defrag) { + GF_FREE (defrag); + conf->defrag = NULL; + } + + return ret; +} + + +static int +gf_defrag_done (int ret, call_frame_t *sync_frame, void *data) +{ + gf_listener_stop (sync_frame->this); + + STACK_DESTROY (sync_frame->root); + kill (getpid(), SIGTERM); + return 0; +} + +void * +gf_defrag_start (void *data) +{ + int ret = -1; + call_frame_t *frame = NULL; + dht_conf_t *conf = NULL; + gf_defrag_info_t *defrag = NULL; + xlator_t *this = NULL; + + this = data; + conf = this->private; + if (!conf) + goto out; + + defrag = conf->defrag; + if (!defrag) + goto out; + + frame = 
create_frame (this, this->ctx->pool); + if (!frame) + goto out; + + frame->root->pid = GF_CLIENT_PID_DEFRAG; + + defrag->pid = frame->root->pid; + + defrag->defrag_status = GF_DEFRAG_STATUS_STARTED; + + ret = synctask_new (this->ctx->env, gf_defrag_start_crawl, + gf_defrag_done, frame, this); + + if (ret) + gf_log (this->name, GF_LOG_ERROR, "Could not create" + " task for rebalance"); +out: + return NULL; +} + +int +gf_defrag_status_get (gf_defrag_info_t *defrag, dict_t *dict) +{ + int ret = 0; + uint64_t files = 0; + uint64_t size = 0; + uint64_t lookup = 0; + uint64_t failures = 0; + uint64_t skipped = 0; + char *status = ""; + double elapsed = 0; + struct timeval end = {0,}; + + + if (!defrag) + goto out; + + ret = 0; + if (defrag->defrag_status == GF_DEFRAG_STATUS_NOT_STARTED) + goto out; + + files = defrag->total_files; + size = defrag->total_data; + lookup = defrag->num_files_lookedup; + failures = defrag->total_failures; + skipped = defrag->skipped; + + gettimeofday (&end, NULL); + + elapsed = end.tv_sec - defrag->start_time.tv_sec; + + if (!dict) + goto log; + + ret = dict_set_uint64 (dict, "files", files); + if (ret) + gf_log (THIS->name, GF_LOG_WARNING, + "failed to set file count"); + + ret = dict_set_uint64 (dict, "size", size); + if (ret) + gf_log (THIS->name, GF_LOG_WARNING, + "failed to set size of xfer"); + + ret = dict_set_uint64 (dict, "lookups", lookup); + if (ret) + gf_log (THIS->name, GF_LOG_WARNING, + "failed to set lookedup file count"); + + + ret = dict_set_int32 (dict, "status", defrag->defrag_status); + if (ret) + gf_log (THIS->name, GF_LOG_WARNING, + "failed to set status"); + if (elapsed) { + ret = dict_set_double (dict, "run-time", elapsed); + if (ret) + gf_log (THIS->name, GF_LOG_WARNING, + "failed to set run-time"); + } + + ret = dict_set_uint64 (dict, "failures", failures); + if (ret) + gf_log (THIS->name, GF_LOG_WARNING, + "failed to set failure count"); + + ret = dict_set_uint64 (dict, "skipped", skipped); + if (ret) + gf_log 
(THIS->name, GF_LOG_WARNING, + "failed to set skipped file count"); +log: + switch (defrag->defrag_status) { + case GF_DEFRAG_STATUS_NOT_STARTED: + status = "not started"; + break; + case GF_DEFRAG_STATUS_STARTED: + status = "in progress"; + break; + case GF_DEFRAG_STATUS_STOPPED: + status = "stopped"; + break; + case GF_DEFRAG_STATUS_COMPLETE: + status = "completed"; + break; + case GF_DEFRAG_STATUS_FAILED: + status = "failed"; + break; + default: + break; + } + + gf_log (THIS->name, GF_LOG_INFO, "Rebalance is %s. Time taken is %.2f " + "secs", status, elapsed); + gf_log (THIS->name, GF_LOG_INFO, "Files migrated: %"PRIu64", size: %" + PRIu64", lookups: %"PRIu64", failures: %"PRIu64", skipped: " + "%"PRIu64, files, size, lookup, failures, skipped); + + +out: + return 0; +} + +int +gf_defrag_stop (gf_defrag_info_t *defrag, dict_t *output) +{ + /* TODO: set a variable 'stop_defrag' here, it should be checked + in defrag loop */ + int ret = -1; + GF_ASSERT (defrag); + + if (defrag->defrag_status == GF_DEFRAG_STATUS_NOT_STARTED) { + goto out; + } + + gf_log ("", GF_LOG_INFO, "Received stop command on rebalance"); + defrag->defrag_status = GF_DEFRAG_STATUS_STOPPED; + + if (output) + gf_defrag_status_get (defrag, output); + ret = 0; +out: + gf_log ("", GF_LOG_DEBUG, "Returning %d", ret); + return ret; +} diff --git a/xlators/cluster/dht/src/dht-rename.c b/xlators/cluster/dht/src/dht-rename.c index 02b12887f..5d6f4f232 100644 --- a/xlators/cluster/dht/src/dht-rename.c +++ b/xlators/cluster/dht/src/dht-rename.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. 
- - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ /* TODO: link(oldpath, newpath) fails if newpath already exists. DHT should @@ -35,7 +26,8 @@ int dht_rename_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *stbuf, struct iatt *preoldparent, struct iatt *postoldparent, - struct iatt *prenewparent, struct iatt *postnewparent) + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) { dht_local_t *local = NULL; int this_call_cnt = 0; @@ -84,7 +76,7 @@ unwind: DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno, &local->stbuf, &local->preoldparent, &local->postoldparent, - &local->preparent, &local->postparent); + &local->preparent, &local->postparent, xdata); } return 0; @@ -97,7 +89,7 @@ dht_rename_hashed_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *preoldparent, struct iatt *postoldparent, struct iatt *prenewparent, - struct iatt *postnewparent) + struct iatt *postnewparent, dict_t *xdata) { dht_conf_t *conf = NULL; dht_local_t *local = NULL; @@ -147,7 +139,7 @@ dht_rename_hashed_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, STACK_WIND (frame, dht_rename_dir_cbk, conf->subvolumes[i], conf->subvolumes[i]->fops->rename, - &local->loc, &local->loc2); + &local->loc, &local->loc2, NULL); if (!--call_cnt) break; } @@ -164,7 +156,7 @@ unwind: DHT_STACK_UNWIND 
(rename, frame, local->op_ret, local->op_errno, &local->stbuf, &local->preoldparent, &local->postoldparent, - &local->preparent, &local->postparent); + &local->preparent, &local->postparent, NULL); return 0; } @@ -185,19 +177,20 @@ dht_rename_dir_do (call_frame_t *frame, xlator_t *this) STACK_WIND (frame, dht_rename_hashed_dir_cbk, local->dst_hashed, local->dst_hashed->fops->rename, - &local->loc, &local->loc2); + &local->loc, &local->loc2, NULL); return 0; err: DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno, NULL, NULL, - NULL, NULL, NULL); + NULL, NULL, NULL, NULL); return 0; } int dht_rename_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, gf_dirent_t *entries) + int op_ret, int op_errno, gf_dirent_t *entries, + dict_t *xdata) { dht_local_t *local = NULL; int this_call_cnt = -1; @@ -226,7 +219,7 @@ dht_rename_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int dht_rename_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, fd_t *fd) + int op_ret, int op_errno, fd_t *fd, dict_t *xdata) { dht_local_t *local = NULL; int this_call_cnt = -1; @@ -246,7 +239,7 @@ dht_rename_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, STACK_WIND (frame, dht_rename_readdir_cbk, prev->this, prev->this->fops->readdir, - local->fd, 4096, 0); + local->fd, 4096, 0, NULL); return 0; @@ -302,22 +295,54 @@ dht_rename_dir (call_frame_t *frame, xlator_t *this) STACK_WIND (frame, dht_rename_opendir_cbk, conf->subvolumes[i], conf->subvolumes[i]->fops->opendir, - &local->loc2, local->fd); + &local->loc2, local->fd, NULL); } return 0; err: op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); + DHT_STACK_UNWIND (rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL, NULL); return 0; } +#define DHT_MARK_FOP_INTERNAL(xattr) do { \ + int tmp = -1; \ + if (!xattr) { \ + xattr = dict_new (); \ + if (!xattr) \ + break; \ + } \ + tmp = dict_set_str (xattr, GLUSTERFS_INTERNAL_FOP_KEY, "yes"); \ + if (tmp) { \ + gf_log (this->name, GF_LOG_ERROR, "Failed to set" \ + " internal dict key for %s", local->loc.path); \ + } \ + }while (0) +int +dht_rename_done (call_frame_t *frame, xlator_t *this) +{ + dht_local_t *local = NULL; + local = frame->local; + + if (local->linked == _gf_true) { + local->linked = _gf_false; + dht_linkfile_attr_heal (frame, this); + } + DHT_STRIP_PHASE1_FLAGS (&local->stbuf); + DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno, + &local->stbuf, &local->preoldparent, + &local->postoldparent, &local->preparent, + &local->postparent, NULL); + + return 0; +} int dht_rename_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { dht_local_t *local = NULL; call_frame_t *prev = NULL; @@ -346,11 +371,7 @@ dht_rename_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, WIPE (&local->postparent); if (is_last_call (this_call_cnt)) { - DHT_STRIP_PHASE1_FLAGS (&local->stbuf); - DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno, - &local->stbuf, &local->preoldparent, - &local->postoldparent, &local->preparent, - &local->postparent); + dht_rename_done (frame, this); } out: @@ -368,7 +389,7 @@ dht_rename_cleanup (call_frame_t *frame) xlator_t *dst_hashed = NULL; xlator_t *dst_cached = NULL; int call_cnt = 0; - + dict_t *xattr = NULL; local = frame->local; this = frame->this; @@ -392,13 +413,15 @@ dht_rename_cleanup (call_frame_t *frame) if (!call_cnt) goto nolinks; + 
DHT_MARK_FOP_INTERNAL (xattr); + if (dst_hashed != src_hashed && dst_hashed != src_cached) { gf_log (this->name, GF_LOG_TRACE, "unlinking linkfile %s @ %s => %s", local->loc.path, dst_hashed->name, src_cached->name); STACK_WIND (frame, dht_rename_unlink_cbk, dst_hashed, dst_hashed->fops->unlink, - &local->loc); + &local->loc, 0, xattr); } if (src_cached != dst_hashed) { @@ -407,9 +430,12 @@ dht_rename_cleanup (call_frame_t *frame) local->loc2.path, src_cached->name); STACK_WIND (frame, dht_rename_unlink_cbk, src_cached, src_cached->fops->unlink, - &local->loc2); + &local->loc2, 0, xattr); } + if (xattr) + dict_unref (xattr); + return 0; nolinks: @@ -422,7 +448,7 @@ nolinks: DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno, &local->stbuf, &local->preoldparent, &local->postoldparent, &local->preparent, - &local->postparent); + &local->postparent, NULL); return 0; } @@ -430,9 +456,10 @@ nolinks: int dht_rename_links_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent) + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct iatt *stbuf, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { call_frame_t *prev = NULL; dht_local_t *local = NULL; @@ -446,6 +473,10 @@ dht_rename_links_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->loc.path, prev->this->name, strerror (op_errno)); } + if (local->linked == _gf_true) { + local->linked = _gf_false; + dht_linkfile_attr_heal (frame, this); + } DHT_STACK_DESTROY (frame); return 0; @@ -456,7 +487,8 @@ int dht_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *stbuf, struct iatt *preoldparent, struct iatt *postoldparent, - struct iatt *prenewparent, struct iatt *postnewparent) + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) { dht_local_t *local = NULL; 
call_frame_t *prev = NULL; @@ -467,6 +499,7 @@ dht_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, xlator_t *rename_subvol = NULL; call_frame_t *link_frame = NULL; dht_local_t *link_local = NULL; + dict_t *xattr = NULL; local = frame->local; prev = cookie; @@ -476,6 +509,8 @@ dht_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, dst_hashed = local->dst_hashed; dst_cached = local->dst_cached; + if (local->linked == _gf_true) + FRAME_SU_UNDO (frame, dht_local_t); if (op_ret == -1) { gf_log (this->name, GF_LOG_WARNING, "%s: rename on %s failed (%s)", local->loc.path, @@ -505,15 +540,25 @@ dht_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, uuid_copy (link_local->gfid, local->loc.inode->gfid); dht_linkfile_create (link_frame, dht_rename_links_create_cbk, - src_cached, dst_hashed, &link_local->loc); + this, src_cached, dst_hashed, + &link_local->loc); } err: - dht_iatt_merge (this, &local->stbuf, stbuf, prev->this); - dht_iatt_merge (this, &local->preoldparent, preoldparent, prev->this); - dht_iatt_merge (this, &local->postoldparent, postoldparent, prev->this); - dht_iatt_merge (this, &local->preparent, prenewparent, prev->this); - dht_iatt_merge (this, &local->postparent, postnewparent, prev->this); + /* Merge attrs only from src_cached. In case there of src_cached != + * dst_hashed, this ignores linkfile attrs. */ + if (prev->this == src_cached) { + dht_iatt_merge (this, &local->stbuf, stbuf, prev->this); + dht_iatt_merge (this, &local->preoldparent, preoldparent, + prev->this); + dht_iatt_merge (this, &local->postoldparent, postoldparent, + prev->this); + dht_iatt_merge (this, &local->preparent, prenewparent, + prev->this); + dht_iatt_merge (this, &local->postparent, postnewparent, + prev->this); + } + /* NOTE: rename_subvol is the same subvolume from which dht_rename_cbk * is called. 
since rename has already happened on rename_subvol, @@ -538,6 +583,8 @@ err: if (local->call_cnt == 0) goto unwind; + DHT_MARK_FOP_INTERNAL (xattr); + if (src_cached != dst_hashed && src_cached != dst_cached) { gf_log (this->name, GF_LOG_TRACE, "deleting old src datafile %s @ %s", @@ -545,7 +592,7 @@ err: STACK_WIND (frame, dht_rename_unlink_cbk, src_cached, src_cached->fops->unlink, - &local->loc); + &local->loc, 0, xattr); } if (src_hashed != rename_subvol && src_hashed != src_cached) { @@ -555,7 +602,7 @@ err: STACK_WIND (frame, dht_rename_unlink_cbk, src_hashed, src_hashed->fops->unlink, - &local->loc); + &local->loc, 0, xattr); } if (dst_cached @@ -567,8 +614,10 @@ err: STACK_WIND (frame, dht_rename_unlink_cbk, dst_cached, dst_cached->fops->unlink, - &local->loc2); + &local->loc2, 0, xattr); } + if (xattr) + dict_unref (xattr); return 0; unwind: @@ -576,16 +625,16 @@ unwind: WIPE (&local->postoldparent); WIPE (&local->preparent); WIPE (&local->postparent); + if (xattr) + dict_unref (xattr); - DHT_STRIP_PHASE1_FLAGS (&local->stbuf); - DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno, - &local->stbuf, &local->preoldparent, - &local->postoldparent, &local->preparent, - &local->postparent); + dht_rename_done (frame, this); return 0; cleanup: + if (xattr) + dict_unref (xattr); dht_rename_cleanup (frame); return 0; @@ -619,9 +668,11 @@ dht_do_rename (call_frame_t *frame) "renaming %s => %s (%s)", local->loc.path, local->loc2.path, rename_subvol->name); + if (local->linked == _gf_true) + FRAME_SU_DO (frame, dht_local_t); STACK_WIND (frame, dht_rename_cbk, rename_subvol, rename_subvol->fops->rename, - &local->loc, &local->loc2); + &local->loc, &local->loc2, NULL); return 0; } @@ -631,7 +682,8 @@ int dht_rename_links_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent) + struct iatt *preparent, struct iatt *postparent, + dict_t 
*xdata) { dht_local_t *local = NULL; call_frame_t *prev = NULL; @@ -646,7 +698,11 @@ dht_rename_links_cbk (call_frame_t *frame, void *cookie, xlator_t *this, "link/file on %s failed (%s)", prev->this->name, strerror (op_errno)); local->op_ret = -1; - local->op_errno = op_errno; + if (op_errno != ENOENT) + local->op_errno = op_errno; + } else if (local->src_cached == prev->this) { + /* merge of attr returned only from linkfile creation */ + dht_iatt_merge (this, &local->stbuf, stbuf, prev->this); } this_call_cnt = dht_frame_return (frame); @@ -669,7 +725,8 @@ cleanup: int dht_rename_unlink_links_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iatt *preparent, struct iatt *postparent) + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { dht_local_t *local = NULL; call_frame_t *prev = NULL; @@ -711,6 +768,7 @@ dht_rename_create_links (call_frame_t *frame) xlator_t *dst_hashed = NULL; xlator_t *dst_cached = NULL; int call_cnt = 0; + dict_t *xattr = NULL; local = frame->local; @@ -721,6 +779,7 @@ dht_rename_create_links (call_frame_t *frame) dst_hashed = local->dst_hashed; dst_cached = local->dst_cached; + DHT_MARK_FOP_INTERNAL (xattr); if (src_cached == dst_cached) { if (dst_hashed == dst_cached) @@ -732,7 +791,7 @@ dht_rename_create_links (call_frame_t *frame) STACK_WIND (frame, dht_rename_unlink_links_cbk, dst_hashed, dst_hashed->fops->unlink, - &local->loc2); + &local->loc2, 0, xattr); return 0; } @@ -749,7 +808,7 @@ dht_rename_create_links (call_frame_t *frame) "linkfile %s @ %s => %s", local->loc.path, dst_hashed->name, src_cached->name); memcpy (local->gfid, local->loc.inode->gfid, 16); - dht_linkfile_create (frame, dht_rename_links_cbk, + dht_linkfile_create (frame, dht_rename_links_cbk, this, src_cached, dst_hashed, &local->loc); } @@ -759,7 +818,7 @@ dht_rename_create_links (call_frame_t *frame) local->loc2.path, src_cached->name); STACK_WIND (frame, dht_rename_links_cbk, src_cached, 
src_cached->fops->link, - &local->loc, &local->loc2); + &local->loc, &local->loc2, xattr); } nolinks: @@ -767,6 +826,8 @@ nolinks: /* skip to next step */ dht_do_rename (frame); } + if (xattr) + dict_unref (xattr); return 0; } @@ -774,7 +835,7 @@ nolinks: int dht_rename (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc) + loc_t *oldloc, loc_t *newloc, dict_t *xdata) { xlator_t *src_cached = NULL; xlator_t *src_hashed = NULL; @@ -856,7 +917,8 @@ dht_rename (call_frame_t *frame, xlator_t *this, err: op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); + DHT_STACK_UNWIND (rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL, NULL); return 0; } diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c index 1c881be39..3fe96b1c7 100644 --- a/xlators/cluster/dht/src/dht-selfheal.c +++ b/xlators/cluster/dht/src/dht-selfheal.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. 
+ This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef _CONFIG_H @@ -26,7 +17,7 @@ #include "glusterfs.h" #include "xlator.h" #include "dht-common.h" - +#include "glusterfs-acl.h" #define DHT_SET_LAYOUT_RANGE(layout,i,srt,chunk,cnt,path) do { \ layout->list[i].start = srt; \ @@ -38,43 +29,40 @@ layout->list[i].xlator->name, path); \ } while (0) +#define DHT_RESET_LAYOUT_RANGE(layout) do { \ + int cnt = 0; \ + for (cnt = 0; cnt < layout->cnt; cnt++ ) { \ + layout->list[cnt].start = 0; \ + layout->list[cnt].stop = 0; \ + } \ + } while (0) -static inline uint32_t -dht_find_overlap (int idx, int cnk_idx, uint32_t start, uint32_t stop, - uint32_t chunk_size) +static uint32_t +dht_overlap_calc (dht_layout_t *old, int o, dht_layout_t *new, int n) { - uint32_t overlap = 0; - uint32_t chunk_begin = 0; + if (o >= old->cnt || n >= new->cnt) + return 0; - chunk_begin = cnk_idx * chunk_size; + if (old->list[o].err > 0 || new->list[n].err > 0) + return 0; - /* There is no chance of overlap */ - if ((chunk_begin > stop) || - ((chunk_begin + chunk_size) < start)) - goto out; - - if ((chunk_begin <= start) && - ((chunk_begin + chunk_size) <= stop)) { - overlap = ((chunk_begin + chunk_size) - start); - goto out; + if (old->list[o].start == old->list[o].stop) { + return 0; } - if ((chunk_begin <= start) && - ((chunk_begin + chunk_size) >= stop)) { - overlap = (stop - start); - goto out; + if (new->list[n].start == new->list[n].stop) { + return 0; } - if ((chunk_begin < stop) && - ((chunk_begin + chunk_size) >= stop)) { - overlap = (stop - chunk_begin); - goto out; - } + if ((old->list[o].start > new->list[n].stop) || + (old->list[o].stop < new->list[n].start)) + return 0; -out: - return overlap; + return min (old->list[o].stop, new->list[n].stop) - + max 
(old->list[o].start, new->list[n].start) + 1; } + int dht_selfheal_dir_finish (call_frame_t *frame, xlator_t *this, int ret) { @@ -82,7 +70,7 @@ dht_selfheal_dir_finish (call_frame_t *frame, xlator_t *this, int ret) local = frame->local; local->selfheal.dir_cbk (frame, NULL, frame->this, ret, - local->op_errno); + local->op_errno, NULL); return 0; } @@ -90,7 +78,7 @@ dht_selfheal_dir_finish (call_frame_t *frame, xlator_t *this, int ret) int dht_selfheal_dir_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno) + int op_ret, int op_errno, dict_t *xdata) { dht_local_t *local = NULL; call_frame_t *prev = NULL; @@ -129,18 +117,32 @@ dht_selfheal_dir_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int dht_selfheal_dir_xattr_persubvol (call_frame_t *frame, loc_t *loc, - dht_layout_t *layout, int i) + dht_layout_t *layout, int i, + xlator_t *req_subvol) { xlator_t *subvol = NULL; dict_t *xattr = NULL; int ret = 0; xlator_t *this = NULL; int32_t *disk_layout = NULL; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; - - subvol = layout->list[i].xlator; + local = frame->local; + if (req_subvol) + subvol = req_subvol; + else + subvol = layout->list[i].xlator; this = frame->this; + GF_VALIDATE_OR_GOTO ("", this, err); + GF_VALIDATE_OR_GOTO (this->name, layout, err); + GF_VALIDATE_OR_GOTO (this->name, local, err); + GF_VALIDATE_OR_GOTO (this->name, subvol, err); + VALIDATE_OR_GOTO (this->private, err); + + conf = this->private; + xattr = get_new_dict (); if (!xattr) { goto err; @@ -154,8 +156,7 @@ dht_selfheal_dir_xattr_persubvol (call_frame_t *frame, loc_t *loc, goto err; } - ret = dict_set_bin (xattr, "trusted.glusterfs.dht", - disk_layout, 4 * 4); + ret = dict_set_bin (xattr, conf->xattr_name, disk_layout, 4 * 4); if (ret == -1) { gf_log (this->name, GF_LOG_WARNING, "%s: (subvol %s) failed to set xattr dictionary", @@ -171,9 +172,12 @@ dht_selfheal_dir_xattr_persubvol (call_frame_t *frame, loc_t *loc, dict_ref (xattr); 
+ if (!uuid_is_null (local->gfid)) + uuid_copy (loc->gfid, local->gfid); + STACK_WIND (frame, dht_selfheal_dir_xattr_cbk, subvol, subvol->fops->setxattr, - loc, xattr, 0); + loc, xattr, 0, NULL); dict_unref (xattr); @@ -183,11 +187,10 @@ err: if (xattr) dict_destroy (xattr); - if (disk_layout) - GF_FREE (disk_layout); + GF_FREE (disk_layout); dht_selfheal_dir_xattr_cbk (frame, subvol, frame->this, - -1, ENOMEM); + -1, ENOMEM, NULL); return 0; } @@ -198,21 +201,42 @@ dht_fix_dir_xattr (call_frame_t *frame, loc_t *loc, dht_layout_t *layout) int i = 0; int count = 0; xlator_t *this = NULL; + dht_conf_t *conf = NULL; + dht_layout_t *dummy = NULL; local = frame->local; this = frame->this; + conf = this->private; gf_log (this->name, GF_LOG_DEBUG, "writing the new range for all subvolumes"); - local->call_cnt = count = layout->cnt; + local->call_cnt = count = conf->subvolume_cnt; for (i = 0; i < layout->cnt; i++) { - dht_selfheal_dir_xattr_persubvol (frame, loc, layout, i); + dht_selfheal_dir_xattr_persubvol (frame, loc, layout, i, NULL); if (--count == 0) - break; + goto out; } + /* if we are here, subvolcount > layout_count. subvols-per-directory + * option might be set here. 
We need to clear out layout from the + * non-participating subvolumes, else it will result in overlaps */ + dummy = dht_layout_new (this, 1); + if (!dummy) + goto out; + for (i = 0; i < conf->subvolume_cnt; i++) { + if (_gf_false == + dht_is_subvol_in_layout (layout, conf->subvolumes[i])) { + dht_selfheal_dir_xattr_persubvol (frame, loc, dummy, 0, + conf->subvolumes[i]); + if (--count == 0) + break; + } + } + + dht_layout_unref (this, dummy); +out: return 0; } @@ -223,14 +247,17 @@ dht_selfheal_dir_xattr (call_frame_t *frame, loc_t *loc, dht_layout_t *layout) int missing_xattr = 0; int i = 0; xlator_t *this = NULL; + dht_conf_t *conf = NULL; + dht_layout_t *dummy = NULL; local = frame->local; this = frame->this; + conf = this->private; for (i = 0; i < layout->cnt; i++) { if (layout->list[i].err != -1 || !layout->list[i].stop) { /* err != -1 would mean xattr present on the directory - * or the directory is itself non existant. + * or the directory is non existent. * !layout->list[i].stop would mean layout absent */ @@ -254,18 +281,30 @@ dht_selfheal_dir_xattr (call_frame_t *frame, loc_t *loc, dht_layout_t *layout) if (layout->list[i].err != -1 || !layout->list[i].stop) continue; - dht_selfheal_dir_xattr_persubvol (frame, loc, layout, i); + dht_selfheal_dir_xattr_persubvol (frame, loc, layout, i, NULL); if (--missing_xattr == 0) break; } + dummy = dht_layout_new (this, 1); + if (!dummy) + goto out; + for (i = 0; i < conf->subvolume_cnt; i++) { + if (_gf_false == + dht_is_subvol_in_layout (layout, conf->subvolumes[i])) { + dht_selfheal_dir_xattr_persubvol (frame, loc, dummy, 0, + conf->subvolumes[i]); + } + } + dht_layout_unref (this, dummy); +out: return 0; } int dht_selfheal_dir_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, struct iatt *statpre, - struct iatt *statpost) + struct iatt *statpost, dict_t *xdata) { dht_local_t *local = NULL; dht_layout_t *layout = NULL; @@ -306,6 +345,9 @@ dht_selfheal_dir_setattr 
(call_frame_t *frame, loc_t *loc, struct iatt *stbuf, return 0; } + if (!uuid_is_null (local->gfid)) + uuid_copy (loc->gfid, local->gfid); + local->call_cnt = missing_attr; for (i = 0; i < layout->cnt; i++) { if (layout->list[i].err == -1) { @@ -316,7 +358,7 @@ dht_selfheal_dir_setattr (call_frame_t *frame, loc_t *loc, struct iatt *stbuf, STACK_WIND (frame, dht_selfheal_dir_setattr_cbk, layout->list[i].xlator, layout->list[i].xlator->fops->setattr, - loc, stbuf, valid); + loc, stbuf, valid, NULL); } } @@ -327,7 +369,8 @@ int dht_selfheal_dir_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent) + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { dht_local_t *local = NULL; dht_layout_t *layout = NULL; @@ -373,6 +416,46 @@ out: return 0; } +void +dht_selfheal_dir_mkdir_setacl (dict_t *xattr, dict_t *dict) +{ + data_t *acl_default = NULL; + data_t *acl_access = NULL; + xlator_t *this = NULL; + int ret = -1; + + GF_ASSERT (xattr); + GF_ASSERT (dict); + + this = THIS; + GF_ASSERT (this); + + acl_default = dict_get (xattr, POSIX_ACL_DEFAULT_XATTR); + + if (!acl_default) { + gf_log (this->name, GF_LOG_DEBUG, + "ACL_DEFAULT xattr not present"); + goto cont; + } + ret = dict_set (dict, POSIX_ACL_DEFAULT_XATTR, acl_default); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "Could not set ACL_DEFAULT xattr"); +cont: + acl_access = dict_get (xattr, POSIX_ACL_ACCESS_XATTR); + if (!acl_access) { + gf_log (this->name, GF_LOG_DEBUG, + "ACL_ACCESS xattr not present"); + goto out; + } + ret = dict_set (dict, POSIX_ACL_ACCESS_XATTR, acl_access); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "Could not set ACL_ACCESS xattr"); + +out: + return; +} int dht_selfheal_dir_mkdir (call_frame_t *frame, loc_t *loc, @@ -406,16 +489,19 @@ dht_selfheal_dir_mkdir (call_frame_t *frame, loc_t *loc, ret = dict_set_static_bin (dict, "gfid-req", local->gfid, 
16); if (ret) - gf_log (this->name, GF_LOG_INFO, + gf_log (this->name, GF_LOG_WARNING, "%s: failed to set gfid in dict", loc->path); } else if (local->params) { /* Send the dictionary from higher layers directly */ dict = dict_ref (local->params); } + /* Set acls */ + if (local->xattr && dict) + dht_selfheal_dir_mkdir_setacl (local->xattr, dict); if (!dict) gf_log (this->name, GF_LOG_WARNING, - "dict is NULL, need to make sure gfid's are same"); + "dict is NULL, need to make sure gfids are same"); for (i = 0; i < layout->cnt; i++) { if (layout->list[i].err == ENOENT || force) { @@ -429,7 +515,7 @@ dht_selfheal_dir_mkdir (call_frame_t *frame, loc_t *loc, loc, st_mode_from_ia (local->stbuf.ia_prot, local->stbuf.ia_type), - dict); + 0, dict); } } @@ -448,7 +534,7 @@ dht_selfheal_layout_alloc_start (xlator_t *this, loc_t *loc, uint32_t hashval = 0; int ret = 0; - ret = dht_hash_compute (layout->type, loc->path, &hashval); + ret = dht_hash_compute (this, layout->type, loc->path, &hashval); if (ret == 0) { start = (hashval % layout->cnt); } @@ -471,7 +557,7 @@ dht_get_layout_count (xlator_t *this, dht_layout_t *layout, int new_layout) for (j = 0; j < conf->subvolume_cnt; j++) { if (conf->decommissioned_bricks[j] && conf->decommissioned_bricks[j] == layout->list[i].xlator) { - layout->list[i].err = -EINVAL; + layout->list[i].err = EINVAL; break; } } @@ -479,9 +565,33 @@ dht_get_layout_count (xlator_t *this, dht_layout_t *layout, int new_layout) for (i = 0; i < layout->cnt; i++) { err = layout->list[i].err; - if (err == -1 || err == 0) { - layout->list[i].err = -1; + if (err == -1 || err == 0 || err == ENOENT) { + /* Setting list[i].err = -1 is an indication for + dht_selfheal_layout_new_directory() to assign + a range. We set it to -1 based on any one of + the three criteria: + + - err == -1 already, which means directory + existed but layout was not set on it. + + - err == 0, which means directory exists and + has an old layout piece which will be + overwritten now. 
+ + - err == ENOENT, which means directory does + not exist (possibly racing with mkdir or + finishing half done mkdir). The missing + directory will be attempted to be recreated. + + It is important to note that it is safe + to race with mkdir() as self-heal and + mkdir are idempotent operations. Both will + strive to set the directory and layouts to + the same final state. + */ count++; + if (!err) + layout->list[i].err = -1; } } @@ -496,49 +606,126 @@ dht_get_layout_count (xlator_t *this, dht_layout_t *layout, int new_layout) } } - count = ((layout->spread_cnt) ? layout->spread_cnt : - ((count) ? count : 1)); + /* if layout->spread_cnt is set, check if it is <= available + * subvolumes (down brick and decommissioned bricks are considered + * un-availbale). Else return count (available up bricks) */ + count = ((layout->spread_cnt && + (layout->spread_cnt <= count)) ? + layout->spread_cnt : ((count) ? count : 1)); return count; } +void dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc, + dht_layout_t *new_layout); + +void dht_layout_entry_swap (dht_layout_t *layout, int i, int j); +void dht_layout_range_swap (dht_layout_t *layout, int i, int j); + +/* + * It's a bit icky using local variables in a macro, but it makes the rest + * of the code a lot clearer. + */ +#define OV_ENTRY(x,y) table[x*new->cnt+y] + +void +dht_selfheal_layout_maximize_overlap (call_frame_t *frame, loc_t *loc, + dht_layout_t *new, dht_layout_t *old) +{ + int i = 0; + int j = 0; + uint32_t curr_overlap = 0; + uint32_t max_overlap = 0; + int max_overlap_idx = -1; + uint32_t overlap = 0; + uint32_t *table = NULL; + + dht_layout_sort_volname (old); + /* Now both old_layout->list[] and new_layout->list[] + are match the same xlators/subvolumes. i.e, + old_layout->[i] and new_layout->[i] are referring + to the same subvolumes + */ + + /* Build a table of overlaps between new[i] and old[j]. 
*/ + table = alloca(sizeof(overlap)*old->cnt*new->cnt); + if (!table) { + return; + } + memset(table,0,sizeof(overlap)*old->cnt*new->cnt); + for (i = 0; i < new->cnt; ++i) { + for (j = 0; j < old->cnt; ++j) { + OV_ENTRY(i,j) = dht_overlap_calc(old,j,new,i); + } + } + + for (i = 0; i < new->cnt; i++) { + if (new->list[i].err > 0) { + /* Subvol might be marked for decommission + with EINVAL, or some other serious error + marked with positive errno. + */ + continue; + } + + max_overlap = 0; + max_overlap_idx = i; + for (j = (i + 1); j < new->cnt; ++j) { + if (new->list[j].err > 0) { + /* Subvol might be marked for decommission + with EINVAL, or some other serious error + marked with positive errno. + */ + continue; + } + /* Calculate the overlap now. */ + curr_overlap = OV_ENTRY(i,i) + OV_ENTRY(j,j); + /* Calculate the overlap after the proposed swap. */ + overlap = OV_ENTRY(i,j) + OV_ENTRY(j,i); + /* Are we better than status quo? */ + if (overlap > curr_overlap) { + overlap -= curr_overlap; + /* Are we better than the previous choice? */ + if (overlap > max_overlap) { + max_overlap = overlap; + max_overlap_idx = j; + } + } + } + + if (max_overlap_idx != i) { + dht_layout_range_swap (new, i, max_overlap_idx); + /* Need to swap the table values too. 
*/ + for (j = 0; j < old->cnt; ++j) { + overlap = OV_ENTRY(i,j); + OV_ENTRY(i,j) = OV_ENTRY(max_overlap_idx,j); + OV_ENTRY(max_overlap_idx,j) = overlap; + } + } + } +} + + dht_layout_t * dht_fix_layout_of_directory (call_frame_t *frame, loc_t *loc, dht_layout_t *layout) { - uint32_t chunk = 0; - uint32_t start = 0; - uint32_t stop = 0; - uint32_t overlap = 0; - uint32_t max_overlap = 0; - uint32_t chunk_begin = 0; - int count = 0; - int cnt = 0; int i = 0; - int j = 0; - int k = 0; - int loop_cnt = 0; - int start_subvol = 0; - int *fix_array = NULL; xlator_t *this = NULL; dht_layout_t *new_layout = NULL; dht_conf_t *priv = NULL; dht_local_t *local = NULL; + uint32_t subvol_down = 0; + int ret = 0; this = frame->this; priv = this->private; local = frame->local; - count = cnt = dht_get_layout_count (this, layout, 0); - - chunk = ((unsigned long) 0xffffffff) / ((cnt) ? cnt : 1); - - start_subvol = dht_selfheal_layout_alloc_start (this, loc, layout); - - fix_array = GF_CALLOC (sizeof (int), layout->cnt, gf_common_mt_char); - if (!fix_array) { - /* No fix, use the existing layout itself */ + if (layout->type == DHT_HASH_TYPE_DM_USER) { + gf_log (THIS->name, GF_LOG_DEBUG, "leaving %s alone", + loc->path); goto done; } @@ -546,98 +733,33 @@ dht_fix_layout_of_directory (call_frame_t *frame, loc_t *loc, if (!new_layout) goto done; - for (i = 0; i < new_layout->cnt; i++) { - /* TODO: fix this in layout_alloc() itself */ - new_layout->list[i].err = -ENOENT; - if (i < layout->cnt) - new_layout->list[i].xlator = layout->list[i].xlator; - } - - /* Check if there are any overlap in layout, and give the proper fix */ - for (i = 0; i < layout->cnt; i++) { - /* No need to fix if 'err' is not '-1' */ - if (layout->list[i].err != -1) - continue; - - /* If already existing layout is having no range, skip it */ - start = layout->list[i].start; - stop = layout->list[i].stop; - if ((stop - start) == 0) - continue; - - max_overlap = 0; + /* If a subvolume is down, do not re-write the 
layout. */ + ret = dht_layout_anomalies (this, loc, layout, NULL, NULL, NULL, + &subvol_down, NULL, NULL); - /* 'j' is used as starting point of each chunk */ - for (j = 1; j <= count; j++) { - /* if chunk is already used, don't use it again */ - for (k = 0; k < i; k++) - if (j == fix_array[k]) - break; - if (k < i) - continue; - - overlap = dht_find_overlap (i, (j-1), start, stop, chunk); - if (max_overlap < overlap) { - max_overlap = overlap; - fix_array[i] = j; - } - } - - /* If we have any overlap, then use that itself as new - layout for the subvolume */ - if (fix_array[i]) { - chunk_begin = chunk * (fix_array[i] - 1); - new_layout->list[i].err = -1; - DHT_SET_LAYOUT_RANGE (new_layout, i, chunk_begin, - chunk, cnt, loc->path); - /* make sure to give (max - 1) as 'stop' range, - if it is last chunk */ - if (fix_array[i] == count) - new_layout->list[i].stop = 0xffffffff; - if (--cnt == 0) - goto done; - - } + if (subvol_down || (ret == -1)) { + gf_log (this->name, GF_LOG_WARNING, "%u subvolume(s) are down" + ". Skipping fix layout.", subvol_down); + GF_FREE (new_layout); + return NULL; } - /* Now, look for layouts which are not having any overlaps - and give it a fix */ - for (loop_cnt = 0, i = start_subvol; loop_cnt < new_layout->cnt; - i++, loop_cnt++) { - if (i == new_layout->cnt) - i = 0; - - /* If 'fix_array[i]' is set, the layout is already fixed. 
*/ - if (fix_array[i]) - continue; + for (i = 0; i < new_layout->cnt; i++) { + if (layout->list[i].err != ENOSPC) + new_layout->list[i].err = layout->list[i].err; + else + new_layout->list[i].err = -1; - if (layout->list[i].err != -1) { - new_layout->list[i].err = layout->list[i].err; - continue; - } + new_layout->list[i].xlator = layout->list[i].xlator; + } - for (k = 1; k <= count; k++) { - for (j = 0; j < new_layout->cnt; j++) { - if (k == fix_array[j]) - break; - } - /* Didn't find any of the list begining with 'k' */ - if (j == new_layout->cnt) - break; - } + /* First give it a layout as though it is a new directory. This + ensures rotation to kick in */ + dht_layout_sort_volname (new_layout); + dht_selfheal_layout_new_directory (frame, loc, new_layout); - fix_array[i] = k; - chunk_begin = (k - 1) * chunk; - new_layout->list[i].err = -1; - DHT_SET_LAYOUT_RANGE (new_layout, i, chunk_begin, chunk, cnt, - loc->path); - /* make sure to give (max - 1) as 'stop' range, - if it is last chunk */ - if (k == count) - new_layout->list[i].stop = 0xffffffff; - if (--cnt == 0) - goto done; - } + /* Now selectively re-assign ranges only when it helps */ + dht_selfheal_layout_maximize_overlap (frame, loc, new_layout, layout); done: if (new_layout) { @@ -651,7 +773,7 @@ done: local->layout = new_layout; } - return new_layout; + return local->layout; } @@ -675,9 +797,11 @@ dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc, start_subvol = dht_selfheal_layout_alloc_start (this, loc, layout); + /* clear out the range, as we are re-computing here */ + DHT_RESET_LAYOUT_RANGE (layout); for (i = start_subvol; i < layout->cnt; i++) { err = layout->list[i].err; - if (err == -1) { + if (err == -1 || err == ENOENT) { DHT_SET_LAYOUT_RANGE(layout, i, start, chunk, cnt, loc->path); if (--cnt == 0) { @@ -690,7 +814,7 @@ dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc, for (i = 0; i < start_subvol; i++) { err = layout->list[i].err; - if (err == -1) { + if 
(err == -1 || err == ENOENT) { DHT_SET_LAYOUT_RANGE(layout, i, start, chunk, cnt, loc->path); if (--cnt == 0) { @@ -709,35 +833,17 @@ int dht_selfheal_dir_getafix (call_frame_t *frame, loc_t *loc, dht_layout_t *layout) { - dht_conf_t *conf = NULL; - xlator_t *this = NULL; dht_local_t *local = NULL; - int missing = -1; - int down = -1; - int holes = -1; + uint32_t holes = 0; int ret = -1; int i = -1; - int overlaps = -1; + uint32_t overlaps = 0; - this = frame->this; - conf = this->private; local = frame->local; - missing = local->selfheal.missing; - down = local->selfheal.down; holes = local->selfheal.hole_cnt; overlaps = local->selfheal.overlaps_cnt; - if ((missing + down) == conf->subvolume_cnt) { - dht_selfheal_layout_new_directory (frame, loc, layout); - ret = 0; - } - - if (holes <= down) { - /* the down subvol might fill up the holes */ - ret = 0; - } - if (holes || overlaps) { dht_selfheal_layout_new_directory (frame, loc, layout); ret = 0; @@ -789,6 +895,9 @@ dht_fix_directory_layout (call_frame_t *frame, /* No layout sorting required here */ tmp_layout = dht_fix_layout_of_directory (frame, &local->loc, layout); + if (!tmp_layout) { + return -1; + } dht_fix_dir_xattr (frame, &local->loc, tmp_layout); return 0; @@ -811,9 +920,8 @@ dht_selfheal_directory (call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk, dht_layout_anomalies (this, loc, layout, &local->selfheal.hole_cnt, &local->selfheal.overlaps_cnt, - &local->selfheal.missing, - &local->selfheal.down, - &local->selfheal.misc); + NULL, &local->selfheal.down, + &local->selfheal.misc, NULL); down = local->selfheal.down; misc = local->selfheal.misc; @@ -822,14 +930,14 @@ dht_selfheal_directory (call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk, local->selfheal.layout = dht_layout_ref (this, layout); if (down) { - gf_log (this->name, GF_LOG_INFO, + gf_log (this->name, GF_LOG_WARNING, "%d subvolumes down -- not fixing", down); ret = 0; goto sorry_no_fix; } if (misc) { - gf_log (this->name, GF_LOG_INFO, + 
gf_log (this->name, GF_LOG_WARNING, "%d subvolumes have unrecoverable errors", misc); ret = 0; goto sorry_no_fix; @@ -839,7 +947,7 @@ dht_selfheal_directory (call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk, ret = dht_selfheal_dir_getafix (frame, loc, layout); if (ret == -1) { - gf_log (this->name, GF_LOG_INFO, + gf_log (this->name, GF_LOG_WARNING, "not able to form layout for the directory"); goto sorry_no_fix; } @@ -872,3 +980,50 @@ dht_selfheal_restore (call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk, return ret; } + +int +dht_dir_attr_heal (void *data) +{ + call_frame_t *frame = NULL; + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + xlator_t *this = NULL; + dht_conf_t *conf = NULL; + int call_cnt = 0; + int ret = -1; + int i = 0; + + GF_VALIDATE_OR_GOTO ("dht", data, out); + + frame = data; + local = frame->local; + this = frame->this; + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO ("dht", local, out); + conf = this->private; + GF_VALIDATE_OR_GOTO ("dht", conf, out); + + call_cnt = conf->subvolume_cnt; + + for (i = 0; i < call_cnt; i++) { + subvol = conf->subvolumes[i]; + if (!subvol || (subvol == dht_first_up_subvol (this))) + continue; + ret = syncop_setattr (subvol, &local->loc, &local->stbuf, + (GF_SET_ATTR_UID | GF_SET_ATTR_GID), + NULL, NULL); + if (ret) + gf_log ("dht", GF_LOG_ERROR, "Failed to set uid/gid on" + " %s on %s subvol (%s)", local->loc.path, + subvol->name, strerror (errno)); + } +out: + return 0; +} + +int +dht_dir_attr_heal_done (int ret, call_frame_t *sync_frame, void *data) +{ + DHT_STACK_DESTROY (sync_frame); + return 0; +} diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c new file mode 100644 index 000000000..70aac7710 --- /dev/null +++ b/xlators/cluster/dht/src/dht-shared.c @@ -0,0 +1,758 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. 
+ + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +/* TODO: add NS locking */ + +#include "statedump.h" +#include "dht-common.h" + +/* TODO: + - use volumename in xattr instead of "dht" + - use NS locks + - handle all cases in self heal layout reconstruction + - complete linkfile selfheal +*/ +struct volume_options options[]; + +void +dht_layout_dump (dht_layout_t *layout, const char *prefix) +{ + + char key[GF_DUMP_MAX_BUF_LEN]; + int i = 0; + + if (!layout) + goto out; + if (!prefix) + goto out; + + gf_proc_dump_build_key(key, prefix, "cnt"); + gf_proc_dump_write(key, "%d", layout->cnt); + gf_proc_dump_build_key(key, prefix, "preset"); + gf_proc_dump_write(key, "%d", layout->preset); + gf_proc_dump_build_key(key, prefix, "gen"); + gf_proc_dump_write(key, "%d", layout->gen); + if (layout->type != IA_INVAL) { + gf_proc_dump_build_key(key, prefix, "inode type"); + gf_proc_dump_write(key, "%d", layout->type); + } + + if (!IA_ISDIR (layout->type)) + goto out; + + for (i = 0; i < layout->cnt; i++) { + gf_proc_dump_build_key(key, prefix,"list[%d].err", i); + gf_proc_dump_write(key, "%d", layout->list[i].err); + gf_proc_dump_build_key(key, prefix,"list[%d].start", i); + gf_proc_dump_write(key, "%u", layout->list[i].start); + gf_proc_dump_build_key(key, prefix,"list[%d].stop", i); + gf_proc_dump_write(key, "%u", layout->list[i].stop); + if (layout->list[i].xlator) { + gf_proc_dump_build_key(key, prefix, + "list[%d].xlator.type", i); + gf_proc_dump_write(key, "%s", + layout->list[i].xlator->type); + gf_proc_dump_build_key(key, prefix, + "list[%d].xlator.name", i); + gf_proc_dump_write(key, "%s", + layout->list[i].xlator->name); + } + } + +out: + return; +} + + +int32_t +dht_priv_dump 
(xlator_t *this) +{ + char key_prefix[GF_DUMP_MAX_BUF_LEN]; + char key[GF_DUMP_MAX_BUF_LEN]; + int i = 0; + dht_conf_t *conf = NULL; + int ret = -1; + + if (!this) + goto out; + + conf = this->private; + if (!conf) + goto out; + + ret = TRY_LOCK(&conf->subvolume_lock); + if (ret != 0) { + return ret; + } + + gf_proc_dump_add_section("xlator.cluster.dht.%s.priv", this->name); + gf_proc_dump_build_key(key_prefix,"xlator.cluster.dht","%s.priv", + this->name); + gf_proc_dump_write("subvol_cnt","%d", conf->subvolume_cnt); + for (i = 0; i < conf->subvolume_cnt; i++) { + sprintf (key, "subvolumes[%d]", i); + gf_proc_dump_write(key, "%s.%s", conf->subvolumes[i]->type, + conf->subvolumes[i]->name); + if (conf->file_layouts && conf->file_layouts[i]){ + sprintf (key, "file_layouts[%d]", i); + dht_layout_dump(conf->file_layouts[i], key); + } + if (conf->dir_layouts && conf->dir_layouts[i]) { + sprintf (key, "dir_layouts[%d]", i); + dht_layout_dump(conf->dir_layouts[i], key); + } + if (conf->subvolume_status) { + + sprintf (key, "subvolume_status[%d]", i); + gf_proc_dump_write(key, "%d", + (int)conf->subvolume_status[i]); + } + + } + + gf_proc_dump_write("search_unhashed", "%d", conf->search_unhashed); + gf_proc_dump_write("gen", "%d", conf->gen); + gf_proc_dump_write("min_free_disk", "%lf", conf->min_free_disk); + gf_proc_dump_write("min_free_inodes", "%lf", conf->min_free_inodes); + gf_proc_dump_write("disk_unit", "%c", conf->disk_unit); + gf_proc_dump_write("refresh_interval", "%d", conf->refresh_interval); + gf_proc_dump_write("unhashed_sticky_bit", "%d", conf->unhashed_sticky_bit); + if (conf ->du_stats) { + gf_proc_dump_write("du_stats.avail_percent", "%lf", + conf->du_stats->avail_percent); + gf_proc_dump_write("du_stats.avail_space", "%lu", + conf->du_stats->avail_space); + gf_proc_dump_write("du_stats.avail_inodes", "%lf", + conf->du_stats->avail_inodes); + gf_proc_dump_write("du_stats.log", "%lu", conf->du_stats->log); + } + + if (conf->last_stat_fetch.tv_sec) + 
gf_proc_dump_write("last_stat_fetch", "%s", + ctime(&conf->last_stat_fetch.tv_sec)); + + UNLOCK(&conf->subvolume_lock); + +out: + return ret; +} + +int32_t +dht_inodectx_dump (xlator_t *this, inode_t *inode) +{ + int ret = -1; + dht_layout_t *layout = NULL; + + if (!this) + goto out; + if (!inode) + goto out; + + ret = dht_inode_ctx_layout_get (inode, this, &layout); + + if ((ret != 0) || !layout) + return ret; + + gf_proc_dump_add_section("xlator.cluster.dht.%s.inode", this->name); + dht_layout_dump(layout, "layout"); + +out: + return ret; +} + +void +dht_fini (xlator_t *this) +{ + int i = 0; + dht_conf_t *conf = NULL; + + GF_VALIDATE_OR_GOTO ("dht", this, out); + + conf = this->private; + this->private = NULL; + if (conf) { + if (conf->file_layouts) { + for (i = 0; i < conf->subvolume_cnt; i++) { + GF_FREE (conf->file_layouts[i]); + } + GF_FREE (conf->file_layouts); + } + + GF_FREE (conf->subvolumes); + + GF_FREE (conf->subvolume_status); + + GF_FREE (conf); + } +out: + return; +} + +int32_t +mem_acct_init (xlator_t *this) +{ + int ret = -1; + + GF_VALIDATE_OR_GOTO ("dht", this, out); + + ret = xlator_mem_acct_init (this, gf_dht_mt_end + 1); + + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" + "failed"); + return ret; + } +out: + return ret; +} + + +int +dht_parse_decommissioned_bricks (xlator_t *this, dht_conf_t *conf, + const char *bricks) +{ + int i = 0; + int ret = -1; + char *tmpstr = NULL; + char *dup_brick = NULL; + char *node = NULL; + + if (!conf || !bricks) + goto out; + + dup_brick = gf_strdup (bricks); + node = strtok_r (dup_brick, ",", &tmpstr); + while (node) { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (!strcmp (conf->subvolumes[i]->name, node)) { + conf->decommissioned_bricks[i] = + conf->subvolumes[i]; + conf->decommission_subvols_cnt++; + gf_log (this->name, GF_LOG_INFO, + "decommissioning subvolume %s", + conf->subvolumes[i]->name); + break; + } + } + if (i == conf->subvolume_cnt) { + /* Wrong node given. 
*/ + goto out; + } + node = strtok_r (NULL, ",", &tmpstr); + } + + ret = 0; + conf->decommission_in_progress = 1; +out: + GF_FREE (dup_brick); + + return ret; +} + + +int +dht_decommissioned_remove (xlator_t *this, dht_conf_t *conf) +{ + int i = 0; + int ret = -1; + + if (!conf) + goto out; + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->decommissioned_bricks[i]) { + conf->decommissioned_bricks[i] = NULL; + conf->decommission_subvols_cnt--; + } + } + + ret = 0; +out: + + return ret; +} +void +dht_init_regex (xlator_t *this, dict_t *odict, char *name, + regex_t *re, gf_boolean_t *re_valid) +{ + char *temp_str; + + if (dict_get_str (odict, name, &temp_str) != 0) { + if (strcmp(name,"rsync-hash-regex")) { + return; + } + temp_str = "^\\.(.+)\\.[^.]+$"; + } + + if (*re_valid) { + regfree(re); + *re_valid = _gf_false; + } + + if (!strcmp(temp_str,"none")) { + return; + } + + if (regcomp(re,temp_str,REG_EXTENDED) == 0) { + gf_log (this->name, GF_LOG_INFO, + "using regex %s = %s", name, temp_str); + *re_valid = _gf_true; + } + else { + gf_log (this->name, GF_LOG_WARNING, + "compiling regex %s failed", temp_str); + } +} + +int +dht_reconfigure (xlator_t *this, dict_t *options) +{ + dht_conf_t *conf = NULL; + char *temp_str = NULL; + gf_boolean_t search_unhashed; + int ret = -1; + + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO ("dht", options, out); + + conf = this->private; + if (!conf) + return 0; + + if (dict_get_str (options, "lookup-unhashed", &temp_str) == 0) { + /* If option is not "auto", other options _should_ be boolean*/ + if (strcasecmp (temp_str, "auto")) { + if (!gf_string2boolean (temp_str, &search_unhashed)) { + gf_log(this->name, GF_LOG_DEBUG, "Reconfigure:" + " lookup-unhashed reconfigured (%s)", + temp_str); + conf->search_unhashed = search_unhashed; + } else { + gf_log(this->name, GF_LOG_ERROR, "Reconfigure:" + " lookup-unhashed should be boolean," + " not (%s), defaulting to (%d)", + temp_str, conf->search_unhashed); + 
//return -1; + ret = -1; + goto out; + } + } else { + gf_log(this->name, GF_LOG_DEBUG, "Reconfigure:" + " lookup-unhashed reconfigured auto "); + conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO; + } + } + + GF_OPTION_RECONF ("min-free-disk", conf->min_free_disk, options, + percent_or_size, out); + /* option can be any one of percent or bytes */ + conf->disk_unit = 0; + if (conf->min_free_disk < 100.0) + conf->disk_unit = 'p'; + + GF_OPTION_RECONF ("min-free-inodes", conf->min_free_inodes, options, + percent, out); + + GF_OPTION_RECONF ("directory-layout-spread", conf->dir_spread_cnt, + options, uint32, out); + + GF_OPTION_RECONF ("readdir-optimize", conf->readdir_optimize, options, + bool, out); + if (conf->defrag) { + GF_OPTION_RECONF ("rebalance-stats", conf->defrag->stats, + options, bool, out); + } + + if (dict_get_str (options, "decommissioned-bricks", &temp_str) == 0) { + ret = dht_parse_decommissioned_bricks (this, conf, temp_str); + if (ret == -1) + goto out; + } else { + ret = dht_decommissioned_remove (this, conf); + if (ret == -1) + goto out; + } + + dht_init_regex (this, options, "rsync-hash-regex", + &conf->rsync_regex, &conf->rsync_regex_valid); + dht_init_regex (this, options, "extra-hash-regex", + &conf->extra_regex, &conf->extra_regex_valid); + + ret = 0; +out: + return ret; +} + +static int +gf_defrag_pattern_list_fill (xlator_t *this, gf_defrag_info_t *defrag, char *data) +{ + int ret = -1; + char *tmp_str = NULL; + char *tmp_str1 = NULL; + char *dup_str = NULL; + char *num = NULL; + char *pattern_str = NULL; + char *pattern = NULL; + gf_defrag_pattern_list_t *temp_list = NULL; + gf_defrag_pattern_list_t *pattern_list = NULL; + + if (!this || !defrag || !data) + goto out; + + /* Get the pattern for pattern list. 
"pattern:<optional-size>" + * eg: *avi, *pdf:10MB, *:1TB + */ + pattern_str = strtok_r (data, ",", &tmp_str); + while (pattern_str) { + dup_str = gf_strdup (pattern_str); + pattern_list = GF_CALLOC (1, sizeof (gf_defrag_pattern_list_t), + 1); + if (!pattern_list) { + goto out; + } + pattern = strtok_r (dup_str, ":", &tmp_str1); + num = strtok_r (NULL, ":", &tmp_str1); + if (!pattern) + goto out; + if (!num) { + if (gf_string2bytesize(pattern, &pattern_list->size) + == 0) { + pattern = "*"; + } + } else if (gf_string2bytesize (num, &pattern_list->size) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "invalid number format \"%s\"", num); + goto out; + } + memcpy (pattern_list->path_pattern, pattern, strlen (dup_str)); + + if (!defrag->defrag_pattern) + temp_list = NULL; + else + temp_list = defrag->defrag_pattern; + + pattern_list->next = temp_list; + + defrag->defrag_pattern = pattern_list; + pattern_list = NULL; + + GF_FREE (dup_str); + dup_str = NULL; + + pattern_str = strtok_r (NULL, ",", &tmp_str); + } + + ret = 0; +out: + if (ret) + GF_FREE (pattern_list); + GF_FREE (dup_str); + + return ret; +} + +int +dht_init (xlator_t *this) +{ + dht_conf_t *conf = NULL; + char *temp_str = NULL; + int ret = -1; + int i = 0; + gf_defrag_info_t *defrag = NULL; + int cmd = 0; + char *node_uuid = NULL; + + + GF_VALIDATE_OR_GOTO ("dht", this, err); + + if (!this->children) { + gf_log (this->name, GF_LOG_CRITICAL, + "Distribute needs more than one subvolume"); + return -1; + } + + if (!this->parents) { + gf_log (this->name, GF_LOG_WARNING, + "dangling volume. 
check volfile"); + } + + conf = GF_CALLOC (1, sizeof (*conf), gf_dht_mt_dht_conf_t); + if (!conf) { + goto err; + } + + ret = dict_get_int32 (this->options, "rebalance-cmd", &cmd); + + if (cmd) { + defrag = GF_CALLOC (1, sizeof (gf_defrag_info_t), + gf_defrag_info_mt); + + GF_VALIDATE_OR_GOTO (this->name, defrag, err); + + LOCK_INIT (&defrag->lock); + + defrag->is_exiting = 0; + + conf->defrag = defrag; + + ret = dict_get_str (this->options, "node-uuid", &node_uuid); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "node-uuid not " + "specified"); + goto err; + } + + if (uuid_parse (node_uuid, defrag->node_uuid)) { + gf_log (this->name, GF_LOG_ERROR, "Cannot parse " + "glusterd node uuid"); + goto err; + } + + defrag->cmd = cmd; + + defrag->stats = _gf_false; + } + + conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_ON; + if (dict_get_str (this->options, "lookup-unhashed", &temp_str) == 0) { + /* If option is not "auto", other options _should_ be boolean */ + if (strcasecmp (temp_str, "auto")) + gf_string2boolean (temp_str, &conf->search_unhashed); + else + conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO; + } + + GF_OPTION_INIT ("unhashed-sticky-bit", conf->unhashed_sticky_bit, bool, + err); + + GF_OPTION_INIT ("use-readdirp", conf->use_readdirp, bool, err); + + GF_OPTION_INIT ("min-free-disk", conf->min_free_disk, percent_or_size, + err); + + GF_OPTION_INIT ("min-free-inodes", conf->min_free_inodes, percent, + err); + + conf->dir_spread_cnt = conf->subvolume_cnt; + GF_OPTION_INIT ("directory-layout-spread", conf->dir_spread_cnt, + uint32, err); + + GF_OPTION_INIT ("assert-no-child-down", conf->assert_no_child_down, + bool, err); + + GF_OPTION_INIT ("readdir-optimize", conf->readdir_optimize, bool, err); + + if (defrag) { + GF_OPTION_INIT ("rebalance-stats", defrag->stats, bool, err); + if (dict_get_str (this->options, "rebalance-filter", &temp_str) + == 0) { + if (gf_defrag_pattern_list_fill (this, defrag, temp_str) + == -1) { + gf_log (this->name, GF_LOG_ERROR, 
"Cannot parse" + " rebalance-filter (%s)", temp_str); + goto err; + } + } + } + + /* option can be any one of percent or bytes */ + conf->disk_unit = 0; + if (conf->min_free_disk < 100) + conf->disk_unit = 'p'; + + ret = dht_init_subvolumes (this, conf); + if (ret == -1) { + goto err; + } + + if (dict_get_str (this->options, "decommissioned-bricks", &temp_str) == 0) { + ret = dht_parse_decommissioned_bricks (this, conf, temp_str); + if (ret == -1) + goto err; + } + + dht_init_regex (this, this->options, "rsync-hash-regex", + &conf->rsync_regex, &conf->rsync_regex_valid); + dht_init_regex (this, this->options, "extra-hash-regex", + &conf->extra_regex, &conf->extra_regex_valid); + + ret = dht_layouts_init (this, conf); + if (ret == -1) { + goto err; + } + + LOCK_INIT (&conf->subvolume_lock); + LOCK_INIT (&conf->layout_lock); + + conf->gen = 1; + + this->local_pool = mem_pool_new (dht_local_t, 512); + if (!this->local_pool) { + gf_log (this->name, GF_LOG_ERROR, + "failed to create local_t's memory pool"); + goto err; + } + + GF_OPTION_INIT ("xattr-name", conf->xattr_name, str, err); + gf_asprintf (&conf->link_xattr_name, "%s.linkto", conf->xattr_name); + gf_asprintf (&conf->wild_xattr_name, "%s*", conf->xattr_name); + if (!conf->link_xattr_name || !conf->wild_xattr_name) { + goto err; + } + + this->private = conf; + + return 0; + +err: + if (conf) { + if (conf->file_layouts) { + for (i = 0; i < conf->subvolume_cnt; i++) { + GF_FREE (conf->file_layouts[i]); + } + GF_FREE (conf->file_layouts); + } + + GF_FREE (conf->subvolumes); + + GF_FREE (conf->subvolume_status); + + GF_FREE (conf->du_stats); + + GF_FREE (conf->defrag); + + GF_FREE (conf->xattr_name); + GF_FREE (conf->link_xattr_name); + GF_FREE (conf->wild_xattr_name); + + GF_FREE (conf); + } + + return -1; +} + + +struct volume_options options[] = { + { .key = {"lookup-unhashed"}, + .value = {"auto", "yes", "no", "enable", "disable", "1", "0", + "on", "off"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "on", + 
.description = "This option if set to ON, does a lookup through " + "all the sub-volumes, in case a lookup didn't return any result " + "from the hash subvolume. If set to OFF, it does not do a lookup " + "on the remaining subvolumes." + }, + { .key = {"min-free-disk"}, + .type = GF_OPTION_TYPE_PERCENT_OR_SIZET, + .default_value = "10%", + .description = "Percentage/Size of disk space, after which the " + "process starts balancing out the cluster, and logs will appear " + "in log files", + }, + { .key = {"min-free-inodes"}, + .type = GF_OPTION_TYPE_PERCENT, + .default_value = "5%", + .description = "after system has only N% of inodes, warnings " + "starts to appear in log files", + }, + { .key = {"unhashed-sticky-bit"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + }, + { .key = {"use-readdirp"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = "This option if set to ON, forces the use of " + "readdirp, and hence also displays the stats of the files." + }, + { .key = {"assert-no-child-down"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "This option if set to ON, in the event of " + "CHILD_DOWN, will call exit." + }, + { .key = {"directory-layout-spread"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .validate = GF_OPT_VALIDATE_MIN, + .description = "Specifies the directory layout spread." + }, + { .key = {"decommissioned-bricks"}, + .type = GF_OPTION_TYPE_ANY, + .description = "This option if set to ON, decommissions " + "the brick, so that no new data is allowed to be created " + "on that brick." + }, + { .key = {"rebalance-cmd"}, + .type = GF_OPTION_TYPE_INT, + }, + { .key = {"node-uuid"}, + .type = GF_OPTION_TYPE_STR, + }, + { .key = {"rebalance-stats"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "This option if set to ON displays and logs the " + " time taken for migration of each file, during the rebalance " + "process. 
If set to OFF, the rebalance logs will only display the " + "time spent in each directory." + }, + { .key = {"readdir-optimize"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "This option if set to ON enables the optimization " + "that allows DHT to requests non-first subvolumes to filter out " + "directory entries." + }, + { .key = {"rsync-hash-regex"}, + .type = GF_OPTION_TYPE_STR, + /* Setting a default here doesn't work. See dht_init_regex. */ + .description = "Regular expression for stripping temporary-file " + "suffix and prefix used by rsync, to prevent relocation when the " + "file is renamed." + }, + { .key = {"extra-hash-regex"}, + .type = GF_OPTION_TYPE_STR, + /* Setting a default here doesn't work. See dht_init_regex. */ + .description = "Regular expression for stripping temporary-file " + "suffix and prefix used by an application, to prevent relocation when " + "the file is renamed." + }, + { .key = {"rebalance-filter"}, + .type = GF_OPTION_TYPE_STR, + }, + + { .key = {"xattr-name"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "trusted.glusterfs.dht", + .description = "Base for extended attributes used by this " + "translator instance, to avoid conflicts with others above or " + "below it." + }, + + /* NUFA option */ + { .key = {"local-volume-name"}, + .type = GF_OPTION_TYPE_XLATOR + }, + + /* switch option */ + { .key = {"pattern.switch.case"}, + .type = GF_OPTION_TYPE_ANY + }, + + { .key = {NULL} }, +}; diff --git a/xlators/cluster/dht/src/dht.c b/xlators/cluster/dht/src/dht.c index 7b32a9766..fc0ca2f77 100644 --- a/xlators/cluster/dht/src/dht.c +++ b/xlators/cluster/dht/src/dht.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. 
- GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ @@ -23,427 +14,15 @@ #include "config.h" #endif -/* TODO: add NS locking */ - #include "statedump.h" #include "dht-common.h" -/* TODO: - - use volumename in xattr instead of "dht" - - use NS locks - - handle all cases in self heal layout reconstruction - - complete linkfile selfheal -*/ -struct volume_options options[]; - -void -dht_layout_dump (dht_layout_t *layout, const char *prefix) -{ - - char key[GF_DUMP_MAX_BUF_LEN]; - int i = 0; - - GF_VALIDATE_OR_GOTO ("dht", layout, out); - GF_VALIDATE_OR_GOTO ("dht", prefix, out); - - gf_proc_dump_build_key(key, prefix, "cnt"); - gf_proc_dump_write(key, "%d", layout->cnt); - gf_proc_dump_build_key(key, prefix, "preset"); - gf_proc_dump_write(key, "%d", layout->preset); - gf_proc_dump_build_key(key, prefix, "gen"); - gf_proc_dump_write(key, "%d", layout->gen); - gf_proc_dump_build_key(key, prefix, "type"); - gf_proc_dump_write(key, "%d", layout->type); - - for (i = 0; i < layout->cnt; i++) { - gf_proc_dump_build_key(key, prefix,"list[%d].err", i); - gf_proc_dump_write(key, "%d", layout->list[i].err); - gf_proc_dump_build_key(key, 
prefix,"list[%d].start", i); - gf_proc_dump_write(key, "%u", layout->list[i].start); - gf_proc_dump_build_key(key, prefix,"list[%d].stop", i); - gf_proc_dump_write(key, "%u", layout->list[i].stop); - if (layout->list[i].xlator) { - gf_proc_dump_build_key(key, prefix, - "list[%d].xlator.type", i); - gf_proc_dump_write(key, "%s", - layout->list[i].xlator->type); - gf_proc_dump_build_key(key, prefix, - "list[%d].xlator.name", i); - gf_proc_dump_write(key, "%s", - layout->list[i].xlator->name); - } - } - -out: - return; -} - - -int32_t -dht_priv_dump (xlator_t *this) -{ - char key_prefix[GF_DUMP_MAX_BUF_LEN]; - char key[GF_DUMP_MAX_BUF_LEN]; - int i = 0; - dht_conf_t *conf = NULL; - int ret = -1; - - GF_VALIDATE_OR_GOTO ("dht", this, out); - - conf = this->private; - - if (!conf) - return -1; - - ret = TRY_LOCK(&conf->subvolume_lock); - - if (ret != 0) { - gf_log("", GF_LOG_WARNING, "Unable to lock dht subvolume %s", - this->name); - return ret; - } - - gf_proc_dump_add_section("xlator.cluster.dht.%s.priv", this->name); - gf_proc_dump_build_key(key_prefix,"xlator.cluster.dht","%s.priv", - this->name); - gf_proc_dump_write("subvol_cnt","%d", conf->subvolume_cnt); - for (i = 0; i < conf->subvolume_cnt; i++) { - sprintf (key, "subvolumes[%d]", i); - gf_proc_dump_write(key, "%s.%s", conf->subvolumes[i]->type, - conf->subvolumes[i]->name); - if (conf->file_layouts && conf->file_layouts[i]){ - sprintf (key, "file_layouts[%d]", i); - dht_layout_dump(conf->file_layouts[i], key); - } - if (conf->dir_layouts && conf->dir_layouts[i]) { - sprintf (key, "dir_layouts[%d]", i); - dht_layout_dump(conf->dir_layouts[i], key); - } - if (conf->subvolume_status) { - - sprintf (key, "subvolume_status[%d]", i); - gf_proc_dump_write(key, "%d", - (int)conf->subvolume_status[i]); - } - - } - - gf_proc_dump_write("search_unhashed", "%d", conf->search_unhashed); - gf_proc_dump_write("gen", "%d", conf->gen); - gf_proc_dump_write("min_free_disk", "%lu", conf->min_free_disk); - 
gf_proc_dump_write("disk_unit", "%c", conf->disk_unit); - gf_proc_dump_write("refresh_interval", "%d", conf->refresh_interval); - gf_proc_dump_write("unhashed_sticky_bit", "%d", conf->unhashed_sticky_bit); - if (conf ->du_stats) { - gf_proc_dump_write("du_stats.avail_percent", "%lf", - conf->du_stats->avail_percent); - gf_proc_dump_write("du_stats.avail_space", "%lu", - conf->du_stats->avail_space); - gf_proc_dump_write("du_stats.log", "%lu", conf->du_stats->log); - } - gf_proc_dump_write("last_stat_fetch", "%s", ctime(&conf->last_stat_fetch.tv_sec)); - - UNLOCK(&conf->subvolume_lock); - -out: - return ret; -} - -int32_t -dht_inodectx_dump (xlator_t *this, inode_t *inode) -{ - int ret = -1; - dht_layout_t *layout = NULL; - uint64_t tmp_layout = 0; - - GF_VALIDATE_OR_GOTO ("dht", this, out); - GF_VALIDATE_OR_GOTO ("dht", inode, out); - - ret = inode_ctx_get (inode, this, &tmp_layout); - - if (ret != 0) - return ret; - - layout = (dht_layout_t *)(long)tmp_layout; - - if (!layout) - return -1; - - gf_proc_dump_add_section("xlator.cluster.dht.%s.inode", this->name); - dht_layout_dump(layout, "layout"); - -out: - return ret; -} - -int -notify (xlator_t *this, int event, void *data, ...) 
-{ - int ret = -1; - - GF_VALIDATE_OR_GOTO ("dht", this, out); - - ret = dht_notify (this, event, data); - -out: - return ret; -} - -void -fini (xlator_t *this) -{ - int i = 0; - dht_conf_t *conf = NULL; - - GF_VALIDATE_OR_GOTO ("dht", this, out); - - conf = this->private; - this->private = NULL; - if (conf) { - if (conf->file_layouts) { - for (i = 0; i < conf->subvolume_cnt; i++) { - GF_FREE (conf->file_layouts[i]); - } - GF_FREE (conf->file_layouts); - } - - if (conf->subvolumes) - GF_FREE (conf->subvolumes); - - if (conf->subvolume_status) - GF_FREE (conf->subvolume_status); - - GF_FREE (conf); - } -out: - return; -} - -int32_t -mem_acct_init (xlator_t *this) -{ - int ret = -1; - - GF_VALIDATE_OR_GOTO ("dht", this, out); - - ret = xlator_mem_acct_init (this, gf_dht_mt_end + 1); - - if (ret != 0) { - gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" - "failed"); - return ret; - } -out: - return ret; -} - - -int -dht_parse_decommissioned_bricks (xlator_t *this, dht_conf_t *conf, - const char *bricks) -{ - int i = 0; - int ret = -1; - char *tmpstr = NULL; - char *dup_brick = NULL; - char *node = NULL; - - if (!conf || !bricks) - goto out; - - dup_brick = gf_strdup (bricks); - node = strtok_r (dup_brick, ",", &tmpstr); - while (node) { - for (i = 0; i < conf->subvolume_cnt; i++) { - if (!strcmp (conf->subvolumes[i]->name, node)) { - conf->decommissioned_bricks[i] = - conf->subvolumes[i]; - gf_log (this->name, GF_LOG_INFO, - "decommissioning subvolume %s", - conf->subvolumes[i]->name); - break; - } - } - if (i == conf->subvolume_cnt) { - /* Wrong node given. 
*/ - goto out; - } - node = strtok_r (NULL, ",", &tmpstr); - } - - ret = 0; -out: - if (dup_brick) - GF_FREE (dup_brick); - - return ret; -} - -int -reconfigure (xlator_t *this, dict_t *options) -{ - dht_conf_t *conf = NULL; - char *temp_str = NULL; - gf_boolean_t search_unhashed; - int ret = -1; - - GF_VALIDATE_OR_GOTO ("dht", this, out); - GF_VALIDATE_OR_GOTO ("dht", options, out); - - conf = this->private; - if (!conf) - return 0; - - if (dict_get_str (options, "lookup-unhashed", &temp_str) == 0) { - /* If option is not "auto", other options _should_ be boolean*/ - if (strcasecmp (temp_str, "auto")) { - if (!gf_string2boolean (temp_str, &search_unhashed)) { - gf_log(this->name, GF_LOG_DEBUG, "Reconfigure:" - " lookup-unahashed reconfigured (%s)", - temp_str); - conf->search_unhashed = search_unhashed; - } else { - gf_log(this->name, GF_LOG_ERROR, "Reconfigure:" - " lookup-unahashed should be boolean," - " not (%s), defaulting to (%d)", - temp_str, conf->search_unhashed); - //return -1; - ret = -1; - goto out; - } - } else { - gf_log(this->name, GF_LOG_DEBUG, "Reconfigure:" - " lookup-unahashed reconfigured auto "); - conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO; - } - } - - GF_OPTION_RECONF ("min-free-disk", conf->min_free_disk, options, - percent_or_size, out); - - GF_OPTION_RECONF ("directory-layout-spread", conf->dir_spread_cnt, - options, uint32, out); - - if (dict_get_str (options, "decommissioned-bricks", &temp_str) == 0) { - ret = dht_parse_decommissioned_bricks (this, conf, temp_str); - if (ret == -1) - goto out; - } - - ret = 0; -out: - return ret; -} - - -int -init (xlator_t *this) -{ - dht_conf_t *conf = NULL; - char *temp_str = NULL; - int ret = -1; - int i = 0; - - GF_VALIDATE_OR_GOTO ("dht", this, err); - - if (!this->children) { - gf_log (this->name, GF_LOG_CRITICAL, - "Distribute needs more than one subvolume"); - return -1; - } - - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. 
check volfile"); - } - - conf = GF_CALLOC (1, sizeof (*conf), gf_dht_mt_dht_conf_t); - if (!conf) { - goto err; - } - - conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_ON; - if (dict_get_str (this->options, "lookup-unhashed", &temp_str) == 0) { - /* If option is not "auto", other options _should_ be boolean */ - if (strcasecmp (temp_str, "auto")) - gf_string2boolean (temp_str, &conf->search_unhashed); - else - conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO; - } - - GF_OPTION_INIT ("unhashed-sticky-bit", conf->unhashed_sticky_bit, bool, - err); - - GF_OPTION_INIT ("use-readdirp", conf->use_readdirp, bool, err); - - GF_OPTION_INIT ("min-free-disk", conf->min_free_disk, percent_or_size, - err); - - conf->dir_spread_cnt = conf->subvolume_cnt; - GF_OPTION_INIT ("directory-layout-spread", conf->dir_spread_cnt, - uint32, err); - - GF_OPTION_INIT ("assert-no-child-down", conf->assert_no_child_down, - bool, err); - - ret = dht_init_subvolumes (this, conf); - if (ret == -1) { - goto err; - } - - if (dict_get_str (this->options, "decommissioned-bricks", &temp_str) == 0) { - ret = dht_parse_decommissioned_bricks (this, conf, temp_str); - if (ret == -1) - goto err; - } - - ret = dht_layouts_init (this, conf); - if (ret == -1) { - goto err; - } - - LOCK_INIT (&conf->subvolume_lock); - LOCK_INIT (&conf->layout_lock); - - conf->gen = 1; - - /* Create 'syncop' environment */ - conf->env = syncenv_new (0); - if (!conf->env) { - gf_log (this->name, GF_LOG_ERROR, - "failed to create sync environment %s", - strerror (errno)); - goto err; - } - - this->private = conf; - - return 0; - -err: - if (conf) { - if (conf->file_layouts) { - for (i = 0; i < conf->subvolume_cnt; i++) { - GF_FREE (conf->file_layouts[i]); - } - GF_FREE (conf->file_layouts); - } - - if (conf->subvolumes) - GF_FREE (conf->subvolumes); - - if (conf->subvolume_status) - GF_FREE (conf->subvolume_status); - - if (conf->du_stats) - GF_FREE (conf->du_stats); - - GF_FREE (conf); - } - - return -1; -} - +class_methods_t 
class_methods = { + .init = dht_init, + .fini = dht_fini, + .reconfigure = dht_reconfigure, + .notify = dht_notify +}; struct xlator_fops fops = { .lookup = dht_lookup, @@ -471,6 +50,7 @@ struct xlator_fops fops = { .access = dht_access, .readlink = dht_readlink, .getxattr = dht_getxattr, + .fgetxattr = dht_fgetxattr, .readv = dht_readv, .flush = dht_flush, .fsync = dht_fsync, @@ -479,6 +59,7 @@ struct xlator_fops fops = { .lk = dht_lk, /* Inode write operations */ + .fremovexattr = dht_fremovexattr, .removexattr = dht_removexattr, .setxattr = dht_setxattr, .fsetxattr = dht_fsetxattr, @@ -489,6 +70,9 @@ struct xlator_fops fops = { .fxattrop = dht_fxattrop, .setattr = dht_setattr, .fsetattr = dht_fsetattr, + .fallocate = dht_fallocate, + .discard = dht_discard, + .zerofill = dht_zerofill, }; struct xlator_dumpops dumpops = { @@ -502,38 +86,4 @@ struct xlator_cbks cbks = { // .releasedir = dht_releasedir, .forget = dht_forget }; - - -struct volume_options options[] = { - { .key = {"lookup-unhashed"}, - .value = {"auto", "yes", "no", "enable", "disable", "1", "0", - "on", "off"}, - .type = GF_OPTION_TYPE_STR, - .default_value = "on", - }, - { .key = {"min-free-disk"}, - .type = GF_OPTION_TYPE_PERCENT_OR_SIZET, - .default_value = "10%", - .description = "Percentage/Size of disk space that must be " - "kept free." 
- }, - { .key = {"unhashed-sticky-bit"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "off", - }, - { .key = {"use-readdirp"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "on", - }, - { .key = {"assert-no-child-down"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "off", - }, - { .key = {"directory-layout-spread"}, - .type = GF_OPTION_TYPE_INT, - }, - { .key = {"decommissioned-bricks"}, - .type = GF_OPTION_TYPE_ANY, - }, - { .key = {NULL} }, -}; +; diff --git a/xlators/cluster/dht/src/nufa.c b/xlators/cluster/dht/src/nufa.c index 2f196951a..e934acdf0 100644 --- a/xlators/cluster/dht/src/nufa.c +++ b/xlators/cluster/dht/src/nufa.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ @@ -27,6 +18,8 @@ /* TODO: all 'TODO's in dht.c holds good */ +extern struct volume_options options[]; + int nufa_local_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, @@ -44,7 +37,6 @@ nufa_local_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int call_cnt = 0; int ret = 0; - conf = this->private; prev = cookie; @@ -62,7 +54,8 @@ nufa_local_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (op_ret == -1) goto out; - is_linkfile = check_is_linkfile (inode, stbuf, xattr); + is_linkfile = check_is_linkfile (inode, stbuf, xattr, + conf->link_xattr_name); is_dir = check_is_dir (inode, stbuf, xattr); if (!is_dir && !is_linkfile) { @@ -141,7 +134,7 @@ out: err: DHT_STACK_UNWIND (lookup, frame, op_ret, op_errno, - inode, stbuf, xattr, NULL); + inode, stbuf, xattr, postparent); return 0; } @@ -211,7 +204,7 @@ nufa_lookup (call_frame_t *frame, xlator_t *this, * revalidates directly go to the cached-subvolume. */ ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht", 4 * 4); + conf->xattr_name, 4 * 4); if (ret < 0) { gf_log (this->name, GF_LOG_ERROR, "Failed to set dict value."); @@ -232,7 +225,7 @@ nufa_lookup (call_frame_t *frame, xlator_t *this, } else { do_fresh_lookup: ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht", 4 * 4); + conf->xattr_name, 4 * 4); if (ret < 0) { gf_log (this->name, GF_LOG_ERROR, "Failed to set dict value."); @@ -241,7 +234,7 @@ nufa_lookup (call_frame_t *frame, xlator_t *this, } ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht.linkto", 256); + conf->link_xattr_name, 256); if (ret < 0) { gf_log (this->name, GF_LOG_ERROR, "Failed to set dict value."); @@ -260,7 +253,8 @@ nufa_lookup (call_frame_t *frame, xlator_t *this, err: op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); + DHT_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, + NULL); return 0; } @@ -269,7 +263,7 @@ nufa_create_linkfile_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { dht_local_t *local = NULL; @@ -280,21 +274,21 @@ nufa_create_linkfile_create_cbk (call_frame_t *frame, void *cookie, STACK_WIND (frame, dht_create_cbk, local->cached_subvol, local->cached_subvol->fops->create, - &local->loc, local->flags, local->mode, local->fd, - local->params); + &local->loc, local->flags, local->mode, local->umask, + local->fd, local->params); return 0; err: DHT_STACK_UNWIND (create, frame, -1, op_errno, - NULL, NULL, NULL, NULL, NULL); + NULL, NULL, NULL, NULL, NULL, NULL); return 0; } int nufa_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, mode_t mode, - fd_t *fd, dict_t *params) + mode_t umask, fd_t *fd, dict_t *params) { dht_local_t *local = NULL; dht_conf_t *conf = NULL; @@ -329,7 +323,8 @@ nufa_create (call_frame_t *frame, xlator_t *this, if (dht_is_subvol_filled (this, (xlator_t *)conf->private)) { avail_subvol = dht_free_disk_available_subvol (this, - (xlator_t *)conf->private); + (xlator_t *)conf->private, + local); } if (subvol != avail_subvol) { @@ -337,11 +332,10 @@ nufa_create (call_frame_t *frame, xlator_t *this, local->params = dict_ref (params); local->mode = mode; local->flags = flags; - + local->umask = umask; local->cached_subvol = avail_subvol; - dht_linkfile_create (frame, - nufa_create_linkfile_create_cbk, - avail_subvol, subvol, loc); + dht_linkfile_create (frame, nufa_create_linkfile_create_cbk, + this, avail_subvol, subvol, loc); return 0; } @@ -350,14 +344,14 @@ nufa_create (call_frame_t *frame, xlator_t *this, STACK_WIND (frame, dht_create_cbk, subvol, 
subvol->fops->create, - loc, flags, mode, fd, params); + loc, flags, mode, umask, fd, params); return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; DHT_STACK_UNWIND (create, frame, -1, op_errno, - NULL, NULL, NULL, NULL, NULL); + NULL, NULL, NULL, NULL, NULL, NULL); return 0; } @@ -366,34 +360,39 @@ int nufa_mknod_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { dht_local_t *local = NULL; local = frame->local; + if (!local || !local->cached_subvol) { + op_errno = EINVAL; + op_ret = -1; + goto err; + } if (op_ret >= 0) { - STACK_WIND (frame, dht_newfile_cbk, - local->cached_subvol, + STACK_WIND_COOKIE (frame, dht_newfile_cbk, + (void *)local->cached_subvol, local->cached_subvol, local->cached_subvol->fops->mknod, &local->loc, local->mode, local->rdev, - local->params); + local->umask, local->params); return 0; } - +err: WIPE (postparent); WIPE (preparent); DHT_STACK_UNWIND (link, frame, op_ret, op_errno, - inode, stbuf, preparent, postparent); + inode, stbuf, preparent, postparent, xdata); return 0; } int nufa_mknod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dev_t rdev, dict_t *params) + loc_t *loc, mode_t mode, dev_t rdev, mode_t umask, dict_t *params) { dht_local_t *local = NULL; dht_conf_t *conf = NULL; @@ -429,7 +428,8 @@ nufa_mknod (call_frame_t *frame, xlator_t *this, if (dht_is_subvol_filled (this, (xlator_t *)conf->private)) { avail_subvol = dht_free_disk_available_subvol (this, - (xlator_t *)conf->private); + (xlator_t *)conf->private, + local); } if (avail_subvol != subvol) { @@ -437,10 +437,11 @@ nufa_mknod (call_frame_t *frame, xlator_t *this, local->params = dict_ref (params); local->mode = mode; + local->umask = umask; local->rdev = rdev; local->cached_subvol = avail_subvol; - dht_linkfile_create (frame, nufa_mknod_linkfile_cbk, + dht_linkfile_create 
(frame, nufa_mknod_linkfile_cbk, this, avail_subvol, subvol, loc); return 0; } @@ -448,211 +449,185 @@ nufa_mknod (call_frame_t *frame, xlator_t *this, gf_log (this->name, GF_LOG_TRACE, "creating %s on %s", loc->path, subvol->name); - STACK_WIND (frame, dht_newfile_cbk, - subvol, subvol->fops->mknod, - loc, mode, rdev, params); + STACK_WIND_COOKIE (frame, dht_newfile_cbk, (void *)subvol, subvol, + subvol->fops->mknod, loc, mode, rdev, umask, + params); return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; DHT_STACK_UNWIND (mknod, frame, -1, op_errno, - NULL, NULL, NULL, NULL); + NULL, NULL, NULL, NULL, NULL); return 0; } -int -notify (xlator_t *this, int event, void *data, ...) +gf_boolean_t +same_first_part (char *str1, char term1, char *str2, char term2) { - int ret = -1; - - ret = dht_notify (this, event, data); - - return ret; + gf_boolean_t ended1; + gf_boolean_t ended2; + + for (;;) { + ended1 = ((*str1 == '\0') || (*str1 == term1)); + ended2 = ((*str2 == '\0') || (*str2 == term2)); + if (ended1 && ended2) { + return _gf_true; + } + if (ended1 || ended2 || (*str1 != *str2)) { + return _gf_false; + } + ++str1; + ++str2; + } } -void -fini (xlator_t *this) -{ - int i = 0; - dht_conf_t *conf = NULL; +typedef struct nufa_args { + xlator_t *this; + char *volname; + gf_boolean_t addr_match; +} nufa_args_t; - conf = this->private; +static void +nufa_find_local_brick (xlator_t *xl, void *data) +{ + nufa_args_t *args = data; + xlator_t *this = args->this; + char *local_volname = args->volname; + gf_boolean_t addr_match = args->addr_match; + char *brick_host = NULL; + dht_conf_t *conf = this->private; + int ret = -1; + + /*This means a local subvol was already found. 
We pick the first brick + * that is local*/ + if (conf->private) + return; + + if (strcmp (xl->name, local_volname) == 0) { + conf->private = xl; + gf_log (this->name, GF_LOG_INFO, "Using specified subvol %s", + local_volname); + return; + } - if (conf) { - if (conf->file_layouts) { - for (i = 0; i < conf->subvolume_cnt; i++) { - GF_FREE (conf->file_layouts[i]); - } - GF_FREE (conf->file_layouts); - } + if (!addr_match) + return; - if (conf->subvolumes) - GF_FREE (conf->subvolumes); + ret = dict_get_str (xl->options, "remote-host", &brick_host); + if ((ret == 0) && + (gf_is_same_address (local_volname, brick_host) || + gf_is_local_addr (brick_host))) { + conf->private = xl; + gf_log (this->name, GF_LOG_INFO, "Using the first local " + "subvol %s", xl->name); + return; + } - if (conf->subvolume_status) - GF_FREE (conf->subvolume_status); +} - GF_FREE (conf); - } +static void +nufa_to_dht (xlator_t *this) +{ + GF_ASSERT (this); + GF_ASSERT (this->fops); - return; + this->fops->lookup = dht_lookup; + this->fops->create = dht_create; + this->fops->mknod = dht_mknod; } int -init (xlator_t *this) +nufa_find_local_subvol (xlator_t *this, + void (*fn) (xlator_t *each, void* data), void *data) { - dht_conf_t *conf = NULL; - xlator_list_t *trav = NULL; - data_t *data = NULL; - char *local_volname = NULL; - char *temp_str = NULL; - int ret = -1; - int i = 0; - char my_hostname[256]; - uint32_t temp_free_disk = 0; - - if (!this->children) { - gf_log (this->name, GF_LOG_CRITICAL, - "NUFA needs more than one subvolume"); + int ret = -1; + dht_conf_t *conf = this->private; + xlator_list_t *trav = NULL; + xlator_t *parent = NULL; + xlator_t *candidate = NULL; + + xlator_foreach_depth_first (this, fn, data); + if (!conf->private) { + gf_log (this->name, GF_LOG_ERROR, "Couldn't find a local " + "brick"); return -1; } - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. 
check volfile"); - } - - conf = GF_CALLOC (1, sizeof (*conf), - gf_dht_mt_dht_conf_t); - if (!conf) { - goto err; - } - - conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_ON; - if (dict_get_str (this->options, "lookup-unhashed", &temp_str) == 0) { - /* If option is not "auto", other options _should_ be boolean */ - if (strcasecmp (temp_str, "auto")) - gf_string2boolean (temp_str, &conf->search_unhashed); - else - conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO; - } + candidate = conf->private; + trav = candidate->parents; + while (trav) { - ret = dht_init_subvolumes (this, conf); - if (ret == -1) { - goto err; - } + parent = trav->xlator; + if (strcmp (parent->type, "cluster/nufa") == 0) { + gf_log (this->name, GF_LOG_INFO, "Found local subvol, " + "%s", candidate->name); + ret = 0; + conf->private = candidate; + break; + } - ret = dht_layouts_init (this, conf); - if (ret == -1) { - goto err; + candidate = parent; + trav = parent->parents; } - LOCK_INIT (&conf->subvolume_lock); - LOCK_INIT (&conf->layout_lock); + return ret; +} - conf->gen = 1; +int +nufa_init (xlator_t *this) +{ + data_t *data = NULL; + char *local_volname = NULL; + int ret = -1; + char my_hostname[256]; + gf_boolean_t addr_match = _gf_false; + nufa_args_t args = {0, }; - local_volname = "localhost"; - ret = gethostname (my_hostname, 256); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "could not find hostname (%s)", - strerror (errno)); + ret = dht_init(this); + if (ret) { + return ret; } - if (ret == 0) - local_volname = my_hostname; - - data = dict_get (this->options, "local-volume-name"); - if (data) { + if ((data = dict_get (this->options, "local-volume-name"))) { local_volname = data->data; - } - trav = this->children; - while (trav) { - if (strcmp (trav->xlator->name, local_volname) == 0) - break; - trav = trav->next; - } + } else { + addr_match = _gf_true; + local_volname = "localhost"; + ret = gethostname (my_hostname, 256); + if (ret == 0) + local_volname = my_hostname; - if 
(!trav) { - gf_log (this->name, GF_LOG_ERROR, - "Could not find subvolume named '%s'. " - "Please define volume with the name as the hostname " - "or override it with 'option local-volume-name'", - local_volname); - goto err; - } - /* The volume specified exists */ - conf->private = trav->xlator; - - conf->min_free_disk = 10; - conf->disk_unit = 'p'; - - if (dict_get_str (this->options, "min-free-disk", - &temp_str) == 0) { - if (gf_string2percent (temp_str, - &temp_free_disk) == 0) { - if (temp_free_disk > 100) { - gf_string2bytesize (temp_str, - &conf->min_free_disk); - conf->disk_unit = 'b'; - } else { - conf->min_free_disk = (uint64_t)temp_free_disk; - conf->disk_unit = 'p'; - } - } else { - gf_string2bytesize (temp_str, - &conf->min_free_disk); - conf->disk_unit = 'b'; - } - } + else + gf_log (this->name, GF_LOG_WARNING, + "could not find hostname (%s)", + strerror (errno)); - conf->du_stats = GF_CALLOC (conf->subvolume_cnt, sizeof (dht_du_t), - gf_dht_mt_dht_du_t); - if (!conf->du_stats) { - goto err; } - /* Create 'syncop' environment */ - conf->env = syncenv_new (0); - if (!conf->env) { - gf_log (this->name, GF_LOG_ERROR, - "failed to create sync environment %s", - strerror (errno)); - goto err; + args.this = this; + args.volname = local_volname; + args.addr_match = addr_match; + ret = nufa_find_local_subvol (this, nufa_find_local_brick, &args); + if (ret) { + gf_log (this->name, GF_LOG_INFO, + "Unable to find local subvolume, switching " + "to dht mode"); + nufa_to_dht (this); } - - this->private = conf; - return 0; +} -err: - if (conf) { - if (conf->file_layouts) { - for (i = 0; i < conf->subvolume_cnt; i++) { - GF_FREE (conf->file_layouts[i]); - } - GF_FREE (conf->file_layouts); - } - - if (conf->subvolumes) - GF_FREE (conf->subvolumes); - - if (conf->subvolume_status) - GF_FREE (conf->subvolume_status); - - if (conf->du_stats) - GF_FREE (conf->du_stats); - - GF_FREE (conf); - } - return -1; -} +class_methods_t class_methods = { + .init = nufa_init, + 
.fini = dht_fini, + .reconfigure = dht_reconfigure, + .notify = dht_notify +}; struct xlator_fops fops = { @@ -699,19 +674,3 @@ struct xlator_fops fops = { struct xlator_cbks cbks = { .forget = dht_forget }; - - -struct volume_options options[] = { - { .key = {"lookup-unhashed"}, - .value = {"auto", "yes", "no", "enable", "disable", "1", "0", - "on", "off"}, - .type = GF_OPTION_TYPE_STR - }, - { .key = {"local-volume-name"}, - .type = GF_OPTION_TYPE_XLATOR - }, - { .key = {"min-free-disk"}, - .type = GF_OPTION_TYPE_PERCENT_OR_SIZET, - }, - { .key = {NULL} }, -}; diff --git a/xlators/cluster/dht/src/switch.c b/xlators/cluster/dht/src/switch.c index fd3f22ea0..d3ea90ba8 100644 --- a/xlators/cluster/dht/src/switch.c +++ b/xlators/cluster/dht/src/switch.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ @@ -31,6 +22,8 @@ #include <fnmatch.h> #include <string.h> +extern struct volume_options options[]; + struct switch_sched_array { xlator_t *xl; int32_t eligible; @@ -76,29 +69,37 @@ get_switch_matching_subvol (const char *path, dht_conf_t *conf, struct switch_struct *cond = NULL; struct switch_struct *trav = NULL; char *pathname = NULL; - int idx = 0; + int idx = 0; + xlator_t *subvol = NULL; cond = conf->private; + subvol = hashed_subvol; if (!cond) - return hashed_subvol; + goto out; - trav = cond; pathname = gf_strdup (path); + if (!pathname) + goto out; + + trav = cond; while (trav) { if (fnmatch (trav->path_pattern, pathname, FNM_NOESCAPE) == 0) { for (idx = 0; idx < trav->num_child; idx++) { if (trav->array[idx].xl == hashed_subvol) - return hashed_subvol; + goto out; } idx = trav->node_index++; trav->node_index %= trav->num_child; - return trav->array[idx].xl; + subvol = trav->array[idx].xl; + goto out; } trav = trav->next; } +out: GF_FREE (pathname); - return hashed_subvol; + + return subvol; } @@ -136,7 +137,8 @@ switch_local_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (op_ret == -1) goto out; - is_linkfile = check_is_linkfile (inode, stbuf, xattr); + is_linkfile = check_is_linkfile (inode, stbuf, xattr, + conf->link_xattr_name); is_dir = check_is_dir (inode, stbuf, xattr); if (!is_dir && !is_linkfile) { @@ -290,11 +292,11 @@ switch_lookup (call_frame_t *frame, xlator_t *this, * attribute, revalidates directly go to the cached-subvolume. 
*/ ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht", 4 * 4); + conf->xattr_name, 4 * 4); if (ret < 0) gf_log (this->name, GF_LOG_WARNING, - "failed to set dict value for " - "trusted.glusterfs.dht"); + "failed to set dict value for %s", + conf->xattr_name); for (i = 0; i < layout->cnt; i++) { subvol = layout->list[i].xlator; @@ -309,18 +311,18 @@ switch_lookup (call_frame_t *frame, xlator_t *this, } else { do_fresh_lookup: ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht", 4 * 4); + conf->xattr_name, 4 * 4); if (ret < 0) gf_log (this->name, GF_LOG_WARNING, - "failed to set dict value for " - "trusted.glusterfs.dht"); + "failed to set dict value for %s", + conf->xattr_name); ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht.linkto", 256); + conf->link_xattr_name, 256); if (ret < 0) gf_log (this->name, GF_LOG_WARNING, - "failed to set dict value for " - "trusted.glusterfs.dht.linkto"); + "failed to set dict value for %s", + conf->link_xattr_name); if (!hashed_subvol) { gf_log (this->name, GF_LOG_DEBUG, @@ -366,7 +368,8 @@ switch_lookup (call_frame_t *frame, xlator_t *this, err: op_errno = (op_errno == -1) ? 
errno : op_errno; - DHT_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); + DHT_STACK_UNWIND (lookup, frame, -1, op_errno, + NULL, NULL, NULL, NULL); return 0; } @@ -375,7 +378,7 @@ switch_create_linkfile_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { dht_local_t *local = NULL; @@ -386,21 +389,21 @@ switch_create_linkfile_create_cbk (call_frame_t *frame, void *cookie, STACK_WIND (frame, dht_create_cbk, local->cached_subvol, local->cached_subvol->fops->create, - &local->loc, local->flags, local->mode, local->fd, - local->params); + &local->loc, local->flags, local->mode, local->umask, + local->fd, local->params); return 0; err: DHT_STACK_UNWIND (create, frame, -1, op_errno, - NULL, NULL, NULL, NULL, NULL); + NULL, NULL, NULL, NULL, NULL, NULL); return 0; } int switch_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, mode_t mode, - fd_t *fd, dict_t *params) + mode_t umask, fd_t *fd, dict_t *params) { dht_local_t *local = NULL; dht_conf_t *conf = NULL; @@ -434,18 +437,18 @@ switch_create (call_frame_t *frame, xlator_t *this, avail_subvol = get_switch_matching_subvol (loc->path, conf, subvol); if (dht_is_subvol_filled (this, avail_subvol)) { avail_subvol = - dht_free_disk_available_subvol (this, avail_subvol); + dht_free_disk_available_subvol (this, avail_subvol, + local); } if (subvol != avail_subvol) { /* create a link file instead of actual file */ local->mode = mode; local->flags = flags; - + local->umask = umask; local->cached_subvol = avail_subvol; - dht_linkfile_create (frame, - switch_create_linkfile_create_cbk, - avail_subvol, subvol, loc); + dht_linkfile_create (frame, switch_create_linkfile_create_cbk, + this, avail_subvol, subvol, loc); return 0; } @@ -454,14 +457,14 @@ switch_create (call_frame_t *frame, xlator_t *this, STACK_WIND (frame, 
dht_create_cbk, subvol, subvol->fops->create, - loc, flags, mode, fd, params); + loc, flags, mode, umask, fd, params); return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; DHT_STACK_UNWIND (create, frame, -1, op_errno, - NULL, NULL, NULL, NULL, NULL); + NULL, NULL, NULL, NULL, NULL, NULL); return 0; } @@ -470,31 +473,36 @@ int switch_mknod_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { dht_local_t *local = NULL; local = frame->local; + if (!local || !local->cached_subvol) { + op_errno = EINVAL; + op_ret = -1; + goto err; + } if (op_ret >= 0) { - STACK_WIND (frame, dht_newfile_cbk, - local->cached_subvol, + STACK_WIND_COOKIE (frame, dht_newfile_cbk, + (void *)local->cached_subvol, local->cached_subvol, local->cached_subvol->fops->mknod, &local->loc, local->mode, local->rdev, - local->params); + local->umask, local->params); return 0; } - +err: DHT_STACK_UNWIND (link, frame, op_ret, op_errno, - inode, stbuf, preparent, postparent); + inode, stbuf, preparent, postparent, xdata); return 0; } int -switch_mknod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dev_t rdev, dict_t *params) +switch_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t rdev, mode_t umask, dict_t *params) { dht_local_t *local = NULL; dht_conf_t *conf = NULL; @@ -529,7 +537,8 @@ switch_mknod (call_frame_t *frame, xlator_t *this, avail_subvol = get_switch_matching_subvol (loc->path, conf, subvol); if (dht_is_subvol_filled (this, avail_subvol)) { avail_subvol = - dht_free_disk_available_subvol (this, avail_subvol); + dht_free_disk_available_subvol (this, avail_subvol, + local); } if (avail_subvol != subvol) { @@ -537,46 +546,36 @@ switch_mknod (call_frame_t *frame, xlator_t *this, local->params = dict_ref (params); local->mode = mode; + local->umask = umask; local->rdev = 
rdev; local->cached_subvol = avail_subvol; dht_linkfile_create (frame, switch_mknod_linkfile_cbk, - avail_subvol, subvol, loc); + this, avail_subvol, subvol, loc); return 0; } gf_log (this->name, GF_LOG_TRACE, "creating %s on %s", loc->path, subvol->name); - STACK_WIND (frame, dht_newfile_cbk, - subvol, subvol->fops->mknod, - loc, mode, rdev, params); + STACK_WIND_COOKIE (frame, dht_newfile_cbk, (void *)subvol, subvol, + subvol->fops->mknod, loc, mode, rdev, umask, + params); return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; DHT_STACK_UNWIND (mknod, frame, -1, op_errno, - NULL, NULL, NULL, NULL); + NULL, NULL, NULL, NULL, NULL); return 0; } -int -notify (xlator_t *this, int event, void *data, ...) -{ - int ret = -1; - - ret = dht_notify (this, event, data); - - return ret; -} - void -fini (xlator_t *this) +switch_fini (xlator_t *this) { - int i = 0; dht_conf_t *conf = NULL; struct switch_struct *trav = NULL; struct switch_struct *prev = NULL; @@ -587,30 +586,14 @@ fini (xlator_t *this) trav = (struct switch_struct *)conf->private; conf->private = NULL; while (trav) { - if (trav->array) - GF_FREE (trav->array); + GF_FREE (trav->array); prev = trav; trav = trav->next; GF_FREE (prev); } - - if (conf->file_layouts) { - for (i = 0; i < conf->subvolume_cnt; i++) { - GF_FREE (conf->file_layouts[i]); - } - GF_FREE (conf->file_layouts); - } - - if (conf->subvolumes) - GF_FREE (conf->subvolumes); - - if (conf->subvolume_status) - GF_FREE (conf->subvolume_status); - - GF_FREE (conf); } - return; + dht_fini(this); } int @@ -670,8 +653,10 @@ set_switch_pattern (xlator_t *this, dht_conf_t *conf, dup_str = gf_strdup (switch_str); switch_opt = GF_CALLOC (1, sizeof (struct switch_struct), gf_switch_mt_switch_struct); - if (!switch_opt) + if (!switch_opt) { + GF_FREE (dup_str); goto err; + } pattern = strtok_r (dup_str, ":", &tmp_str1); childs = strtok_r (NULL, ":", &tmp_str1); @@ -681,6 +666,7 @@ set_switch_pattern (xlator_t *this, dht_conf_t *conf, "for all the 
unconfigured child nodes," " hence neglecting current option"); switch_str = strtok_r (NULL, ";", &tmp_str); + GF_FREE (switch_opt); GF_FREE (dup_str); continue; } @@ -753,6 +739,7 @@ set_switch_pattern (xlator_t *this, dht_conf_t *conf, /* First entry */ switch_buf = switch_opt; } + switch_opt = NULL; switch_str = strtok_r (NULL, ";", &tmp_str); } @@ -809,19 +796,20 @@ set_switch_pattern (xlator_t *this, dht_conf_t *conf, /* First entry */ switch_buf = switch_opt; } + switch_opt = NULL; } /* */ conf->private = switch_buf; return 0; err: + GF_FREE (switch_buf_array); + GF_FREE (switch_opt); + if (switch_buf) { - if (switch_buf_array) - GF_FREE (switch_buf_array); trav = switch_buf; while (trav) { - if (trav->array) - GF_FREE (trav->array); + GF_FREE (trav->array); switch_opt = trav; trav = trav->next; GF_FREE (switch_opt); @@ -831,68 +819,18 @@ err: } -int -init (xlator_t *this) +int32_t +switch_init (xlator_t *this) { dht_conf_t *conf = NULL; data_t *data = NULL; - char *temp_str = NULL; int ret = -1; - int i = 0; - uint32_t temp_free_disk = 0; - - if (!this->children) { - gf_log (this->name, GF_LOG_CRITICAL, - "SWITCH needs more than one subvolume"); - return -1; - } - - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. 
check volfile"); - } - - conf = GF_CALLOC (1, sizeof (*conf), gf_switch_mt_dht_conf_t); - if (!conf) { - goto err; - } - - conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_ON; - if (dict_get_str (this->options, "lookup-unhashed", &temp_str) == 0) { - /* If option is not "auto", other options _should_ be boolean */ - if (strcasecmp (temp_str, "auto")) - gf_string2boolean (temp_str, &conf->search_unhashed); - else - conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO; - } - conf->unhashed_sticky_bit = 0; - if (dict_get_str (this->options, "unhashed-sticky-bit", - &temp_str) == 0) { - gf_string2boolean (temp_str, &conf->unhashed_sticky_bit); - } - - conf->min_free_disk = 10; - conf->disk_unit = 'p'; - - if (dict_get_str (this->options, "min-free-disk", - &temp_str) == 0) { - if (gf_string2percent (temp_str, - &temp_free_disk) == 0) { - if (temp_free_disk > 100) { - gf_string2bytesize (temp_str, - &conf->min_free_disk); - conf->disk_unit = 'b'; - } else { - conf->min_free_disk = (uint64_t)temp_free_disk; - conf->disk_unit = 'p'; - } - } else { - gf_string2bytesize (temp_str, - &conf->min_free_disk); - conf->disk_unit = 'b'; - } + ret = dht_init(this); + if (ret) { + return ret; } + conf = this->private; data = dict_get (this->options, "pattern.switch.case"); if (data) { @@ -903,65 +841,23 @@ init (xlator_t *this) } } - ret = dht_init_subvolumes (this, conf); - if (ret == -1) { - goto err; - } - - ret = dht_layouts_init (this, conf); - if (ret == -1) { - goto err; - } - - LOCK_INIT (&conf->subvolume_lock); - LOCK_INIT (&conf->layout_lock); - - conf->gen = 1; - - conf->du_stats = GF_CALLOC (conf->subvolume_cnt, sizeof (dht_du_t), - gf_switch_mt_dht_du_t); - if (!conf->du_stats) { - goto err; - } - - /* Create 'syncop' environment */ - conf->env = syncenv_new (0); - if (!conf->env) { - gf_log (this->name, GF_LOG_ERROR, - "failed to create sync environment %s", - strerror (errno)); - goto err; - } - this->private = conf; - return 0; err: - if (conf) { - if 
(conf->file_layouts) { - for (i = 0; i < conf->subvolume_cnt; i++) { - GF_FREE (conf->file_layouts[i]); - } - GF_FREE (conf->file_layouts); - } - - if (conf->subvolumes) - GF_FREE (conf->subvolumes); - - if (conf->subvolume_status) - GF_FREE (conf->subvolume_status); - - if (conf->du_stats) - GF_FREE (conf->du_stats); - - GF_FREE (conf); - } - + dht_fini(this); return -1; } +class_methods_t class_methods = { + .init = switch_init, + .fini = switch_fini, + .reconfigure = dht_reconfigure, + .notify = dht_notify +}; + + struct xlator_fops fops = { .lookup = switch_lookup, .create = switch_create, @@ -1006,19 +902,3 @@ struct xlator_fops fops = { struct xlator_cbks cbks = { .forget = dht_forget }; - - -struct volume_options options[] = { - { .key = {"lookup-unhashed"}, - .value = {"auto", "yes", "no", "enable", "disable", "1", "0", - "on", "off"}, - .type = GF_OPTION_TYPE_STR - }, - { .key = {"pattern.switch.case"}, - .type = GF_OPTION_TYPE_ANY - }, - { .key = {"min-free-disk"}, - .type = GF_OPTION_TYPE_PERCENT_OR_SIZET, - }, - { .key = {NULL} }, -}; diff --git a/xlators/cluster/ha/src/Makefile.am b/xlators/cluster/ha/src/Makefile.am index 5f78a2965..5c1364b7f 100644 --- a/xlators/cluster/ha/src/Makefile.am +++ b/xlators/cluster/ha/src/Makefile.am @@ -1,15 +1,16 @@ xlator_LTLIBRARIES = ha.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/testing/cluster -ha_la_LDFLAGS = -module -avoidversion +ha_la_LDFLAGS = -module -avoid-version ha_la_SOURCES = ha-helpers.c ha.c ha_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la noinst_HEADERS = ha.h -AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ - -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) CLEANFILES = diff --git a/xlators/cluster/ha/src/ha-helpers.c b/xlators/cluster/ha/src/ha-helpers.c index 1e4af1b62..19be1ed27 100644 --- 
a/xlators/cluster/ha/src/ha-helpers.c +++ b/xlators/cluster/ha/src/ha-helpers.c @@ -1,22 +1,12 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ #include "xlator.h" #include "call-stub.h" #include "defaults.h" diff --git a/xlators/cluster/ha/src/ha-mem-types.h b/xlators/cluster/ha/src/ha-mem-types.h index 9bfb3972b..e5e97d237 100644 --- a/xlators/cluster/ha/src/ha-mem-types.h +++ b/xlators/cluster/ha/src/ha-mem-types.h @@ -1,24 +1,13 @@ - /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. 
- - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ - #ifndef __HA_MEM_TYPES_H__ #define __HA_MEM_TYPES_H__ diff --git a/xlators/cluster/ha/src/ha.c b/xlators/cluster/ha/src/ha.c index 38d4229d3..3eccb516b 100644 --- a/xlators/cluster/ha/src/ha.c +++ b/xlators/cluster/ha/src/ha.c @@ -1,22 +1,12 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ /* generate errors randomly, code is simple now, better alogorithm * can be written to decide what error to be returned and when */ @@ -1876,13 +1866,9 @@ err: } if (hafdp) { - if (hafdp->fdstate) { - GF_FREE (hafdp->fdstate); - } + GF_FREE (hafdp->fdstate); - if (hafdp->path) { - GF_FREE (hafdp->path); - } + GF_FREE (hafdp->path); GF_FREE (hafdp); } diff --git a/xlators/cluster/ha/src/ha.h b/xlators/cluster/ha/src/ha.h index 39b6851e7..e2ed7eaa6 100644 --- a/xlators/cluster/ha/src/ha.h +++ b/xlators/cluster/ha/src/ha.h @@ -1,22 +1,12 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ #ifndef __HA_H_ #define __HA_H_ diff --git a/xlators/cluster/map/src/Makefile.am b/xlators/cluster/map/src/Makefile.am index 26e19137a..a278b05e2 100644 --- a/xlators/cluster/map/src/Makefile.am +++ b/xlators/cluster/map/src/Makefile.am @@ -1,15 +1,16 @@ xlator_LTLIBRARIES = map.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/testing/cluster -map_la_LDFLAGS = -module -avoidversion +map_la_LDFLAGS = -module -avoid-version map_la_SOURCES = map.c map-helper.c map_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la noinst_HEADERS = map.h -AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ - -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) CLEANFILES = diff --git a/xlators/cluster/map/src/map-helper.c b/xlators/cluster/map/src/map-helper.c index 81212fcfd..851397b68 100644 --- a/xlators/cluster/map/src/map-helper.c +++ b/xlators/cluster/map/src/map-helper.c @@ -1,22 +1,12 @@ /* - Copyright (c) 2009-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ + Copyright (c) 2009-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. 
+ This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ #ifndef _CONFIG_H #define _CONFIG_H #include "config.h" diff --git a/xlators/cluster/map/src/map-mem-types.h b/xlators/cluster/map/src/map-mem-types.h index 669b93dc2..3e89f4736 100644 --- a/xlators/cluster/map/src/map-mem-types.h +++ b/xlators/cluster/map/src/map-mem-types.h @@ -1,24 +1,13 @@ - /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ - #ifndef __MAP_MEM_TYPES_H__ #define __MAP_MEM_TYPES_H__ diff --git a/xlators/cluster/map/src/map.c b/xlators/cluster/map/src/map.c index ead9da0b9..6150a33ce 100644 --- a/xlators/cluster/map/src/map.c +++ b/xlators/cluster/map/src/map.c @@ -1,22 +1,12 @@ /* - Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. 
- - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ + Copyright (c) 2010-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ #ifndef _CONFIG_H #define _CONFIG_H #include "config.h" @@ -2375,8 +2365,7 @@ fini (xlator_t *this) priv = this->private; if (priv) { - if (priv->xlarray) - GF_FREE (priv->xlarray); + GF_FREE (priv->xlarray); trav_map = priv->map; while (trav_map) { diff --git a/xlators/cluster/map/src/map.h b/xlators/cluster/map/src/map.h index bccac437c..7703a543e 100644 --- a/xlators/cluster/map/src/map.h +++ b/xlators/cluster/map/src/map.h @@ -1,22 +1,12 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. 
- - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ - #ifndef __MAP_H__ #define __MAP_H__ diff --git a/xlators/cluster/stripe/src/Makefile.am b/xlators/cluster/stripe/src/Makefile.am index 0db3c9eeb..2d151422a 100644 --- a/xlators/cluster/stripe/src/Makefile.am +++ b/xlators/cluster/stripe/src/Makefile.am @@ -2,16 +2,19 @@ xlator_LTLIBRARIES = stripe.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster -stripe_la_LDFLAGS = -module -avoidversion +stripe_la_LDFLAGS = -module -avoid-version + +stripe_la_SOURCES = stripe.c stripe-helpers.c \ + $(top_builddir)/xlators/lib/src/libxlator.c -stripe_la_SOURCES = stripe.c $(top_builddir)/xlators/lib/src/libxlator.c stripe_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la noinst_HEADERS = stripe.h stripe-mem-types.h $(top_builddir)/xlators/lib/src/libxlator.h -AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\ - -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) \ +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ -I$(top_srcdir)/xlators/lib/src +AM_CFLAGS = -Wall $(GF_CFLAGS) + CLEANFILES = diff --git a/xlators/cluster/stripe/src/stripe-helpers.c b/xlators/cluster/stripe/src/stripe-helpers.c new file mode 100644 index 000000000..a83abdc72 --- /dev/null +++ b/xlators/cluster/stripe/src/stripe-helpers.c @@ -0,0 +1,675 @@ +/* + Copyright (c) 2008-2012 Red Hat, 
Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include <fnmatch.h> + +#include "stripe.h" +#include "byte-order.h" +#include "mem-types.h" + +void +stripe_local_wipe (stripe_local_t *local) +{ + if (!local) + goto out; + + loc_wipe (&local->loc); + loc_wipe (&local->loc2); + + if (local->fd) + fd_unref (local->fd); + + if (local->inode) + inode_unref (local->inode); + + if (local->xattr) + dict_unref (local->xattr); + + if (local->xdata) + dict_unref (local->xdata); + +out: + return; +} + + + +int +stripe_aggregate (dict_t *this, char *key, data_t *value, void *data) +{ + dict_t *dst = NULL; + int64_t *ptr = 0, *size = NULL; + int32_t ret = -1; + + dst = data; + + if (strcmp (key, GF_XATTR_QUOTA_SIZE_KEY) == 0) { + ret = dict_get_bin (dst, key, (void **)&size); + if (ret < 0) { + size = GF_CALLOC (1, sizeof (int64_t), + gf_common_mt_char); + if (size == NULL) { + gf_log ("stripe", GF_LOG_WARNING, + "memory allocation failed"); + goto out; + } + ret = dict_set_bin (dst, key, size, sizeof (int64_t)); + if (ret < 0) { + gf_log ("stripe", GF_LOG_WARNING, + "stripe aggregate dict set failed"); + GF_FREE (size); + goto out; + } + } + + ptr = data_to_bin (value); + if (ptr == NULL) { + gf_log ("stripe", GF_LOG_WARNING, "data to bin failed"); + goto out; + } + + *size = hton64 (ntoh64 (*size) + ntoh64 (*ptr)); + } else if (strcmp (key, GF_CONTENT_KEY)) { + /* No need to aggregate 'CONTENT' data */ + ret = dict_set (dst, key, value); + if (ret) + gf_log ("stripe", GF_LOG_WARNING, "xattr dict set failed"); + } + +out: + return 0; +} + + +void +stripe_aggregate_xattr (dict_t *dst, dict_t *src) +{ + if ((dst == NULL) || (src == NULL)) { + goto out; + } + + dict_foreach (src, stripe_aggregate, 
dst); +out: + return; +} + + +int32_t +stripe_xattr_aggregate (char *buffer, stripe_local_t *local, int32_t *total) +{ + int32_t i = 0; + int32_t ret = -1; + int32_t len = 0; + char *sbuf = NULL; + stripe_xattr_sort_t *xattr = NULL; + + if (!buffer || !local || !local->xattr_list) + goto out; + + sbuf = buffer; + + for (i = 0; i < local->nallocs; i++) { + xattr = local->xattr_list + i; + len = xattr->xattr_len; + + if (len && xattr && xattr->xattr_value) { + memcpy (buffer, xattr->xattr_value, len); + buffer += len; + *buffer++ = ' '; + } + } + + *--buffer = '\0'; + if (total) + *total = buffer - sbuf; + ret = 0; + + out: + return ret; +} + +int32_t +stripe_free_xattr_str (stripe_local_t *local) +{ + int32_t i = 0; + int32_t ret = -1; + stripe_xattr_sort_t *xattr = NULL; + + if (!local || !local->xattr_list) + goto out; + + for (i = 0; i < local->nallocs; i++) { + xattr = local->xattr_list + i; + + if (xattr && xattr->xattr_value) + GF_FREE (xattr->xattr_value); + } + + ret = 0; + out: + return ret; +} + + +int32_t +stripe_fill_lockinfo_xattr (xlator_t *this, stripe_local_t *local, + void **xattr_serz) +{ + int32_t ret = -1, i = 0, len = 0; + dict_t *tmp1 = NULL, *tmp2 = NULL; + char *buf = NULL; + stripe_xattr_sort_t *xattr = NULL; + + if (xattr_serz == NULL) { + goto out; + } + + tmp2 = dict_new (); + + if (tmp2 == NULL) { + goto out; + } + + for (i = 0; i < local->nallocs; i++) { + xattr = local->xattr_list + i; + len = xattr->xattr_len; + + if (len && xattr && xattr->xattr_value) { + ret = dict_reset (tmp2); + if (ret < 0) { + gf_log (this->name, GF_LOG_DEBUG, + "dict_reset failed (%s)", + strerror (-ret)); + } + + ret = dict_unserialize (xattr->xattr_value, + xattr->xattr_len, + &tmp2); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "dict_unserialize failed (%s)", + strerror (-ret)); + ret = -1; + goto out; + } + + tmp1 = dict_copy (tmp2, tmp1); + if (tmp1 == NULL) { + gf_log (this->name, GF_LOG_WARNING, + "dict_copy failed (%s)", + strerror (-ret)); 
+ ret = -1; + goto out; + } + } + } + + len = dict_serialized_length (tmp1); + if (len > 0) { + buf = GF_CALLOC (1, len, gf_common_mt_dict_t); + if (buf == NULL) { + ret = -1; + goto out; + } + + ret = dict_serialize (tmp1, buf); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "dict_serialize failed (%s)", strerror (-ret)); + ret = -1; + goto out; + } + + *xattr_serz = buf; + } + + ret = 0; +out: + if (tmp1 != NULL) { + dict_unref (tmp1); + } + + if (tmp2 != NULL) { + dict_unref (tmp2); + } + + return ret; +} + + +int32_t +stripe_fill_pathinfo_xattr (xlator_t *this, stripe_local_t *local, + char **xattr_serz) +{ + int ret = -1; + int32_t padding = 0; + int32_t tlen = 0; + char stripe_size_str[20] = {0,}; + char *pathinfo_serz = NULL; + + if (!local) { + gf_log (this->name, GF_LOG_ERROR, "Possible NULL deref"); + goto out; + } + + (void) snprintf (stripe_size_str, 20, "%ld", + (local->fctx) ? local->fctx->stripe_size : 0); + + /* extra bytes for decorations (brackets and <>'s) */ + padding = strlen (this->name) + strlen (STRIPE_PATHINFO_HEADER) + + strlen (stripe_size_str) + 7; + local->xattr_total_len += (padding + 2); + + pathinfo_serz = GF_CALLOC (local->xattr_total_len, sizeof (char), + gf_common_mt_char); + if (!pathinfo_serz) + goto out; + + /* xlator info */ + (void) sprintf (pathinfo_serz, "(<"STRIPE_PATHINFO_HEADER"%s:[%s]> ", + this->name, stripe_size_str); + + ret = stripe_xattr_aggregate (pathinfo_serz + padding, local, &tlen); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Cannot aggregate pathinfo list"); + goto out; + } + + *(pathinfo_serz + padding + tlen) = ')'; + *(pathinfo_serz + padding + tlen + 1) = '\0'; + + *xattr_serz = pathinfo_serz; + + ret = 0; + out: + return ret; +} + +/** + * stripe_get_matching_bs - Get the matching block size for the given path. 
+ */ +int32_t +stripe_get_matching_bs (const char *path, stripe_private_t *priv) +{ + struct stripe_options *trav = NULL; + uint64_t block_size = 0; + + GF_VALIDATE_OR_GOTO ("stripe", priv, out); + GF_VALIDATE_OR_GOTO ("stripe", path, out); + + LOCK (&priv->lock); + { + block_size = priv->block_size; + trav = priv->pattern; + while (trav) { + if (!fnmatch (trav->path_pattern, path, FNM_NOESCAPE)) { + block_size = trav->block_size; + break; + } + trav = trav->next; + } + } + UNLOCK (&priv->lock); + +out: + return block_size; +} + +int32_t +stripe_ctx_handle (xlator_t *this, call_frame_t *prev, stripe_local_t *local, + dict_t *dict) +{ + char key[256] = {0,}; + data_t *data = NULL; + int32_t index = 0; + stripe_private_t *priv = NULL; + + priv = this->private; + + + if (!local->fctx) { + local->fctx = GF_CALLOC (1, sizeof (stripe_fd_ctx_t), + gf_stripe_mt_stripe_fd_ctx_t); + if (!local->fctx) { + local->op_errno = ENOMEM; + local->op_ret = -1; + goto out; + } + + local->fctx->static_array = 0; + } + /* Stripe block size */ + sprintf (key, "trusted.%s.stripe-size", this->name); + data = dict_get (dict, key); + if (!data) { + local->xattr_self_heal_needed = 1; + gf_log (this->name, GF_LOG_ERROR, + "Failed to get stripe-size"); + goto out; + } else { + if (!local->fctx->stripe_size) { + local->fctx->stripe_size = + data_to_int64 (data); + } + + if (local->fctx->stripe_size != data_to_int64 (data)) { + gf_log (this->name, GF_LOG_WARNING, + "stripe-size mismatch in blocks"); + local->xattr_self_heal_needed = 1; + } + } + + /* Stripe count */ + sprintf (key, "trusted.%s.stripe-count", this->name); + data = dict_get (dict, key); + + if (!data) { + local->xattr_self_heal_needed = 1; + gf_log (this->name, GF_LOG_ERROR, + "Failed to get stripe-count"); + goto out; + } + if (!local->fctx->xl_array) { + local->fctx->stripe_count = data_to_int32 (data); + if (!local->fctx->stripe_count) { + gf_log (this->name, GF_LOG_ERROR, + "error with stripe-count xattr"); + local->op_ret = 
-1; + local->op_errno = EIO; + goto out; + } + + local->fctx->xl_array = GF_CALLOC (local->fctx->stripe_count, + sizeof (xlator_t *), + gf_stripe_mt_xlator_t); + + if (!local->fctx->xl_array) { + local->op_errno = ENOMEM; + local->op_ret = -1; + goto out; + } + } + if (local->fctx->stripe_count != data_to_int32 (data)) { + gf_log (this->name, GF_LOG_ERROR, + "error with stripe-count xattr (%d != %d)", + local->fctx->stripe_count, data_to_int32 (data)); + local->op_ret = -1; + local->op_errno = EIO; + goto out; + } + + /* index */ + sprintf (key, "trusted.%s.stripe-index", this->name); + data = dict_get (dict, key); + if (!data) { + local->xattr_self_heal_needed = 1; + gf_log (this->name, GF_LOG_ERROR, + "Failed to get stripe-index"); + goto out; + } + index = data_to_int32 (data); + if (index > priv->child_count) { + gf_log (this->name, GF_LOG_ERROR, + "error with stripe-index xattr (%d)", index); + local->op_ret = -1; + local->op_errno = EIO; + goto out; + } + if (local->fctx->xl_array) { + if (!local->fctx->xl_array[index]) + local->fctx->xl_array[index] = prev->this; + } + + sprintf(key, "trusted.%s.stripe-coalesce", this->name); + data = dict_get(dict, key); + if (!data) { + /* + * The file was probably created prior to coalesce support. + * Assume non-coalesce mode for this file to maintain backwards + * compatibility. 
+ */ + gf_log(this->name, GF_LOG_DEBUG, "missing stripe-coalesce " + "attr, assume non-coalesce mode"); + local->fctx->stripe_coalesce = 0; + } else { + local->fctx->stripe_coalesce = data_to_int32(data); + } + + +out: + return 0; +} + +int32_t +stripe_xattr_request_build (xlator_t *this, dict_t *dict, uint64_t stripe_size, + uint32_t stripe_count, uint32_t stripe_index, + uint32_t stripe_coalesce) +{ + char key[256] = {0,}; + int32_t ret = -1; + + sprintf (key, "trusted.%s.stripe-size", this->name); + ret = dict_set_int64 (dict, key, stripe_size); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "failed to set %s in xattr_req dict", key); + goto out; + } + + sprintf (key, "trusted.%s.stripe-count", this->name); + ret = dict_set_int32 (dict, key, stripe_count); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "failed to set %s in xattr_req dict", key); + goto out; + } + + sprintf (key, "trusted.%s.stripe-index", this->name); + ret = dict_set_int32 (dict, key, stripe_index); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "failed to set %s in xattr_req dict", key); + goto out; + } + + sprintf(key, "trusted.%s.stripe-coalesce", this->name); + ret = dict_set_int32(dict, key, stripe_coalesce); + if (ret) { + gf_log(this->name, GF_LOG_WARNING, + "failed to set %s in xattr_req_dict", key); + goto out; + } +out: + return ret; +} + + +static int +set_default_block_size (stripe_private_t *priv, char *num) +{ + + int ret = -1; + GF_VALIDATE_OR_GOTO ("stripe", THIS, out); + GF_VALIDATE_OR_GOTO (THIS->name, priv, out); + GF_VALIDATE_OR_GOTO (THIS->name, num, out); + + + if (gf_string2bytesize (num, &priv->block_size) != 0) { + gf_log (THIS->name, GF_LOG_ERROR, + "invalid number format \"%s\"", num); + goto out; + } + + ret = 0; + + out: + return ret; + +} + + +int +set_stripe_block_size (xlator_t *this, stripe_private_t *priv, char *data) +{ + int ret = -1; + char *tmp_str = NULL; + char *tmp_str1 = NULL; + char *dup_str = NULL; + char *stripe_str = NULL; + char 
*pattern = NULL; + char *num = NULL; + struct stripe_options *temp_stripeopt = NULL; + struct stripe_options *stripe_opt = NULL; + + if (!this || !priv || !data) + goto out; + + /* Get the pattern for striping. + "option block-size *avi:10MB" etc */ + stripe_str = strtok_r (data, ",", &tmp_str); + while (stripe_str) { + dup_str = gf_strdup (stripe_str); + stripe_opt = GF_CALLOC (1, sizeof (struct stripe_options), + gf_stripe_mt_stripe_options); + if (!stripe_opt) { + goto out; + } + + pattern = strtok_r (dup_str, ":", &tmp_str1); + num = strtok_r (NULL, ":", &tmp_str1); + if (!num) { + num = pattern; + pattern = "*"; + ret = set_default_block_size (priv, num); + if (ret) + goto out; + } + if (gf_string2bytesize (num, &stripe_opt->block_size) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "invalid number format \"%s\"", num); + goto out; + } + + if (stripe_opt->block_size < STRIPE_MIN_BLOCK_SIZE) { + gf_log (this->name, GF_LOG_ERROR, "Invalid Block-size: " + "%s. Should be atleast %llu bytes", num, + STRIPE_MIN_BLOCK_SIZE); + goto out; + } + if (stripe_opt->block_size % 512) { + gf_log (this->name, GF_LOG_ERROR, "Block-size: %s should" + " be a multiple of 512 bytes", num); + goto out; + } + + memcpy (stripe_opt->path_pattern, pattern, strlen (pattern)); + + gf_log (this->name, GF_LOG_DEBUG, + "block-size : pattern %s : size %"PRId64, + stripe_opt->path_pattern, stripe_opt->block_size); + + if (priv->pattern) + temp_stripeopt = NULL; + else + temp_stripeopt = priv->pattern; + + stripe_opt->next = temp_stripeopt; + + priv->pattern = stripe_opt; + stripe_opt = NULL; + + GF_FREE (dup_str); + dup_str = NULL; + + stripe_str = strtok_r (NULL, ",", &tmp_str); + } + + ret = 0; +out: + + GF_FREE (dup_str); + + GF_FREE (stripe_opt); + + return ret; +} + +int32_t +stripe_iatt_merge (struct iatt *from, struct iatt *to) +{ + if (to->ia_size < from->ia_size) + to->ia_size = from->ia_size; + if (to->ia_mtime < from->ia_mtime) + to->ia_mtime = from->ia_mtime; + if (to->ia_ctime < 
from->ia_ctime) + to->ia_ctime = from->ia_ctime; + if (to->ia_atime < from->ia_atime) + to->ia_atime = from->ia_atime; + return 0; +} + +off_t +coalesced_offset(off_t offset, uint64_t stripe_size, int stripe_count) +{ + size_t line_size = 0; + uint64_t stripe_num = 0; + off_t coalesced_offset = 0; + + line_size = stripe_size * stripe_count; + stripe_num = offset / line_size; + + coalesced_offset = (stripe_num * stripe_size) + + (offset % stripe_size); + + return coalesced_offset; +} + +off_t +uncoalesced_size(off_t size, uint64_t stripe_size, int stripe_count, + int stripe_index) +{ + uint64_t nr_full_stripe_chunks = 0, mod = 0; + + if (!size) + return size; + + /* + * Estimate the number of fully written stripes from the + * local file size. Each stripe_size chunk corresponds to + * a stripe. + */ + nr_full_stripe_chunks = (size / stripe_size) * stripe_count; + mod = size % stripe_size; + + if (!mod) { + /* + * There is no remainder, thus we could have overestimated + * the size of the file in terms of chunks. Trim the number + * of chunks by the following stripe members and leave it + * up to those nodes to respond with a larger size (if + * necessary). + */ + nr_full_stripe_chunks -= stripe_count - + (stripe_index + 1); + size = nr_full_stripe_chunks * stripe_size; + } else { + /* + * There is a remainder and thus we own the last chunk of the + * file. Add the preceding stripe members of the final stripe + * along with the remainder to calculate the exact size. + */ + nr_full_stripe_chunks += stripe_index; + size = nr_full_stripe_chunks * stripe_size + mod; + } + + return size; +} + diff --git a/xlators/cluster/stripe/src/stripe-mem-types.h b/xlators/cluster/stripe/src/stripe-mem-types.h index 29c95c257..e9ac9cf46 100644 --- a/xlators/cluster/stripe/src/stripe-mem-types.h +++ b/xlators/cluster/stripe/src/stripe-mem-types.h @@ -1,21 +1,11 @@ - /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. 
- - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ @@ -25,12 +15,12 @@ #include "mem-types.h" enum gf_stripe_mem_types_ { - gf_stripe_mt_stripe_local_t = gf_common_mt_end + 1, - gf_stripe_mt_iovec, - gf_stripe_mt_readv_replies, + gf_stripe_mt_iovec = gf_common_mt_end + 1, + gf_stripe_mt_stripe_replies, gf_stripe_mt_stripe_fd_ctx_t, gf_stripe_mt_char, gf_stripe_mt_int8_t, + gf_stripe_mt_int32_t, gf_stripe_mt_xlator_t, gf_stripe_mt_stripe_private_t, gf_stripe_mt_stripe_options, diff --git a/xlators/cluster/stripe/src/stripe.c b/xlators/cluster/stripe/src/stripe.c index 1bea7a733..69b510e23 100644 --- a/xlators/cluster/stripe/src/stripe.c +++ b/xlators/cluster/stripe/src/stripe.c @@ -1,25 +1,16 @@ /* - Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. 
- GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ /** * xlators/cluster/stripe: - * Stripe translator, stripes the data accross its child nodes, + * Stripe translator, stripes the data across its child nodes, * as per the options given in the volfile. The striping works * fairly simple. It writes files at different offset as per * calculation. So, 'ls -l' output at the real posix level will @@ -32,6 +23,7 @@ * very much necessary, or else, use it in combination with AFR, to have a * backup copy. */ +#include <fnmatch.h> #include "stripe.h" #include "libxlator.h" @@ -40,73 +32,10 @@ struct volume_options options[]; -void -stripe_local_wipe (stripe_local_t *local) -{ - if (!local) - goto out; - - loc_wipe (&local->loc); - loc_wipe (&local->loc2); - - if (local->fd) - fd_unref (local->fd); - - if (local->inode) - inode_unref (local->inode); - - if (local->xattr) - dict_unref (local->xattr); - - if (local->dict) - dict_unref (local->dict); - -out: - return; -} - -/** - * stripe_get_matching_bs - Get the matching block size for the given path. 
- */ -int32_t -stripe_get_matching_bs (const char *path, struct stripe_options *opts, - uint64_t default_bs) -{ - struct stripe_options *trav = NULL; - char *pathname = NULL; - uint64_t block_size = 0; - - block_size = default_bs; - - if (!path || !opts) - goto out; - - /* FIXME: is a strdup really necessary? */ - pathname = gf_strdup (path); - if (!pathname) - goto out; - - trav = opts; - while (trav) { - if (!fnmatch (trav->path_pattern, pathname, FNM_NOESCAPE)) { - block_size = trav->block_size; - break; - } - trav = trav->next; - } - - GF_FREE (pathname); - -out: - return block_size; -} - - - int32_t stripe_sh_chown_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop) + struct iatt *preop, struct iatt *postop, dict_t *xdata) { int callcnt = -1; stripe_local_t *local = NULL; @@ -135,7 +64,7 @@ int32_t stripe_sh_make_entry_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { stripe_local_t *local = NULL; call_frame_t *prev = NULL; @@ -150,7 +79,7 @@ stripe_sh_make_entry_cbk (call_frame_t *frame, void *cookie, xlator_t *this, STACK_WIND (frame, stripe_sh_chown_cbk, prev->this, prev->this->fops->setattr, &local->loc, - &local->stbuf, (GF_SET_ATTR_UID | GF_SET_ATTR_GID)); + &local->stbuf, (GF_SET_ATTR_UID | GF_SET_ATTR_GID), NULL); out: return 0; @@ -164,7 +93,7 @@ stripe_entry_self_heal (call_frame_t *frame, xlator_t *this, call_frame_t *rframe = NULL; stripe_local_t *rlocal = NULL; stripe_private_t *priv = NULL; - dict_t *dict = NULL; + dict_t *xdata = NULL; int ret = 0; if (!local || !this || !frame) { @@ -182,8 +111,7 @@ stripe_entry_self_heal (call_frame_t *frame, xlator_t *this, if (!rframe) { goto out; } - rlocal = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + rlocal = mem_get0 
(this->local_pool); if (!rlocal) { goto out; } @@ -192,11 +120,11 @@ stripe_entry_self_heal (call_frame_t *frame, xlator_t *this, loc_copy (&rlocal->loc, &local->loc); memcpy (&rlocal->stbuf, &local->stbuf, sizeof (struct iatt)); - dict = dict_new (); - if (!dict) + xdata = dict_new (); + if (!xdata) goto out; - ret = dict_set_static_bin (dict, "gfid-req", local->stbuf.ia_gfid, 16); + ret = dict_set_static_bin (xdata, "gfid-req", local->stbuf.ia_gfid, 16); if (ret) gf_log (this->name, GF_LOG_WARNING, "%s: failed to set gfid-req", local->loc.path); @@ -207,101 +135,43 @@ stripe_entry_self_heal (call_frame_t *frame, xlator_t *this, trav->xlator, trav->xlator->fops->mknod, &local->loc, st_mode_from_ia (local->stbuf.ia_prot, - local->stbuf.ia_type), 0, - dict); + local->stbuf.ia_type), + 0, 0, xdata); } if (IA_ISDIR (local->stbuf.ia_type)) { STACK_WIND (rframe, stripe_sh_make_entry_cbk, trav->xlator, trav->xlator->fops->mkdir, - &local->loc, st_mode_from_ia (local->stbuf.ia_prot, - local->stbuf.ia_type), - dict); + &local->loc, + st_mode_from_ia (local->stbuf.ia_prot, + local->stbuf.ia_type), + 0, xdata); } trav = trav->next; } - if (dict) - dict_unref (dict); + if (xdata) + dict_unref (xdata); return 0; out: if (rframe) STRIPE_STACK_DESTROY (rframe); - if (dict) - dict_unref (dict); + if (xdata) + dict_unref (xdata); return 0; } -void -stripe_aggregate (dict_t *this, char *key, data_t *value, void *data) -{ - dict_t *dst = NULL; - int64_t *ptr = 0, *size = NULL; - int32_t ret = -1; - - dst = data; - - if (strcmp (key, GF_XATTR_QUOTA_SIZE_KEY) == 0) { - ret = dict_get_bin (dst, key, (void **)&size); - if (ret < 0) { - size = GF_CALLOC (1, sizeof (int64_t), - gf_common_mt_char); - if (size == NULL) { - gf_log ("stripe", GF_LOG_WARNING, - "memory allocation failed"); - goto out; - } - ret = dict_set_bin (dst, key, size, sizeof (int64_t)); - if (ret < 0) { - gf_log ("stripe", GF_LOG_WARNING, - "stripe aggregate dict set failed"); - GF_FREE (size); - goto out; - } - } - - 
ptr = data_to_bin (value); - if (ptr == NULL) { - gf_log ("stripe", GF_LOG_WARNING, "data to bin failed"); - goto out; - } - - *size = hton64 (ntoh64 (*size) + ntoh64 (*ptr)); - } else if (strcmp (key, GF_CONTENT_KEY)) { - /* No need to aggregate 'CONTENT' data */ - ret = dict_set (dst, key, value); - if (ret) - gf_log ("stripe", GF_LOG_WARNING, "xattr dict set failed"); - } - -out: - return; -} - - -void -stripe_aggregate_xattr (dict_t *dst, dict_t *src) -{ - if ((dst == NULL) || (src == NULL)) { - goto out; - } - - dict_foreach (src, stripe_aggregate, dst); -out: - return; -} - - int32_t stripe_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, dict_t *dict, struct iatt *postparent) + struct iatt *buf, dict_t *xdata, struct iatt *postparent) { - int32_t callcnt = 0; - stripe_local_t *local = NULL; - call_frame_t *prev = NULL; + int32_t callcnt = 0; + stripe_local_t *local = NULL; + call_frame_t *prev = NULL; + int ret = 0; if (!this || !frame || !frame->local || !cookie) { gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); @@ -332,30 +202,42 @@ stripe_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (op_ret >= 0) { local->op_ret = 0; + if (IA_ISREG (buf->ia_type)) { + ret = stripe_ctx_handle (this, prev, local, + xdata); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "Error getting fctx info from" + " dict"); + } if (FIRST_CHILD(this) == prev->this) { local->stbuf = *buf; local->postparent = *postparent; local->inode = inode_ref (inode); - local->dict = dict_ref (dict); + if (xdata) + local->xdata = dict_ref (xdata); if (local->xattr) { - stripe_aggregate_xattr (local->dict, + stripe_aggregate_xattr (local->xdata, local->xattr); dict_unref (local->xattr); local->xattr = NULL; } } - if (!local->dict && !local->xattr) { - local->xattr = dict_ref (dict); - } else if (local->dict) { - stripe_aggregate_xattr (local->dict, dict); + + if (!local->xdata && 
!local->xattr) { + local->xattr = dict_ref (xdata); + } else if (local->xdata) { + stripe_aggregate_xattr (local->xdata, xdata); } else if (local->xattr) { - stripe_aggregate_xattr (local->xattr, dict); + stripe_aggregate_xattr (local->xattr, xdata); } local->stbuf_blocks += buf->ia_blocks; local->postparent_blocks += postparent->ia_blocks; + correct_file_size(buf, local->fctx, prev); + if (local->stbuf_size < buf->ia_size) local->stbuf_size = buf->ia_size; if (local->postparent_size < postparent->ia_size) @@ -387,11 +269,13 @@ stripe_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->stbuf.ia_size = local->stbuf_size; local->postparent.ia_blocks = local->postparent_blocks; local->postparent.ia_size = local->postparent_size; + inode_ctx_put (local->inode, this, + (uint64_t) (long)local->fctx); } STRIPE_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, local->inode, - &local->stbuf, local->dict, + &local->stbuf, local->xdata, &local->postparent); } out: @@ -400,14 +284,15 @@ out: int32_t stripe_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, - dict_t *xattr_req) + dict_t *xdata) { - stripe_local_t *local = NULL; - xlator_list_t *trav = NULL; - stripe_private_t *priv = NULL; + stripe_local_t *local = NULL; + xlator_list_t *trav = NULL; + stripe_private_t *priv = NULL; int32_t op_errno = EINVAL; int64_t filesize = 0; - int ret = 0; + int ret = 0; + uint64_t tmpctx = 0; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); @@ -419,8 +304,7 @@ stripe_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, trav = this->children; /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; @@ -429,10 +313,37 @@ stripe_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, frame->local = local; loc_copy (&local->loc, loc); - if (xattr_req && dict_get (xattr_req, GF_CONTENT_KEY)) { - ret = dict_get_int64 
(xattr_req, GF_CONTENT_KEY, &filesize); + inode_ctx_get (local->inode, this, &tmpctx); + if (tmpctx) + local->fctx = (stripe_fd_ctx_t*) (long)tmpctx; + + /* quick-read friendly changes */ + if (xdata && dict_get (xdata, GF_CONTENT_KEY)) { + ret = dict_get_int64 (xdata, GF_CONTENT_KEY, &filesize); if (!ret && (filesize > priv->block_size)) - dict_del (xattr_req, GF_CONTENT_KEY); + dict_del (xdata, GF_CONTENT_KEY); + } + + /* get stripe-size xattr on lookup. This would be required for + * open/read/write/pathinfo calls. Hence we send down the request + * even when type == IA_INVAL */ + + /* + * We aren't guaranteed to have xdata here. We need the format info for + * the file, so allocate xdata if necessary. + */ + if (!xdata) + xdata = dict_new(); + else + xdata = dict_ref(xdata); + + if (xdata && (IA_ISREG (loc->inode->ia_type) || + (loc->inode->ia_type == IA_INVAL))) { + ret = stripe_xattr_request_build (this, xdata, 8, 4, 4, 0); + if (ret) + gf_log (this->name , GF_LOG_ERROR, "Failed to build" + " xattr request for %s", loc->path); + } /* Everytime in stripe lookup, all child nodes @@ -440,11 +351,12 @@ stripe_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, local->call_count = priv->child_count; while (trav) { STACK_WIND (frame, stripe_lookup_cbk, trav->xlator, - trav->xlator->fops->lookup, - loc, xattr_req); + trav->xlator->fops->lookup, loc, xdata); trav = trav->next; } + dict_unref(xdata); + return 0; err: STRIPE_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); @@ -454,7 +366,7 @@ err: int32_t stripe_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf) + int32_t op_ret, int32_t op_errno, struct iatt *buf, dict_t *xdata) { int32_t callcnt = 0; stripe_local_t *local = NULL; @@ -489,6 +401,9 @@ stripe_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } local->stbuf_blocks += buf->ia_blocks; + + correct_file_size(buf, local->fctx, prev); + if (local->stbuf_size < 
buf->ia_size) local->stbuf_size = buf->ia_size; } @@ -505,18 +420,19 @@ stripe_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } STRIPE_STACK_UNWIND (stat, frame, local->op_ret, - local->op_errno, &local->stbuf); + local->op_errno, &local->stbuf, NULL); } out: return 0; } int32_t -stripe_stat (call_frame_t *frame, xlator_t *this, loc_t *loc) +stripe_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { xlator_list_t *trav = NULL; stripe_local_t *local = NULL; stripe_private_t *priv = NULL; + stripe_fd_ctx_t *fctx = NULL; int32_t op_errno = EINVAL; VALIDATE_OR_GOTO (frame, err); @@ -534,8 +450,7 @@ stripe_stat (call_frame_t *frame, xlator_t *this, loc_t *loc) } /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; @@ -544,23 +459,30 @@ stripe_stat (call_frame_t *frame, xlator_t *this, loc_t *loc) frame->local = local; local->call_count = priv->child_count; + if (IA_ISREG(loc->inode->ia_type)) { + inode_ctx_get(loc->inode, this, (uint64_t *) &fctx); + if (!fctx) + goto err; + local->fctx = fctx; + } + while (trav) { STACK_WIND (frame, stripe_stat_cbk, trav->xlator, - trav->xlator->fops->stat, loc); + trav->xlator->fops->stat, loc, NULL); trav = trav->next; } return 0; err: - STRIPE_STACK_UNWIND (stat, frame, -1, op_errno, NULL); + STRIPE_STACK_UNWIND (stat, frame, -1, op_errno, NULL, NULL); return 0; } int32_t stripe_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct statvfs *stbuf) + int32_t op_ret, int32_t op_errno, struct statvfs *stbuf, dict_t *xdata) { stripe_local_t *local = NULL; int32_t callcnt = 0; @@ -598,14 +520,14 @@ stripe_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (!callcnt) { STRIPE_STACK_UNWIND (statfs, frame, local->op_ret, - local->op_errno, &local->statvfs_buf); + local->op_errno, &local->statvfs_buf, NULL); } 
out: return 0; } int32_t -stripe_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc) +stripe_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { stripe_local_t *local = NULL; xlator_list_t *trav = NULL; @@ -620,8 +542,7 @@ stripe_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc) priv = this->private; /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; @@ -633,13 +554,13 @@ stripe_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc) local->call_count = priv->child_count; while (trav) { STACK_WIND (frame, stripe_statfs_cbk, trav->xlator, - trav->xlator->fops->statfs, loc); + trav->xlator->fops->statfs, loc, NULL); trav = trav->next; } return 0; err: - STRIPE_STACK_UNWIND (statfs, frame, -1, op_errno, NULL); + STRIPE_STACK_UNWIND (statfs, frame, -1, op_errno, NULL, NULL); return 0; } @@ -648,7 +569,7 @@ err: int32_t stripe_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) + struct iatt *postbuf, dict_t *xdata) { int32_t callcnt = 0; stripe_local_t *local = NULL; @@ -686,6 +607,9 @@ stripe_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->prebuf_blocks += prebuf->ia_blocks; local->postbuf_blocks += postbuf->ia_blocks; + correct_file_size(prebuf, local->fctx, prev); + correct_file_size(postbuf, local->fctx, prev); + if (local->prebuf_size < prebuf->ia_size) local->prebuf_size = prebuf->ia_size; @@ -708,19 +632,21 @@ stripe_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, STRIPE_STACK_UNWIND (truncate, frame, local->op_ret, local->op_errno, &local->pre_buf, - &local->post_buf); + &local->post_buf, NULL); } out: return 0; } int32_t -stripe_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset) +stripe_truncate (call_frame_t *frame, xlator_t *this, loc_t 
*loc, off_t offset, dict_t *xdata) { - xlator_list_t *trav = NULL; stripe_local_t *local = NULL; stripe_private_t *priv = NULL; + stripe_fd_ctx_t *fctx = NULL; int32_t op_errno = EINVAL; + int i, eof_idx; + off_t dest_offset, tmp_offset; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); @@ -729,7 +655,6 @@ stripe_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset) VALIDATE_OR_GOTO (loc->inode, err); priv = this->private; - trav = this->children; if (priv->first_child_down) { op_errno = ENOTCONN; @@ -737,8 +662,7 @@ stripe_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset) } /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; @@ -747,15 +671,55 @@ stripe_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset) frame->local = local; local->call_count = priv->child_count; - while (trav) { - STACK_WIND (frame, stripe_truncate_cbk, trav->xlator, - trav->xlator->fops->truncate, loc, offset); - trav = trav->next; - } + inode_ctx_get(loc->inode, this, (uint64_t *) &fctx); + if (!fctx) { + gf_log(this->name, GF_LOG_ERROR, "no stripe context"); + op_errno = EINVAL; + goto err; + } + + local->fctx = fctx; + eof_idx = (offset / fctx->stripe_size) % fctx->stripe_count; + + for (i = 0; i < fctx->stripe_count; i++) { + if (!fctx->xl_array[i]) { + gf_log(this->name, GF_LOG_ERROR, + "no xlator at index %d", i); + op_errno = EINVAL; + goto err; + } + + if (fctx->stripe_coalesce) { + /* + * The node that owns EOF is truncated to the exact + * coalesced offset. Nodes prior to this index should + * be rounded up to the size of the complete stripe, + * while nodes after this index should be rounded down + * to the size of the previous stripe. 
+ */ + if (i < eof_idx) + tmp_offset = roof(offset, fctx->stripe_size * + fctx->stripe_count); + else if (i > eof_idx) + tmp_offset = floor(offset, fctx->stripe_size * + fctx->stripe_count); + else + tmp_offset = offset; + + dest_offset = coalesced_offset(tmp_offset, + fctx->stripe_size, fctx->stripe_count); + } else { + dest_offset = offset; + } + + STACK_WIND(frame, stripe_truncate_cbk, fctx->xl_array[i], + fctx->xl_array[i]->fops->truncate, loc, dest_offset, + NULL); + } return 0; err: - STRIPE_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL); + STRIPE_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL, NULL); return 0; } @@ -763,7 +727,7 @@ err: int32_t stripe_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop) + struct iatt *preop, struct iatt *postop, dict_t *xdata) { int32_t callcnt = 0; stripe_local_t *local = NULL; @@ -802,6 +766,9 @@ stripe_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->prebuf_blocks += preop->ia_blocks; local->postbuf_blocks += postop->ia_blocks; + correct_file_size(preop, local->fctx, prev); + correct_file_size(postop, local->fctx, prev); + if (local->prebuf_size < preop->ia_size) local->prebuf_size = preop->ia_size; if (local->postbuf_size < postop->ia_size) @@ -823,7 +790,7 @@ stripe_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, STRIPE_STACK_UNWIND (setattr, frame, local->op_ret, local->op_errno, &local->pre_buf, - &local->post_buf); + &local->post_buf, NULL); } out: return 0; @@ -832,11 +799,12 @@ out: int32_t stripe_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - struct iatt *stbuf, int32_t valid) + struct iatt *stbuf, int32_t valid, dict_t *xdata) { xlator_list_t *trav = NULL; stripe_local_t *local = NULL; stripe_private_t *priv = NULL; + stripe_fd_ctx_t *fctx = NULL; int32_t op_errno = EINVAL; VALIDATE_OR_GOTO (frame, err); @@ -854,33 +822,47 @@ stripe_setattr (call_frame_t 
*frame, xlator_t *this, loc_t *loc, } /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; } local->op_ret = -1; frame->local = local; - local->call_count = priv->child_count; + if (!IA_ISDIR (loc->inode->ia_type) && + !IA_ISREG (loc->inode->ia_type)) { + local->call_count = 1; + STACK_WIND (frame, stripe_setattr_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->setattr, + loc, stbuf, valid, NULL); + return 0; + } + + if (IA_ISREG(loc->inode->ia_type)) { + inode_ctx_get(loc->inode, this, (uint64_t *) &fctx); + if (!fctx) + goto err; + local->fctx = fctx; + } + local->call_count = priv->child_count; while (trav) { STACK_WIND (frame, stripe_setattr_cbk, trav->xlator, trav->xlator->fops->setattr, - loc, stbuf, valid); + loc, stbuf, valid, NULL); trav = trav->next; } return 0; err: - STRIPE_STACK_UNWIND (setattr, frame, -1, op_errno, NULL, NULL); + STRIPE_STACK_UNWIND (setattr, frame, -1, op_errno, NULL, NULL, NULL); return 0; } int32_t stripe_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, - struct iatt *stbuf, int32_t valid) + struct iatt *stbuf, int32_t valid, dict_t *xdata) { stripe_local_t *local = NULL; stripe_private_t *priv = NULL; @@ -896,8 +878,7 @@ stripe_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, trav = this->children; /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; @@ -908,13 +889,13 @@ stripe_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, while (trav) { STACK_WIND (frame, stripe_setattr_cbk, trav->xlator, - trav->xlator->fops->fsetattr, fd, stbuf, valid); + trav->xlator->fops->fsetattr, fd, stbuf, valid, NULL); trav = trav->next; } return 0; err: - STRIPE_STACK_UNWIND (fsetattr, frame, -1, op_errno, NULL, NULL); + STRIPE_STACK_UNWIND (fsetattr, frame, -1, 
op_errno, NULL, NULL, NULL); return 0; } @@ -922,7 +903,8 @@ int32_t stripe_stack_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *buf, struct iatt *preoldparent, struct iatt *postoldparent, - struct iatt *prenewparent, struct iatt *postnewparent) + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) { int32_t callcnt = 0; stripe_local_t *local = NULL; @@ -959,6 +941,8 @@ stripe_stack_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->pre_buf.ia_blocks += prenewparent->ia_blocks; local->post_buf.ia_blocks += postnewparent->ia_blocks; + correct_file_size(buf, local->fctx, prev); + if (local->stbuf.ia_size < buf->ia_size) local->stbuf.ia_size = buf->ia_size; @@ -984,7 +968,7 @@ stripe_stack_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, STRIPE_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno, &local->stbuf, &local->preparent, &local->postparent, &local->pre_buf, - &local->post_buf); + &local->post_buf, NULL); } out: return 0; @@ -994,7 +978,8 @@ int32_t stripe_first_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *buf, struct iatt *preoldparent, struct iatt *postoldparent, - struct iatt *prenewparent, struct iatt *postnewparent) + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) { stripe_local_t *local = NULL; xlator_list_t *trav = NULL; @@ -1025,24 +1010,25 @@ stripe_first_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, while (trav) { STACK_WIND (frame, stripe_stack_rename_cbk, trav->xlator, trav->xlator->fops->rename, - &local->loc, &local->loc2); + &local->loc, &local->loc2, NULL); trav = trav->next; } return 0; unwind: STRIPE_STACK_UNWIND (rename, frame, -1, op_errno, buf, preoldparent, - postoldparent, prenewparent, postnewparent); + postoldparent, prenewparent, postnewparent, NULL); return 0; } int32_t stripe_rename (call_frame_t *frame, 
xlator_t *this, loc_t *oldloc, - loc_t *newloc) + loc_t *newloc, dict_t *xdata) { stripe_private_t *priv = NULL; stripe_local_t *local = NULL; xlator_list_t *trav = NULL; + stripe_fd_ctx_t *fctx = NULL; int32_t op_errno = EINVAL; VALIDATE_OR_GOTO (frame, err); @@ -1062,8 +1048,7 @@ stripe_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, } /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; @@ -1074,21 +1059,28 @@ stripe_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, local->call_count = priv->child_count; + if (IA_ISREG(oldloc->inode->ia_type)) { + inode_ctx_get(oldloc->inode, this, (uint64_t *) &fctx); + if (!fctx) + goto err; + local->fctx = fctx; + } + frame->local = local; STACK_WIND (frame, stripe_first_rename_cbk, trav->xlator, - trav->xlator->fops->rename, oldloc, newloc); + trav->xlator->fops->rename, oldloc, newloc, NULL); return 0; err: STRIPE_STACK_UNWIND (rename, frame, -1, op_errno, NULL, NULL, NULL, - NULL, NULL); + NULL, NULL, NULL); return 0; } int32_t stripe_first_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *preparent, - struct iatt *postparent) + int32_t op_ret, int32_t op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { stripe_local_t *local = NULL; call_frame_t *prev = NULL; @@ -1113,10 +1105,10 @@ stripe_first_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->postparent_blocks += postparent->ia_blocks; STRIPE_STACK_UNWIND(unlink, frame, local->op_ret, local->op_errno, - &local->preparent, &local->postparent); + &local->preparent, &local->postparent, xdata); return 0; out: - STRIPE_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL); + STRIPE_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL, NULL); return 0; } @@ -1127,7 +1119,7 @@ out: int32_t stripe_unlink_cbk 
(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { int32_t callcnt = 0; stripe_local_t *local = NULL; @@ -1163,17 +1155,19 @@ stripe_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto out; } STACK_WIND(frame, stripe_first_unlink_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->unlink, &local->loc); + FIRST_CHILD (this)->fops->unlink, &local->loc, + local->xflag, local->xdata); } return 0; out: - STRIPE_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL); + STRIPE_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL, NULL); return 0; } int32_t -stripe_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc) +stripe_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, + int xflag, dict_t *xdata) { xlator_list_t *trav = NULL; stripe_local_t *local = NULL; @@ -1201,14 +1195,18 @@ stripe_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc) } /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; } local->op_ret = -1; loc_copy (&local->loc, loc); + local->xflag = xflag; + + if (xdata) + local->xdata = dict_ref (xdata); + frame->local = local; local->call_count = priv->child_count; trav = trav->next; /* Skip the first child */ @@ -1216,13 +1214,13 @@ stripe_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc) while (trav) { STACK_WIND (frame, stripe_unlink_cbk, trav->xlator, trav->xlator->fops->unlink, - loc); + loc, xflag, xdata); trav = trav->next; } return 0; err: - STRIPE_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL); + STRIPE_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL, NULL); return 0; } @@ -1230,8 +1228,7 @@ err: int32_t stripe_first_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno,struct iatt *preparent, - 
struct iatt *postparent) - + struct iatt *postparent, dict_t *xdata) { stripe_local_t *local = NULL; @@ -1258,10 +1255,10 @@ stripe_first_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->postparent_blocks += postparent->ia_blocks; STRIPE_STACK_UNWIND (rmdir, frame, local->op_ret, local->op_errno, - &local->preparent, &local->postparent); + &local->preparent, &local->postparent, xdata); return 0; err: - STRIPE_STACK_UNWIND (rmdir, frame, op_ret, op_errno, NULL, NULL); + STRIPE_STACK_UNWIND (rmdir, frame, op_ret, op_errno, NULL, NULL, NULL); return 0; } @@ -1269,7 +1266,7 @@ err: int32_t stripe_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { int32_t callcnt = 0; stripe_local_t *local = NULL; @@ -1301,16 +1298,16 @@ stripe_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto out; STACK_WIND (frame, stripe_first_rmdir_cbk, FIRST_CHILD (this), FIRST_CHILD (this)->fops->rmdir, &local->loc, - local->flags); + local->flags, NULL); } return 0; out: - STRIPE_STACK_UNWIND (rmdir, frame, -1, op_errno, NULL, NULL); + STRIPE_STACK_UNWIND (rmdir, frame, -1, op_errno, NULL, NULL, NULL); return 0; } int32_t -stripe_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags) +stripe_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, dict_t *xdata) { xlator_list_t *trav = NULL; stripe_local_t *local = NULL; @@ -1333,8 +1330,7 @@ stripe_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags) } /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; @@ -1348,13 +1344,13 @@ stripe_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags) while (trav) { STACK_WIND (frame, stripe_rmdir_cbk, trav->xlator, - trav->xlator->fops->rmdir, loc, flags); + 
trav->xlator->fops->rmdir, loc, flags, NULL); trav = trav->next; } return 0; err: - STRIPE_STACK_UNWIND (rmdir, frame, -1, op_errno, NULL, NULL); + STRIPE_STACK_UNWIND (rmdir, frame, -1, op_errno, NULL, NULL, NULL); return 0; } @@ -1363,7 +1359,7 @@ int32_t stripe_mknod_ifreg_fail_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { int32_t callcnt = 0; stripe_local_t *local = NULL; @@ -1384,7 +1380,7 @@ stripe_mknod_ifreg_fail_unlink_cbk (call_frame_t *frame, void *cookie, if (!callcnt) { STRIPE_STACK_UNWIND (mknod, frame, local->op_ret, local->op_errno, local->inode, &local->stbuf, - &local->preparent, &local->postparent); + &local->preparent, &local->postparent, NULL); } out: return 0; @@ -1396,7 +1392,7 @@ out: int32_t stripe_mknod_ifreg_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno) + int32_t op_errno, dict_t *xdata) { int32_t callcnt = 0; stripe_local_t *local = NULL; @@ -1435,7 +1431,7 @@ stripe_mknod_ifreg_setxattr_cbk (call_frame_t *frame, void *cookie, stripe_mknod_ifreg_fail_unlink_cbk, trav->xlator, trav->xlator->fops->unlink, - &local->loc); + &local->loc, 0, NULL); trav = trav->next; } return 0; @@ -1443,7 +1439,7 @@ stripe_mknod_ifreg_setxattr_cbk (call_frame_t *frame, void *cookie, STRIPE_STACK_UNWIND (mknod, frame, local->op_ret, local->op_errno, local->inode, &local->stbuf, - &local->preparent, &local->postparent); + &local->preparent, &local->postparent, NULL); } out: return 0; @@ -1453,7 +1449,7 @@ int32_t stripe_mknod_ifreg_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { int32_t callcnt = 0; stripe_local_t *local = NULL; @@ -1491,10 +1487,16 @@ stripe_mknod_ifreg_cbk (call_frame_t 
*frame, void *cookie, xlator_t *this, if (uuid_is_null (local->ia_gfid)) uuid_copy (local->ia_gfid, buf->ia_gfid); + if (stripe_ctx_handle(this, prev, local, xdata)) + gf_log(this->name, GF_LOG_ERROR, + "Error getting fctx info from dict"); + local->stbuf_blocks += buf->ia_blocks; local->preparent_blocks += preparent->ia_blocks; local->postparent_blocks += postparent->ia_blocks; + correct_file_size(buf, local->fctx, prev); + if (local->stbuf_size < buf->ia_size) local->stbuf_size = buf->ia_size; if (local->preparent_size < preparent->ia_size) @@ -1519,7 +1521,7 @@ stripe_mknod_ifreg_cbk (call_frame_t *frame, void *cookie, xlator_t *this, stripe_mknod_ifreg_fail_unlink_cbk, trav->xlator, trav->xlator->fops->unlink, - &local->loc); + &local->loc, 0, NULL); trav = trav->next; } return 0; @@ -1533,13 +1535,13 @@ stripe_mknod_ifreg_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->postparent.ia_size = local->postparent_size; local->stbuf.ia_size = local->stbuf_size; local->stbuf.ia_blocks = local->stbuf_blocks; - } + inode_ctx_put (local->inode, this, + (uint64_t)(long) local->fctx); - /* Create itself has failed.. 
so return - without setxattring */ + } STRIPE_STACK_UNWIND (mknod, frame, local->op_ret, local->op_errno, local->inode, &local->stbuf, - &local->preparent, &local->postparent); + &local->preparent, &local->postparent, NULL); } out: return 0; @@ -1550,16 +1552,13 @@ int32_t stripe_mknod_first_ifreg_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { stripe_local_t *local = NULL; stripe_private_t *priv = NULL; call_frame_t *prev = NULL; xlator_list_t *trav = NULL; int i = 1; - char size_key[256] = {0,}; - char index_key[256] = {0,}; - char count_key[256] = {0,}; dict_t *dict = NULL; int ret = 0; int need_unref = 0; @@ -1599,10 +1598,6 @@ stripe_mknod_first_ifreg_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->stbuf.ia_size = local->stbuf_size; local->stbuf.ia_blocks = local->stbuf_blocks; - sprintf (size_key, "trusted.%s.stripe-size", this->name); - sprintf (count_key, "trusted.%s.stripe-count", this->name); - sprintf (index_key, "trusted.%s.stripe-index", this->name); - trav = trav->next; while (trav) { if (priv->xattr_supported) { @@ -1615,26 +1610,21 @@ stripe_mknod_first_ifreg_cbk (call_frame_t *frame, void *cookie, xlator_t *this, dict_copy (local->xattr, dict); - ret = dict_set_int64 (dict, size_key, local->stripe_size); + ret = stripe_xattr_request_build (this, dict, + local->stripe_size, + priv->child_count, i, + priv->coalesce); if (ret) gf_log (this->name, GF_LOG_ERROR, - "%s: set stripe-size failed", local->loc.path); - ret = dict_set_int32 (dict, count_key, priv->child_count); + "Failed to build xattr request"); - if (ret) - gf_log (this->name, GF_LOG_ERROR, - "%s: set child_count failed", local->loc.path); - ret = dict_set_int32 (dict, index_key, i); - if (ret) - gf_log (this->name, GF_LOG_ERROR, - "%s: set stripe-index failed", local->loc.path); } else { dict = local->xattr; 
} STACK_WIND (frame, stripe_mknod_ifreg_cbk, trav->xlator, trav->xlator->fops->mknod, - &local->loc, local->mode, local->rdev, dict); + &local->loc, local->mode, local->rdev, 0, dict); trav = trav->next; i++; @@ -1646,7 +1636,7 @@ stripe_mknod_first_ifreg_cbk (call_frame_t *frame, void *cookie, xlator_t *this, out: - STRIPE_STACK_UNWIND (mknod, frame, op_ret, op_errno, NULL, NULL, NULL, NULL); + STRIPE_STACK_UNWIND (mknod, frame, op_ret, op_errno, NULL, NULL, NULL, NULL, NULL); return 0; } @@ -1655,25 +1645,22 @@ int32_t stripe_single_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { STRIPE_STACK_UNWIND (mknod, frame, op_ret, op_errno, inode, buf, - preparent, postparent); + preparent, postparent, xdata); return 0; } int stripe_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, - dev_t rdev, dict_t *params) + dev_t rdev, mode_t umask, dict_t *xdata) { stripe_private_t *priv = NULL; stripe_local_t *local = NULL; int32_t op_errno = EINVAL; int32_t i = 0; - char size_key[256] = {0,}; - char index_key[256] = {0,}; - char count_key[256] = {0,}; dict_t *dict = NULL; int ret = 0; int need_unref = 0; @@ -1703,37 +1690,26 @@ stripe_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, } /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; } local->op_ret = -1; local->op_errno = ENOTCONN; - local->stripe_size = stripe_get_matching_bs (loc->path, - priv->pattern, - priv->block_size); + local->stripe_size = stripe_get_matching_bs (loc->path, priv); frame->local = local; local->inode = inode_ref (loc->inode); loc_copy (&local->loc, loc); - local->xattr = dict_copy_with_ref (params, NULL); + local->xattr = dict_copy_with_ref (xdata, NULL); 
local->mode = mode; + local->umask = umask; local->rdev = rdev; /* Everytime in stripe lookup, all child nodes should be looked up */ local->call_count = priv->child_count; - /* Send a setxattr request to nodes where the - files are created */ - sprintf (size_key, - "trusted.%s.stripe-size", this->name); - sprintf (count_key, - "trusted.%s.stripe-count", this->name); - sprintf (index_key, - "trusted.%s.stripe-index", this->name); - if (priv->xattr_supported) { dict = dict_new (); if (!dict) { @@ -1742,29 +1718,22 @@ stripe_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, } need_unref = 1; - dict_copy (params, dict); + dict_copy (xdata, dict); - ret = dict_set_int64 (dict, size_key, - local->stripe_size); - if (ret) - gf_log (this->name, GF_LOG_ERROR, - "%s: set stripe-size failed", loc->path); - ret = dict_set_int32 (dict, count_key, - priv->child_count); - if (ret) - gf_log (this->name, GF_LOG_ERROR, - "%s: set child_count failed", loc->path); - ret = dict_set_int32 (dict, index_key, i); + ret = stripe_xattr_request_build (this, dict, + local->stripe_size, + priv->child_count, + i, priv->coalesce); if (ret) gf_log (this->name, GF_LOG_ERROR, - "%s: set stripe-index failed", loc->path); + "failed to build xattr request"); } else { - dict = params; + dict = xdata; } STACK_WIND (frame, stripe_mknod_first_ifreg_cbk, FIRST_CHILD (this), FIRST_CHILD (this)->fops->mknod, - loc, mode, rdev, dict); + loc, mode, rdev, umask, dict); if (dict && need_unref) dict_unref (dict); @@ -1773,11 +1742,11 @@ stripe_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, STACK_WIND (frame, stripe_single_mknod_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->mknod, - loc, mode, rdev, params); + loc, mode, rdev, umask, xdata); return 0; err: - STRIPE_STACK_UNWIND (mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL); + STRIPE_STACK_UNWIND (mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); return 0; } @@ -1786,7 +1755,7 @@ int32_t stripe_mkdir_cbk 
(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { int32_t callcnt = 0; stripe_local_t *local = NULL; @@ -1843,7 +1812,7 @@ stripe_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, STRIPE_STACK_UNWIND (mkdir, frame, local->op_ret, local->op_errno, local->inode, &local->stbuf, &local->preparent, - &local->postparent); + &local->postparent, NULL); } out: return 0; @@ -1852,9 +1821,9 @@ out: int32_t stripe_first_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { stripe_local_t *local = NULL; call_frame_t *prev = NULL; @@ -1885,7 +1854,7 @@ stripe_first_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->stbuf = *buf; local->postparent = *postparent; local->preparent = *preparent; - + local->stbuf_blocks += buf->ia_blocks; local->preparent_blocks += preparent->ia_blocks; local->postparent_blocks += postparent->ia_blocks; @@ -1897,13 +1866,13 @@ stripe_first_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, while (trav) { STACK_WIND (frame, stripe_mkdir_cbk, trav->xlator, trav->xlator->fops->mkdir, &local->loc, local->mode, - local->dict); + local->umask, local->xdata); trav = trav->next; } return 0; out: STRIPE_STACK_UNWIND (mkdir, frame, -1, op_errno, NULL, NULL, NULL, - NULL); + NULL, NULL); return 0; @@ -1912,7 +1881,7 @@ out: int stripe_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, - dict_t *params) + mode_t umask, dict_t *xdata) { stripe_private_t *priv = NULL; stripe_local_t *local = NULL; @@ -1934,26 +1903,27 @@ stripe_mkdir (call_frame_t *frame, xlator_t *this, loc_t 
*loc, mode_t mode, } /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; } local->op_ret = -1; local->call_count = priv->child_count; - local->dict = dict_ref (params); - local->mode = mode; + if (xdata) + local->xdata = dict_ref (xdata); + local->mode = mode; + local->umask = umask; loc_copy (&local->loc, loc); frame->local = local; /* Everytime in stripe lookup, all child nodes should be looked up */ STACK_WIND (frame, stripe_first_mkdir_cbk, trav->xlator, - trav->xlator->fops->mkdir, loc, mode, params); + trav->xlator->fops->mkdir, loc, mode, umask, xdata); return 0; err: - STRIPE_STACK_UNWIND (mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL); + STRIPE_STACK_UNWIND (mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); return 0; } @@ -1962,11 +1932,12 @@ int32_t stripe_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { int32_t callcnt = 0; stripe_local_t *local = NULL; call_frame_t *prev = NULL; + stripe_fd_ctx_t *fctx = NULL; if (!this || !frame || !frame->local || !cookie) { gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); @@ -1993,6 +1964,16 @@ stripe_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (op_ret >= 0) { local->op_ret = 0; + if (IA_ISREG(inode->ia_type)) { + inode_ctx_get(inode, this, (uint64_t *) &fctx); + if (!fctx) { + gf_log(this->name, GF_LOG_ERROR, + "failed to get stripe context"); + op_ret = -1; + op_errno = EINVAL; + } + } + if (FIRST_CHILD(this) == prev->this) { local->inode = inode_ref (inode); local->stbuf = *buf; @@ -2003,6 +1984,8 @@ stripe_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->preparent_blocks += preparent->ia_blocks; local->postparent_blocks += postparent->ia_blocks; + 
correct_file_size(buf, fctx, prev); + if (local->stbuf_size < buf->ia_size) local->stbuf_size = buf->ia_size; if (local->preparent_size < preparent->ia_size) @@ -2028,14 +2011,14 @@ stripe_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, STRIPE_STACK_UNWIND (link, frame, local->op_ret, local->op_errno, local->inode, &local->stbuf, &local->preparent, - &local->postparent); + &local->postparent, NULL); } out: return 0; } int32_t -stripe_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc) +stripe_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, dict_t *xdata) { xlator_list_t *trav = NULL; stripe_local_t *local = NULL; @@ -2058,8 +2041,7 @@ stripe_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc) } /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; @@ -2073,13 +2055,13 @@ stripe_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc) while (trav) { STACK_WIND (frame, stripe_link_cbk, trav->xlator, trav->xlator->fops->link, - oldloc, newloc); + oldloc, newloc, NULL); trav = trav->next; } return 0; err: - STRIPE_STACK_UNWIND (link, frame, -1, op_errno, NULL, NULL, NULL, NULL); + STRIPE_STACK_UNWIND (link, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); return 0; } @@ -2087,7 +2069,7 @@ int32_t stripe_create_fail_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { int32_t callcnt = 0; stripe_local_t *local = NULL; @@ -2108,7 +2090,7 @@ stripe_create_fail_unlink_cbk (call_frame_t *frame, void *cookie, if (!callcnt) { STRIPE_STACK_UNWIND (create, frame, local->op_ret, local->op_errno, local->fd, local->inode, &local->stbuf, - &local->preparent, &local->postparent); + &local->preparent, &local->postparent, 
NULL); } out: return 0; @@ -2119,12 +2101,11 @@ int32_t stripe_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode, struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { int32_t callcnt = 0; stripe_local_t *local = NULL; stripe_private_t *priv = NULL; - stripe_fd_ctx_t *fctx = NULL; call_frame_t *prev = NULL; xlator_list_t *trav = NULL; @@ -2150,12 +2131,21 @@ stripe_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } if (op_ret >= 0) { + if (IA_ISREG(buf->ia_type)) { + if (stripe_ctx_handle(this, prev, local, xdata)) + gf_log(this->name, GF_LOG_ERROR, + "Error getting fctx info from " + "dict"); + } + local->op_ret = op_ret; local->stbuf_blocks += buf->ia_blocks; local->preparent_blocks += preparent->ia_blocks; local->postparent_blocks += postparent->ia_blocks; + correct_file_size(buf, local->fctx, prev); + if (local->stbuf_size < buf->ia_size) local->stbuf_size = buf->ia_size; if (local->preparent_size < preparent->ia_size) @@ -2178,7 +2168,7 @@ stripe_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, stripe_create_fail_unlink_cbk, trav->xlator, trav->xlator->fops->unlink, - &local->loc); + &local->loc, 0, NULL); trav = trav->next; } @@ -2193,29 +2183,19 @@ stripe_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->stbuf.ia_size = local->stbuf_size; local->stbuf.ia_blocks = local->stbuf_blocks; - fctx = GF_CALLOC (1, sizeof (stripe_fd_ctx_t), - gf_stripe_mt_stripe_fd_ctx_t); - if (!fctx) { - local->op_ret = -1; - local->op_errno = ENOMEM; - goto unwind; - } - - fctx->stripe_size = local->stripe_size; - fctx->stripe_count = priv->child_count; - fctx->static_array = 1; - fctx->xl_array = priv->xl_array; - fd_ctx_set (local->fd, this, - (uint64_t)(long)fctx); + stripe_copy_xl_array(local->fctx->xl_array, + priv->xl_array, + local->fctx->stripe_count); + inode_ctx_put(local->inode, this, + 
(uint64_t) local->fctx); } - unwind: /* Create itself has failed.. so return without setxattring */ STRIPE_STACK_UNWIND (create, frame, local->op_ret, local->op_errno, local->fd, local->inode, &local->stbuf, - &local->preparent, &local->postparent); + &local->preparent, &local->postparent, NULL); } out: @@ -2228,7 +2208,7 @@ int32_t stripe_first_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode, struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { stripe_local_t *local = NULL; stripe_private_t *priv = NULL; @@ -2239,9 +2219,6 @@ stripe_first_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, loc_t *loc = NULL; int32_t need_unref = 0; int32_t ret = -1; - char size_key[256] = {0,}; - char index_key[256] = {0,}; - char count_key[256] = {0,}; if (!this || !frame || !frame->local || !cookie) { gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); @@ -2288,7 +2265,7 @@ stripe_first_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->call_count = 1; STACK_WIND (frame, stripe_create_fail_unlink_cbk, FIRST_CHILD (this), FIRST_CHILD (this)->fops->unlink, - &local->loc); + &local->loc, 0, NULL); return 0; } @@ -2303,9 +2280,6 @@ stripe_first_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, /* Send a setxattr request to nodes where the files are created */ - sprintf (size_key, "trusted.%s.stripe-size", this->name); - sprintf (count_key, "trusted.%s.stripe-count", this->name); - sprintf (index_key, "trusted.%s.stripe-index", this->name); trav = trav->next; while (trav) { if (priv->xattr_supported) { @@ -2318,27 +2292,20 @@ stripe_first_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, dict_copy (local->xattr, dict); - ret = dict_set_int64 (dict, size_key, - local->stripe_size); - if (ret) - gf_log (this->name, GF_LOG_ERROR, - "%s: set stripe-size failed", loc->path); - ret = dict_set_int32 
(dict, count_key, - priv->child_count); + ret = stripe_xattr_request_build (this, dict, + local->stripe_size, + priv->child_count, + i, priv->coalesce); if (ret) gf_log (this->name, GF_LOG_ERROR, - "%s: set child_count failed", loc->path); - ret = dict_set_int32 (dict, index_key, i); - if (ret) - gf_log (this->name, GF_LOG_ERROR, - "%s: set stripe-index failed", loc->path); + "failed to build xattr request"); } else { dict = local->xattr; } - + STACK_WIND (frame, stripe_create_cbk, trav->xlator, trav->xlator->fops->create, &local->loc, - local->flags, local->mode, local->fd, + local->flags, local->mode, local->umask, local->fd, dict); trav = trav->next; if (need_unref && dict) @@ -2360,7 +2327,7 @@ out: */ int32_t stripe_create (call_frame_t *frame, xlator_t *this, loc_t *loc, - int32_t flags, mode_t mode, fd_t *fd, dict_t *params) + int32_t flags, mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) { stripe_private_t *priv = NULL; stripe_local_t *local = NULL; @@ -2368,9 +2335,6 @@ stripe_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int ret = 0; int need_unref = 0; int i = 0; - char size_key[256] = {0,}; - char index_key[256] = {0,}; - char count_key[256] = {0,}; dict_t *dict = NULL; VALIDATE_OR_GOTO (frame, err); @@ -2392,31 +2356,27 @@ stripe_create (call_frame_t *frame, xlator_t *this, loc_t *loc, } /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; } local->op_ret = -1; local->op_errno = ENOTCONN; - local->stripe_size = stripe_get_matching_bs (loc->path, - priv->pattern, - priv->block_size); + local->stripe_size = stripe_get_matching_bs (loc->path, priv); frame->local = local; local->inode = inode_ref (loc->inode); loc_copy (&local->loc, loc); local->fd = fd_ref (fd); local->flags = flags; local->mode = mode; - local->xattr = dict_copy_with_ref (params, NULL); + local->umask = umask; + if (xdata) + local->xattr = 
dict_ref (xdata); local->call_count = priv->child_count; /* Send a setxattr request to nodes where the files are created */ - sprintf (size_key, "trusted.%s.stripe-size", this->name); - sprintf (count_key, "trusted.%s.stripe-count", this->name); - sprintf (index_key, "trusted.%s.stripe-index", this->name); if (priv->xattr_supported) { dict = dict_new (); @@ -2426,30 +2386,23 @@ stripe_create (call_frame_t *frame, xlator_t *this, loc_t *loc, } need_unref = 1; - dict_copy (params, dict); + dict_copy (xdata, dict); - ret = dict_set_int64 (dict, size_key, - local->stripe_size); + ret = stripe_xattr_request_build (this, dict, + local->stripe_size, + priv->child_count, + i, priv->coalesce); if (ret) gf_log (this->name, GF_LOG_ERROR, - "%s: set stripe-size failed", loc->path); - ret = dict_set_int32 (dict, count_key, - priv->child_count); - if (ret) - gf_log (this->name, GF_LOG_ERROR, - "%s: set child_count failed", loc->path); - ret = dict_set_int32 (dict, index_key, i); - if (ret) - gf_log (this->name, GF_LOG_ERROR, - "%s: set stripe-index failed", loc->path); + "failed to build xattr request"); } else { - dict = params; + dict = xdata; } STACK_WIND (frame, stripe_first_create_cbk, FIRST_CHILD (this), FIRST_CHILD (this)->fops->create, loc, flags, mode, - fd, dict); + umask, fd, dict); if (need_unref && dict) dict_unref (dict); @@ -2458,13 +2411,13 @@ stripe_create (call_frame_t *frame, xlator_t *this, loc_t *loc, return 0; err: STRIPE_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL, - NULL, NULL); + NULL, NULL, xdata); return 0; } int32_t stripe_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd) + int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) { int32_t callcnt = 0; stripe_local_t *local = NULL; @@ -2502,224 +2455,25 @@ stripe_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (local->failed) local->op_ret = -1; - if (local->op_ret == -1) { - if (local->fctx) { - if 
(!local->fctx->static_array) - GF_FREE (local->fctx->xl_array); - GF_FREE (local->fctx); - } - } else { - fd_ctx_set (local->fd, this, - (uint64_t)(long)local->fctx); - } - STRIPE_STACK_UNWIND (open, frame, local->op_ret, - local->op_errno, local->fd); + local->op_errno, local->fd, xdata); } out: return 0; } -int32_t -stripe_open_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, dict_t *dict, struct iatt *postparent) -{ - int32_t index = 0; - int32_t callcnt = 0; - char key[256] = {0,}; - stripe_local_t *local = NULL; - xlator_list_t *trav = NULL; - stripe_private_t *priv = NULL; - data_t *data = NULL; - call_frame_t *prev = NULL; - - if (!this || !frame || !frame->local || !cookie) { - gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); - goto out; - } - - prev = (call_frame_t *)cookie; - priv = this->private; - local = frame->local; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "%s returned error %s", - prev->this->name, strerror (op_errno)); - local->op_ret = -1; - if (local->op_errno != EIO) - local->op_errno = op_errno; - if ((op_errno != ENOENT) || - (prev->this == FIRST_CHILD (this))) - local->failed = 1; - goto unlock; - } - - if (!dict) - goto unlock; - - if (!local->fctx) { - local->fctx = GF_CALLOC (1, sizeof (stripe_fd_ctx_t), - gf_stripe_mt_stripe_fd_ctx_t); - if (!local->fctx) { - local->op_errno = ENOMEM; - local->op_ret = -1; - goto unlock; - } - - local->fctx->static_array = 0; - } - /* Stripe block size */ - sprintf (key, "trusted.%s.stripe-size", this->name); - data = dict_get (dict, key); - if (!data) { - local->xattr_self_heal_needed = 1; - } else { - if (!local->fctx->stripe_size) { - local->fctx->stripe_size = - data_to_int64 (data); - } - - if (local->fctx->stripe_size != data_to_int64 (data)) { - gf_log (this->name, GF_LOG_WARNING, - "stripe-size mismatch in blocks"); - 
local->xattr_self_heal_needed = 1; - } - } - /* Stripe count */ - sprintf (key, "trusted.%s.stripe-count", this->name); - data = dict_get (dict, key); - if (!data) { - local->xattr_self_heal_needed = 1; - goto unlock; - } - if (!local->fctx->xl_array) { - local->fctx->stripe_count = data_to_int32 (data); - if (!local->fctx->stripe_count) { - gf_log (this->name, GF_LOG_ERROR, - "error with stripe-count xattr"); - local->op_ret = -1; - local->op_errno = EIO; - goto unlock; - } - - local->fctx->xl_array = - GF_CALLOC (local->fctx->stripe_count, - sizeof (xlator_t *), - gf_stripe_mt_xlator_t); - if (!local->fctx->xl_array) { - local->op_errno = ENOMEM; - local->op_ret = -1; - goto unlock; - } - } - if (local->fctx->stripe_count != data_to_int32 (data)) { - gf_log (this->name, GF_LOG_ERROR, - "error with stripe-count xattr (%d != %d)", - local->fctx->stripe_count, data_to_int32 (data)); - local->op_ret = -1; - local->op_errno = EIO; - goto unlock; - } - - /* index */ - sprintf (key, "trusted.%s.stripe-index", this->name); - data = dict_get (dict, key); - if (!data) { - local->xattr_self_heal_needed = 1; - goto unlock; - } - index = data_to_int32 (data); - if (index > priv->child_count) { - gf_log (this->name, GF_LOG_ERROR, - "error with stripe-index xattr (%d)", index); - local->op_ret = -1; - local->op_errno = EIO; - goto unlock; - } - if (local->fctx->xl_array) { - if (local->fctx->xl_array[index]) { - gf_log (this->name, GF_LOG_ERROR, - "duplicate entry @ index (%d)", index); - local->op_ret = -1; - local->op_errno = EIO; - goto unlock; - } - local->fctx->xl_array[index] = prev->this; - } - local->entry_count++; - local->op_ret = 0; - } -unlock: - UNLOCK (&frame->lock); - - if (!callcnt) { - /* TODO: if self-heal flag is set, do it */ - if (local->xattr_self_heal_needed) { - gf_log (this->name, GF_LOG_DEBUG, - "%s: stripe info need to be healed", - local->loc.path); - } - - if (local->failed) - local->op_ret = -1; - - if (local->op_ret) - goto err; - - if 
(local->entry_count != local->fctx->stripe_count) { - gf_log (this->name, GF_LOG_ERROR, - "entry-count (%d) != stripe-count (%d)", - local->entry_count, local->fctx->stripe_count); - local->op_ret = -1; - local->op_errno = EIO; - goto err; - } - if (!local->fctx->stripe_size) { - gf_log (this->name, GF_LOG_ERROR, "stripe size not set"); - local->op_ret = -1; - local->op_errno = EIO; - goto err; - } - - local->call_count = local->fctx->stripe_count; - - trav = this->children; - while (trav) { - STACK_WIND (frame, stripe_open_cbk, trav->xlator, - trav->xlator->fops->open, &local->loc, - local->flags, local->fd, 0); - trav = trav->next; - } - } - - return 0; -err: - STRIPE_STACK_UNWIND (open, frame, local->op_ret, local->op_errno, - local->fd); -out: - return 0; -} /** * stripe_open - */ int32_t stripe_open (call_frame_t *frame, xlator_t *this, loc_t *loc, - int32_t flags, fd_t *fd, int32_t wbflags) + int32_t flags, fd_t *fd, dict_t *xdata) { stripe_local_t *local = NULL; stripe_private_t *priv = NULL; xlator_list_t *trav = NULL; int32_t op_errno = 1; - dict_t *dict = NULL; - int ret = 0; - char key[256] = {0,}; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); @@ -2736,8 +2490,7 @@ stripe_open (call_frame_t *frame, xlator_t *this, loc_t *loc, } /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; @@ -2753,73 +2506,25 @@ stripe_open (call_frame_t *frame, xlator_t *this, loc_t *loc, /* Striped files */ local->flags = flags; local->call_count = priv->child_count; - local->stripe_size = stripe_get_matching_bs (loc->path, - priv->pattern, - priv->block_size); - - if (priv->xattr_supported) { - dict = dict_new (); - if (!dict) - goto err; - - sprintf (key, "trusted.%s.stripe-size", this->name); - ret = dict_set_int64 (dict, key, 8); - if (ret) - gf_log (this->name, GF_LOG_WARNING, - "failed to set %s in xattr_req dict", key); - - 
sprintf (key, "trusted.%s.stripe-count", this->name); - ret = dict_set_int32 (dict, key, 4); - if (ret) - gf_log (this->name, GF_LOG_WARNING, - "failed to set %s in xattr_req dict", key); - - sprintf (key, "trusted.%s.stripe-index", this->name); - ret = dict_set_int32 (dict, key, 4); - if (ret) - gf_log (this->name, GF_LOG_WARNING, - "failed to set %s in xattr_req dict", key); - - while (trav) { - STACK_WIND (frame, stripe_open_lookup_cbk, - trav->xlator, trav->xlator->fops->lookup, - loc, dict); - trav = trav->next; - } - if (dict) - dict_unref (dict); - - return 0; - } - local->fctx = GF_CALLOC (1, sizeof (stripe_fd_ctx_t), - gf_stripe_mt_stripe_fd_ctx_t); - if (!local->fctx) { - op_errno = ENOMEM; - goto err; - } - - local->fctx->static_array = 1; - local->fctx->stripe_size = local->stripe_size; - local->fctx->stripe_count = priv->child_count; - local->fctx->xl_array = priv->xl_array; + local->stripe_size = stripe_get_matching_bs (loc->path, priv); while (trav) { STACK_WIND (frame, stripe_open_cbk, trav->xlator, trav->xlator->fops->open, &local->loc, local->flags, local->fd, - wbflags); + xdata); trav = trav->next; } return 0; err: - STRIPE_STACK_UNWIND (open, frame, -1, op_errno, NULL); + STRIPE_STACK_UNWIND (open, frame, -1, op_errno, NULL, NULL); return 0; } int32_t stripe_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd) + int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) { int32_t callcnt = 0; stripe_local_t *local = NULL; @@ -2852,7 +2557,7 @@ stripe_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (!callcnt) { STRIPE_STACK_UNWIND (opendir, frame, local->op_ret, - local->op_errno, local->fd); + local->op_errno, local->fd, NULL); } out: return 0; @@ -2860,7 +2565,7 @@ out: int32_t -stripe_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd) +stripe_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, dict_t *xdata) { xlator_list_t *trav = 
NULL; stripe_local_t *local = NULL; @@ -2882,8 +2587,7 @@ stripe_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd) } /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; @@ -2894,19 +2598,19 @@ stripe_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd) while (trav) { STACK_WIND (frame, stripe_opendir_cbk, trav->xlator, - trav->xlator->fops->opendir, loc, fd); + trav->xlator->fops->opendir, loc, fd, NULL); trav = trav->next; } return 0; err: - STRIPE_STACK_UNWIND (opendir, frame, -1, op_errno, NULL); + STRIPE_STACK_UNWIND (opendir, frame, -1, op_errno, NULL, NULL); return 0; } int32_t stripe_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct gf_flock *lock) + int32_t op_ret, int32_t op_errno, struct gf_flock *lock, dict_t *xdata) { int32_t callcnt = 0; stripe_local_t *local = NULL; @@ -2946,7 +2650,7 @@ stripe_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (local->failed) local->op_ret = -1; STRIPE_STACK_UNWIND (lk, frame, local->op_ret, - local->op_errno, &local->lock); + local->op_errno, &local->lock, NULL); } out: return 0; @@ -2954,7 +2658,7 @@ out: int32_t stripe_lk (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, - struct gf_flock *lock) + struct gf_flock *lock, dict_t *xdata) { stripe_local_t *local = NULL; xlator_list_t *trav = NULL; @@ -2970,8 +2674,7 @@ stripe_lk (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, priv = this->private; /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; @@ -2982,20 +2685,20 @@ stripe_lk (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, while (trav) { STACK_WIND (frame, stripe_lk_cbk, trav->xlator, - trav->xlator->fops->lk, fd, cmd, 
lock); + trav->xlator->fops->lk, fd, cmd, lock, NULL); trav = trav->next; } return 0; err: - STRIPE_STACK_UNWIND (lk, frame, -1, op_errno, NULL); + STRIPE_STACK_UNWIND (lk, frame, -1, op_errno, NULL, NULL); return 0; } int32_t stripe_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { int32_t callcnt = 0; stripe_local_t *local = NULL; @@ -3032,14 +2735,14 @@ stripe_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->op_ret = -1; STRIPE_STACK_UNWIND (flush, frame, local->op_ret, - local->op_errno); + local->op_errno, NULL); } out: return 0; } int32_t -stripe_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) +stripe_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { stripe_local_t *local = NULL; stripe_private_t *priv = NULL; @@ -3059,8 +2762,7 @@ stripe_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) goto err; } /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; @@ -3071,13 +2773,13 @@ stripe_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) while (trav) { STACK_WIND (frame, stripe_flush_cbk, trav->xlator, - trav->xlator->fops->flush, fd); + trav->xlator->fops->flush, fd, NULL); trav = trav->next; } return 0; err: - STRIPE_STACK_UNWIND (flush, frame, -1, op_errno); + STRIPE_STACK_UNWIND (flush, frame, -1, op_errno, NULL); return 0; } @@ -3086,7 +2788,7 @@ err: int32_t stripe_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) + struct iatt *postbuf, dict_t *xdata) { int32_t callcnt = 0; stripe_local_t *local = NULL; @@ -3122,6 +2824,9 @@ stripe_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->prebuf_blocks += prebuf->ia_blocks; local->postbuf_blocks += postbuf->ia_blocks; + 
correct_file_size(prebuf, local->fctx, prev); + correct_file_size(postbuf, local->fctx, prev); + if (local->prebuf_size < prebuf->ia_size) local->prebuf_size = prebuf->ia_size; @@ -3144,18 +2849,19 @@ stripe_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, STRIPE_STACK_UNWIND (fsync, frame, local->op_ret, local->op_errno, &local->pre_buf, - &local->post_buf); + &local->post_buf, NULL); } out: return 0; } int32_t -stripe_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags) +stripe_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, dict_t *xdata) { stripe_local_t *local = NULL; stripe_private_t *priv = NULL; xlator_list_t *trav = NULL; + stripe_fd_ctx_t *fctx = NULL; int32_t op_errno = 1; VALIDATE_OR_GOTO (frame, err); @@ -3167,31 +2873,38 @@ stripe_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags) trav = this->children; /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; } + + inode_ctx_get(fd->inode, this, (uint64_t *) &fctx); + if (!fctx) { + op_errno = EINVAL; + goto err; + } + local->fctx = fctx; + local->op_ret = -1; frame->local = local; local->call_count = priv->child_count; while (trav) { STACK_WIND (frame, stripe_fsync_cbk, trav->xlator, - trav->xlator->fops->fsync, fd, flags); + trav->xlator->fops->fsync, fd, flags, NULL); trav = trav->next; } return 0; err: - STRIPE_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL); + STRIPE_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL, NULL); return 0; } int32_t stripe_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf) + int32_t op_ret, int32_t op_errno, struct iatt *buf, dict_t *xdata) { int32_t callcnt = 0; stripe_local_t *local = NULL; @@ -3226,6 +2939,9 @@ stripe_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->stbuf = *buf; 
local->stbuf_blocks += buf->ia_blocks; + + correct_file_size(buf, local->fctx, prev); + if (local->stbuf_size < buf->ia_size) local->stbuf_size = buf->ia_size; } @@ -3242,7 +2958,7 @@ stripe_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } STRIPE_STACK_UNWIND (fstat, frame, local->op_ret, - local->op_errno, &local->stbuf); + local->op_errno, &local->stbuf, NULL); } out: @@ -3252,11 +2968,12 @@ out: int32_t stripe_fstat (call_frame_t *frame, xlator_t *this, - fd_t *fd) + fd_t *fd, dict_t *xdata) { stripe_local_t *local = NULL; stripe_private_t *priv = NULL; xlator_list_t *trav = NULL; + stripe_fd_ctx_t *fctx = NULL; int32_t op_errno = 1; VALIDATE_OR_GOTO (frame, err); @@ -3268,8 +2985,7 @@ stripe_fstat (call_frame_t *frame, trav = this->children; /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; @@ -3278,26 +2994,35 @@ stripe_fstat (call_frame_t *frame, frame->local = local; local->call_count = priv->child_count; + if (IA_ISREG(fd->inode->ia_type)) { + inode_ctx_get(fd->inode, this, (uint64_t *) &fctx); + if (!fctx) + goto err; + local->fctx = fctx; + } + while (trav) { STACK_WIND (frame, stripe_fstat_cbk, trav->xlator, - trav->xlator->fops->fstat, fd); + trav->xlator->fops->fstat, fd, NULL); trav = trav->next; } return 0; err: - STRIPE_STACK_UNWIND (fstat, frame, -1, op_errno, NULL); + STRIPE_STACK_UNWIND (fstat, frame, -1, op_errno, NULL, NULL); return 0; } int32_t -stripe_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset) +stripe_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, dict_t *xdata) { stripe_local_t *local = NULL; stripe_private_t *priv = NULL; - xlator_list_t *trav = NULL; - int32_t op_errno = 1; + stripe_fd_ctx_t *fctx = NULL; + int i, eof_idx; + off_t dest_offset, tmp_offset; + int32_t op_errno = 1; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); 
@@ -3305,11 +3030,9 @@ stripe_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset) VALIDATE_OR_GOTO (fd->inode, err); priv = this->private; - trav = this->children; /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; @@ -3318,22 +3041,60 @@ stripe_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset) frame->local = local; local->call_count = priv->child_count; - while (trav) { - STACK_WIND (frame, stripe_truncate_cbk, trav->xlator, - trav->xlator->fops->ftruncate, fd, offset); - trav = trav->next; - } + inode_ctx_get(fd->inode, this, (uint64_t *) &fctx); + if (!fctx) { + gf_log(this->name, GF_LOG_ERROR, "no stripe context"); + op_errno = EINVAL; + goto err; + } + if (!fctx->stripe_count) { + gf_log(this->name, GF_LOG_ERROR, "no stripe count"); + op_errno = EINVAL; + goto err; + } + + local->fctx = fctx; + eof_idx = (offset / fctx->stripe_size) % fctx->stripe_count; + + for (i = 0; i < fctx->stripe_count; i++) { + if (!fctx->xl_array[i]) { + gf_log(this->name, GF_LOG_ERROR, "no xlator at index " + "%d", i); + op_errno = EINVAL; + goto err; + } + + if (fctx->stripe_coalesce) { + if (i < eof_idx) + tmp_offset = roof(offset, fctx->stripe_size * + fctx->stripe_count); + else if (i > eof_idx) + tmp_offset = floor(offset, fctx->stripe_size * + fctx->stripe_count); + else + tmp_offset = offset; + + dest_offset = coalesced_offset(tmp_offset, + fctx->stripe_size, fctx->stripe_count); + } else { + dest_offset = offset; + } + + STACK_WIND(frame, stripe_truncate_cbk, fctx->xl_array[i], + fctx->xl_array[i]->fops->ftruncate, fd, dest_offset, + NULL); + } return 0; err: - STRIPE_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL); + STRIPE_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL, NULL); return 0; } int32_t stripe_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t 
op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { int32_t callcnt = 0; stripe_local_t *local = NULL; @@ -3370,14 +3131,14 @@ stripe_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->op_ret = -1; STRIPE_STACK_UNWIND (fsyncdir, frame, local->op_ret, - local->op_errno); + local->op_errno, NULL); } out: return 0; } int32_t -stripe_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags) +stripe_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, dict_t *xdata) { stripe_local_t *local = NULL; stripe_private_t *priv = NULL; @@ -3393,8 +3154,7 @@ stripe_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags) trav = this->children; /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; @@ -3405,20 +3165,20 @@ stripe_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags) while (trav) { STACK_WIND (frame, stripe_fsyncdir_cbk, trav->xlator, - trav->xlator->fops->fsyncdir, fd, flags); + trav->xlator->fops->fsyncdir, fd, flags, NULL); trav = trav->next; } return 0; err: - STRIPE_STACK_UNWIND (fsyncdir, frame, -1, op_errno); + STRIPE_STACK_UNWIND (fsyncdir, frame, -1, op_errno, NULL); return 0; } int32_t stripe_readv_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf) + int32_t op_ret, int32_t op_errno, struct iatt *buf, dict_t *xdata) { int32_t i = 0; int32_t callcnt = 0; @@ -3428,6 +3188,7 @@ stripe_readv_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt tmp_stbuf = {0,}; struct iobref *tmp_iobref = NULL; struct iobuf *iobuf = NULL; + call_frame_t *prev = NULL; if (!this || !frame || !frame->local) { gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); @@ -3435,13 +3196,16 @@ stripe_readv_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t 
*this, } local = frame->local; + prev = cookie; LOCK (&frame->lock); { callcnt = --local->call_count; - if (op_ret != -1) + if (op_ret != -1) { + correct_file_size(buf, local->fctx, prev); if (local->stbuf_size < buf->ia_size) local->stbuf_size = buf->ia_size; + } } UNLOCK (&frame->lock); @@ -3470,7 +3234,8 @@ stripe_readv_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, vec[count].iov_len = (local->replies[i].requested_size - local->replies[i].op_ret); - iobuf = iobuf_get (this->ctx->iobuf_pool); + iobuf = iobuf_get2 (this->ctx->iobuf_pool, + vec[count].iov_len); if (!iobuf) { gf_log (this->name, GF_LOG_ERROR, "Out of memory."); @@ -3479,9 +3244,11 @@ stripe_readv_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto done; } memset (iobuf->ptr, 0, vec[count].iov_len); - iobref_add (local->iobref, iobuf); vec[count].iov_base = iobuf->ptr; + iobref_add (local->iobref, iobuf); + iobuf_unref(iobuf); + op_ret += vec[count].iov_len; count++; } @@ -3499,11 +3266,10 @@ stripe_readv_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, GF_FREE (local->replies); tmp_iobref = local->iobref; STRIPE_STACK_UNWIND (readv, frame, op_ret, op_errno, vec, - count, &tmp_stbuf, tmp_iobref); + count, &tmp_stbuf, tmp_iobref, NULL); iobref_unref (tmp_iobref); - if (vec) - GF_FREE (vec); + GF_FREE (vec); } out: return 0; @@ -3516,7 +3282,7 @@ out: int32_t stripe_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iovec *vector, - int32_t count, struct iatt *stbuf, struct iobref *iobref) + int32_t count, struct iatt *stbuf, struct iobref *iobref, dict_t *xdata) { int32_t index = 0; int32_t callcnt = 0; @@ -3527,8 +3293,10 @@ stripe_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, stripe_local_t *local = NULL; struct iovec *final_vec = NULL; struct iatt tmp_stbuf = {0,}; + struct iatt *tmp_stbuf_p = NULL; //need it for a warning struct iobref *tmp_iobref = NULL; stripe_fd_ctx_t *fctx = NULL; + 
call_frame_t *prev = NULL; if (!this || !frame || !frame->local || !cookie) { gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); @@ -3537,6 +3305,7 @@ stripe_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local = frame->local; index = local->node_index; + prev = cookie; mframe = local->orig_frame; if (!mframe) goto out; @@ -3556,6 +3325,9 @@ stripe_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, mlocal->replies[index].stbuf = *stbuf; mlocal->replies[index].count = count; mlocal->replies[index].vector = iov_dup (vector, count); + + correct_file_size(stbuf, fctx, prev); + if (local->stbuf_size < stbuf->ia_size) local->stbuf_size = stbuf->ia_size; local->stbuf_blocks += stbuf->ia_blocks; @@ -3624,13 +3396,13 @@ stripe_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, GF_FREE (mlocal->replies); tmp_iobref = mlocal->iobref; /* work around for nfs truncated read. Bug 3774 */ - WIPE (&tmp_stbuf); + tmp_stbuf_p = &tmp_stbuf; + WIPE (tmp_stbuf_p); STRIPE_STACK_UNWIND (readv, mframe, op_ret, op_errno, final_vec, - final_count, &tmp_stbuf, tmp_iobref); + final_count, &tmp_stbuf, tmp_iobref, NULL); iobref_unref (tmp_iobref); - if (final_vec) - GF_FREE (final_vec); + GF_FREE (final_vec); } goto out; @@ -3642,7 +3414,7 @@ check_size: STACK_WIND (mframe, stripe_readv_fstat_cbk, (fctx->xl_array[index]), (fctx->xl_array[index])->fops->fstat, - mlocal->fd); + mlocal->fd, NULL); } out: @@ -3654,7 +3426,7 @@ end: int32_t stripe_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, - size_t size, off_t offset) + size_t size, off_t offset, uint32_t flags, dict_t *xdata) { int32_t op_errno = EINVAL; int32_t idx = 0; @@ -3667,6 +3439,7 @@ stripe_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, uint64_t stripe_size = 0; off_t rounded_start = 0; off_t frame_offset = offset; + off_t dest_offset = 0; stripe_local_t *local = NULL; call_frame_t *rframe = NULL; stripe_local_t *rlocal = NULL; @@ -3677,7 +3450,7 @@ stripe_readv (call_frame_t 
*frame, xlator_t *this, fd_t *fd, VALIDATE_OR_GOTO (fd, err); VALIDATE_OR_GOTO (fd->inode, err); - fd_ctx_get (fd, this, &tmp_fctx); + inode_ctx_get (fd->inode, this, &tmp_fctx); if (!tmp_fctx) { op_errno = EBADFD; goto err; @@ -3685,6 +3458,8 @@ stripe_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, fctx = (stripe_fd_ctx_t *)(long)tmp_fctx; stripe_size = fctx->stripe_size; + STRIPE_VALIDATE_FCTX (fctx, err); + if (!stripe_size) { gf_log (this->name, GF_LOG_DEBUG, "Wrong stripe size for the file"); @@ -3699,8 +3474,7 @@ stripe_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, rounded_end = roof (offset+size, stripe_size); num_stripe = (rounded_end- rounded_start)/stripe_size; - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; @@ -3708,8 +3482,8 @@ stripe_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, frame->local = local; /* This is where all the vectors should be copied. 
*/ - local->replies = GF_CALLOC (num_stripe, sizeof (struct readv_replies), - gf_stripe_mt_readv_replies); + local->replies = GF_CALLOC (num_stripe, sizeof (struct stripe_replies), + gf_stripe_mt_stripe_replies); if (!local->replies) { op_errno = ENOMEM; goto err; @@ -3724,8 +3498,7 @@ stripe_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, for (index = off_index; index < (num_stripe + off_index); index++) { rframe = copy_frame (frame); - rlocal = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + rlocal = mem_get0 (this->local_pool); if (!rlocal) { op_errno = ENOMEM; goto err; @@ -3739,9 +3512,16 @@ stripe_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, rlocal->readv_size = frame_size; rframe->local = rlocal; idx = (index % fctx->stripe_count); + + if (fctx->stripe_coalesce) + dest_offset = coalesced_offset(frame_offset, + stripe_size, fctx->stripe_count); + else + dest_offset = frame_offset; + STACK_WIND (rframe, stripe_readv_cbk, fctx->xl_array[idx], fctx->xl_array[idx]->fops->readv, - fd, frame_size, frame_offset); + fd, frame_size, dest_offset, flags, xdata); frame_offset += frame_size; } @@ -3751,7 +3531,7 @@ err: if (rframe) STRIPE_STACK_DESTROY (rframe); - STRIPE_STACK_UNWIND (readv, frame, -1, op_errno, NULL, 0, NULL, NULL); + STRIPE_STACK_UNWIND (readv, frame, -1, op_errno, NULL, 0, NULL, NULL, NULL); return 0; } @@ -3759,11 +3539,15 @@ err: int32_t stripe_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) + struct iatt *postbuf, dict_t *xdata) { int32_t callcnt = 0; stripe_local_t *local = NULL; + stripe_local_t *mlocal = NULL; call_frame_t *prev = NULL; + call_frame_t *mframe = NULL; + struct stripe_replies *reply = NULL; + int32_t i = 0; if (!this || !frame || !frame->local || !cookie) { gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); @@ -3772,39 +3556,82 @@ stripe_writev_cbk (call_frame_t *frame, void *cookie, xlator_t 
*this, prev = cookie; local = frame->local; + mframe = local->orig_frame; + mlocal = mframe->local; LOCK(&frame->lock); { - callcnt = ++local->call_count; + callcnt = ++mlocal->call_count; + + mlocal->replies[local->node_index].op_ret = op_ret; + mlocal->replies[local->node_index].op_errno = op_errno; - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "%s returned error %s", - prev->this->name, strerror (op_errno)); - local->op_errno = op_errno; - local->op_ret = -1; - } if (op_ret >= 0) { - local->op_ret += op_ret; - local->post_buf = *postbuf; - local->pre_buf = *prebuf; + mlocal->post_buf = *postbuf; + mlocal->pre_buf = *prebuf; + + mlocal->prebuf_blocks += prebuf->ia_blocks; + mlocal->postbuf_blocks += postbuf->ia_blocks; + + correct_file_size(prebuf, mlocal->fctx, prev); + correct_file_size(postbuf, mlocal->fctx, prev); + + if (mlocal->prebuf_size < prebuf->ia_size) + mlocal->prebuf_size = prebuf->ia_size; + if (mlocal->postbuf_size < postbuf->ia_size) + mlocal->postbuf_size = postbuf->ia_size; } } UNLOCK (&frame->lock); - if ((callcnt == local->wind_count) && local->unwind) { - STRIPE_STACK_UNWIND (writev, frame, local->op_ret, - local->op_errno, &local->pre_buf, - &local->post_buf); + if ((callcnt == mlocal->wind_count) && mlocal->unwind) { + mlocal->pre_buf.ia_size = mlocal->prebuf_size; + mlocal->pre_buf.ia_blocks = mlocal->prebuf_blocks; + mlocal->post_buf.ia_size = mlocal->postbuf_size; + mlocal->post_buf.ia_blocks = mlocal->postbuf_blocks; + + /* + * Only return the number of consecutively written bytes up until + * the first error. Only return an error if it occurs first. + * + * When a short write occurs, the application should retry at the + * appropriate offset, at which point we'll potentially pass back + * the error. 
+ */ + for (i = 0, reply = mlocal->replies; i < mlocal->wind_count; + i++, reply++) { + if (reply->op_ret == -1) { + gf_log(this->name, GF_LOG_DEBUG, "reply %d " + "returned error %s", i, + strerror(reply->op_errno)); + if (!mlocal->op_ret) { + mlocal->op_ret = -1; + mlocal->op_errno = reply->op_errno; + } + break; + } + + mlocal->op_ret += reply->op_ret; + + if (reply->op_ret < reply->requested_size) + break; + } + + GF_FREE(mlocal->replies); + + STRIPE_STACK_UNWIND (writev, mframe, mlocal->op_ret, + mlocal->op_errno, &mlocal->pre_buf, + &mlocal->post_buf, NULL); } out: + STRIPE_STACK_DESTROY(frame); return 0; } int32_t stripe_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, int32_t count, off_t offset, - struct iobref *iobref) + uint32_t flags, struct iobref *iobref, dict_t *xdata) { struct iovec *tmp_vec = NULL; stripe_local_t *local = NULL; @@ -3818,13 +3645,19 @@ stripe_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t fill_size = 0; uint64_t stripe_size = 0; uint64_t tmp_fctx = 0; + off_t dest_offset = 0; + off_t rounded_start = 0; + off_t rounded_end = 0; + int32_t total_chunks = 0; + call_frame_t *wframe = NULL; + stripe_local_t *wlocal = NULL; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (fd, err); VALIDATE_OR_GOTO (fd->inode, err); - fd_ctx_get (fd, this, &tmp_fctx); + inode_ctx_get (fd->inode, this, &tmp_fctx); if (!tmp_fctx) { op_errno = EINVAL; goto err; @@ -3832,22 +3665,51 @@ stripe_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, fctx = (stripe_fd_ctx_t *)(long)tmp_fctx; stripe_size = fctx->stripe_size; + STRIPE_VALIDATE_FCTX (fctx, err); + /* File has to be stripped across the child nodes */ for (idx = 0; idx< count; idx ++) { total_size += vector[idx].iov_len; } remaining_size = total_size; - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; } frame->local = 
local; local->stripe_size = stripe_size; + local->fctx = fctx; + if (!stripe_size) { + gf_log (this->name, GF_LOG_DEBUG, + "Wrong stripe size for the file"); + op_errno = EINVAL; + goto err; + } + + rounded_start = floor(offset, stripe_size); + rounded_end = roof(offset + total_size, stripe_size); + total_chunks = (rounded_end - rounded_start) / stripe_size; + local->replies = GF_CALLOC(total_chunks, sizeof(struct stripe_replies), + gf_stripe_mt_stripe_replies); + if (!local->replies) { + op_errno = ENOMEM; + goto err; + } + + total_chunks = 0; while (1) { + wframe = copy_frame(frame); + wlocal = mem_get0(this->local_pool); + if (!wlocal) { + op_errno = ENOMEM; + goto err; + } + wlocal->orig_frame = frame; + wframe->local = wlocal; + /* Send striped chunk of the vector to child nodes appropriately. */ idx = (((offset + offset_offset) / @@ -3875,47 +3737,589 @@ stripe_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, if (remaining_size == 0) local->unwind = 1; - STACK_WIND (frame, stripe_writev_cbk, fctx->xl_array[idx], + /* + * Store off the request index (with respect to the chunk of the + * initial offset) and the size of the request. This is required + * in the callback to calculate an appropriate return value in + * the event of a write failure in one or more requests. 
+ */ + wlocal->node_index = total_chunks; + local->replies[total_chunks].requested_size = fill_size; + + dest_offset = offset + offset_offset; + if (fctx->stripe_coalesce) + dest_offset = coalesced_offset(dest_offset, + local->stripe_size, fctx->stripe_count); + + STACK_WIND (wframe, stripe_writev_cbk, fctx->xl_array[idx], fctx->xl_array[idx]->fops->writev, fd, tmp_vec, - tmp_count, offset + offset_offset, iobref); + tmp_count, dest_offset, flags, iobref, + xdata); + GF_FREE (tmp_vec); offset_offset += fill_size; + total_chunks++; if (remaining_size == 0) break; } return 0; err: - STRIPE_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL); + if (wframe) + STRIPE_STACK_DESTROY(wframe); + + STRIPE_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL); return 0; } int32_t -stripe_release (xlator_t *this, fd_t *fd) +stripe_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { + int32_t callcnt = 0; + stripe_local_t *local = NULL; + stripe_local_t *mlocal = NULL; + call_frame_t *prev = NULL; + call_frame_t *mframe = NULL; + + if (!this || !frame || !frame->local || !cookie) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; + } + + prev = cookie; + local = frame->local; + mframe = local->orig_frame; + mlocal = mframe->local; + + LOCK(&frame->lock); + { + callcnt = ++mlocal->call_count; + + if (op_ret == 0) { + mlocal->post_buf = *postbuf; + mlocal->pre_buf = *prebuf; + + mlocal->prebuf_blocks += prebuf->ia_blocks; + mlocal->postbuf_blocks += postbuf->ia_blocks; + + correct_file_size(prebuf, mlocal->fctx, prev); + correct_file_size(postbuf, mlocal->fctx, prev); + + if (mlocal->prebuf_size < prebuf->ia_size) + mlocal->prebuf_size = prebuf->ia_size; + if (mlocal->postbuf_size < postbuf->ia_size) + mlocal->postbuf_size = postbuf->ia_size; + } + + /* return the first failure */ + if (mlocal->op_ret == 0) { + mlocal->op_ret = op_ret; + 
mlocal->op_errno = op_errno; + } + } + UNLOCK (&frame->lock); + + if ((callcnt == mlocal->wind_count) && mlocal->unwind) { + mlocal->pre_buf.ia_size = mlocal->prebuf_size; + mlocal->pre_buf.ia_blocks = mlocal->prebuf_blocks; + mlocal->post_buf.ia_size = mlocal->postbuf_size; + mlocal->post_buf.ia_blocks = mlocal->postbuf_blocks; + + STRIPE_STACK_UNWIND (fallocate, mframe, mlocal->op_ret, + mlocal->op_errno, &mlocal->pre_buf, + &mlocal->post_buf, NULL); + } +out: + STRIPE_STACK_DESTROY(frame); + return 0; +} + +int32_t +stripe_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata) +{ + stripe_local_t *local = NULL; + stripe_fd_ctx_t *fctx = NULL; + int32_t op_errno = 1; + int32_t idx = 0; + int32_t offset_offset = 0; + int32_t remaining_size = 0; + off_t fill_size = 0; + uint64_t stripe_size = 0; uint64_t tmp_fctx = 0; + off_t dest_offset = 0; + call_frame_t *fframe = NULL; + stripe_local_t *flocal = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + + inode_ctx_get (fd->inode, this, &tmp_fctx); + if (!tmp_fctx) { + op_errno = EINVAL; + goto err; + } + fctx = (stripe_fd_ctx_t *)(long)tmp_fctx; + stripe_size = fctx->stripe_size; + + STRIPE_VALIDATE_FCTX (fctx, err); + + remaining_size = len; + + local = mem_get0 (this->local_pool); + if (!local) { + op_errno = ENOMEM; + goto err; + } + frame->local = local; + local->stripe_size = stripe_size; + local->fctx = fctx; + + if (!stripe_size) { + gf_log (this->name, GF_LOG_DEBUG, + "Wrong stripe size for the file"); + op_errno = EINVAL; + goto err; + } + + while (1) { + fframe = copy_frame(frame); + flocal = mem_get0(this->local_pool); + if (!flocal) { + op_errno = ENOMEM; + goto err; + } + flocal->orig_frame = frame; + fframe->local = flocal; + + /* send fallocate request to the associated child node */ + idx = (((offset + offset_offset) / + local->stripe_size) % 
fctx->stripe_count); + + fill_size = (local->stripe_size - + ((offset + offset_offset) % local->stripe_size)); + if (fill_size > remaining_size) + fill_size = remaining_size; + + remaining_size -= fill_size; + + local->wind_count++; + if (remaining_size == 0) + local->unwind = 1; + + dest_offset = offset + offset_offset; + if (fctx->stripe_coalesce) + dest_offset = coalesced_offset(dest_offset, + local->stripe_size, fctx->stripe_count); + + /* + * TODO: Create a separate handler for coalesce mode that sends a + * single fallocate per-child (since the ranges are linear). + */ + STACK_WIND(fframe, stripe_fallocate_cbk, fctx->xl_array[idx], + fctx->xl_array[idx]->fops->fallocate, fd, mode, + dest_offset, fill_size, xdata); + + offset_offset += fill_size; + if (remaining_size == 0) + break; + } + + return 0; +err: + if (fframe) + STRIPE_STACK_DESTROY(fframe); + + STRIPE_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + + +int32_t +stripe_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + int32_t callcnt = 0; + stripe_local_t *local = NULL; + stripe_local_t *mlocal = NULL; + call_frame_t *prev = NULL; + call_frame_t *mframe = NULL; + + if (!this || !frame || !frame->local || !cookie) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; + } + + prev = cookie; + local = frame->local; + mframe = local->orig_frame; + mlocal = mframe->local; + + LOCK(&frame->lock); + { + callcnt = ++mlocal->call_count; + + if (op_ret == 0) { + mlocal->post_buf = *postbuf; + mlocal->pre_buf = *prebuf; + + mlocal->prebuf_blocks += prebuf->ia_blocks; + mlocal->postbuf_blocks += postbuf->ia_blocks; + + correct_file_size(prebuf, mlocal->fctx, prev); + correct_file_size(postbuf, mlocal->fctx, prev); + + if (mlocal->prebuf_size < prebuf->ia_size) + mlocal->prebuf_size = prebuf->ia_size; + if (mlocal->postbuf_size < postbuf->ia_size) + 
mlocal->postbuf_size = postbuf->ia_size; + } + + /* return the first failure */ + if (mlocal->op_ret == 0) { + mlocal->op_ret = op_ret; + mlocal->op_errno = op_errno; + } + } + UNLOCK (&frame->lock); + + if ((callcnt == mlocal->wind_count) && mlocal->unwind) { + mlocal->pre_buf.ia_size = mlocal->prebuf_size; + mlocal->pre_buf.ia_blocks = mlocal->prebuf_blocks; + mlocal->post_buf.ia_size = mlocal->postbuf_size; + mlocal->post_buf.ia_blocks = mlocal->postbuf_blocks; + + STRIPE_STACK_UNWIND (discard, mframe, mlocal->op_ret, + mlocal->op_errno, &mlocal->pre_buf, + &mlocal->post_buf, NULL); + } +out: + STRIPE_STACK_DESTROY(frame); + return 0; +} + +int32_t +stripe_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + stripe_local_t *local = NULL; stripe_fd_ctx_t *fctx = NULL; + int32_t op_errno = 1; + int32_t idx = 0; + int32_t offset_offset = 0; + int32_t remaining_size = 0; + off_t fill_size = 0; + uint64_t stripe_size = 0; + uint64_t tmp_fctx = 0; + off_t dest_offset = 0; + call_frame_t *fframe = NULL; + stripe_local_t *flocal = NULL; + VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); - fd_ctx_del (fd, this, &tmp_fctx); + inode_ctx_get (fd->inode, this, &tmp_fctx); if (!tmp_fctx) { + op_errno = EINVAL; goto err; } + fctx = (stripe_fd_ctx_t *)(long)tmp_fctx; + stripe_size = fctx->stripe_size; + + STRIPE_VALIDATE_FCTX (fctx, err); + + remaining_size = len; + + local = mem_get0 (this->local_pool); + if (!local) { + op_errno = ENOMEM; + goto err; + } + frame->local = local; + local->stripe_size = stripe_size; + local->fctx = fctx; + + if (!stripe_size) { + gf_log (this->name, GF_LOG_DEBUG, + "Wrong stripe size for the file"); + op_errno = EINVAL; + goto err; + } + + while (1) { + fframe = copy_frame(frame); + flocal = mem_get0(this->local_pool); + if (!flocal) { + op_errno = ENOMEM; + goto err; + } + flocal->orig_frame = frame; + fframe->local = 
flocal; + + /* send discard request to the associated child node */ + idx = (((offset + offset_offset) / + local->stripe_size) % fctx->stripe_count); + + fill_size = (local->stripe_size - + ((offset + offset_offset) % local->stripe_size)); + if (fill_size > remaining_size) + fill_size = remaining_size; + + remaining_size -= fill_size; + + local->wind_count++; + if (remaining_size == 0) + local->unwind = 1; + + dest_offset = offset + offset_offset; + if (fctx->stripe_coalesce) + dest_offset = coalesced_offset(dest_offset, + local->stripe_size, fctx->stripe_count); + + /* + * TODO: Create a separate handler for coalesce mode that sends a + * single discard per-child (since the ranges are linear). + */ + STACK_WIND(fframe, stripe_discard_cbk, fctx->xl_array[idx], + fctx->xl_array[idx]->fops->discard, fd, dest_offset, + fill_size, xdata); + + offset_offset += fill_size; + if (remaining_size == 0) + break; + } + + return 0; +err: + if (fframe) + STRIPE_STACK_DESTROY(fframe); + + STRIPE_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + +int32_t +stripe_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + int32_t callcnt = 0; + stripe_local_t *local = NULL; + stripe_local_t *mlocal = NULL; + call_frame_t *prev = NULL; + call_frame_t *mframe = NULL; + + if (!this || !frame || !frame->local || !cookie) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; + } + + prev = cookie; + local = frame->local; + mframe = local->orig_frame; + mlocal = mframe->local; + + LOCK(&frame->lock); + { + callcnt = ++mlocal->call_count; + + if (op_ret == 0) { + mlocal->post_buf = *postbuf; + mlocal->pre_buf = *prebuf; + + mlocal->prebuf_blocks += prebuf->ia_blocks; + mlocal->postbuf_blocks += postbuf->ia_blocks; + + correct_file_size(prebuf, mlocal->fctx, prev); + correct_file_size(postbuf, mlocal->fctx, prev); + + if 
(mlocal->prebuf_size < prebuf->ia_size) + mlocal->prebuf_size = prebuf->ia_size; + if (mlocal->postbuf_size < postbuf->ia_size) + mlocal->postbuf_size = postbuf->ia_size; + } + /* return the first failure */ + if (mlocal->op_ret == 0) { + mlocal->op_ret = op_ret; + mlocal->op_errno = op_errno; + } + } + UNLOCK (&frame->lock); + + if ((callcnt == mlocal->wind_count) && mlocal->unwind) { + mlocal->pre_buf.ia_size = mlocal->prebuf_size; + mlocal->pre_buf.ia_blocks = mlocal->prebuf_blocks; + mlocal->post_buf.ia_size = mlocal->postbuf_size; + mlocal->post_buf.ia_blocks = mlocal->postbuf_blocks; + + STRIPE_STACK_UNWIND (zerofill, mframe, mlocal->op_ret, + mlocal->op_errno, &mlocal->pre_buf, + &mlocal->post_buf, NULL); + } +out: + STRIPE_STACK_DESTROY(frame); + return 0; +} + +int32_t +stripe_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + stripe_local_t *local = NULL; + stripe_fd_ctx_t *fctx = NULL; + int32_t op_errno = 1; + int32_t idx = 0; + int32_t offset_offset = 0; + int32_t remaining_size = 0; + off_t fill_size = 0; + uint64_t stripe_size = 0; + uint64_t tmp_fctx = 0; + off_t dest_offset = 0; + call_frame_t *fframe = NULL; + stripe_local_t *flocal = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + + inode_ctx_get (fd->inode, this, &tmp_fctx); + if (!tmp_fctx) { + op_errno = EINVAL; + goto err; + } fctx = (stripe_fd_ctx_t *)(long)tmp_fctx; + stripe_size = fctx->stripe_size; - if (!fctx->static_array) - GF_FREE (fctx->xl_array); + STRIPE_VALIDATE_FCTX (fctx, err); - GF_FREE (fctx); + remaining_size = len; + + local = mem_get0 (this->local_pool); + if (!local) { + op_errno = ENOMEM; + goto err; + } + fctx = (stripe_fd_ctx_t *)(long)tmp_fctx; + stripe_size = fctx->stripe_size; + + STRIPE_VALIDATE_FCTX (fctx, err); + + remaining_size = len; + + local = mem_get0 (this->local_pool); + if (!local) { + op_errno = ENOMEM; + goto 
err; + } + fctx = (stripe_fd_ctx_t *)(long)tmp_fctx; + stripe_size = fctx->stripe_size; + + STRIPE_VALIDATE_FCTX (fctx, err); + + remaining_size = len; + + local = mem_get0 (this->local_pool); + if (!local) { + op_errno = ENOMEM; + goto err; + } + frame->local = local; + local->stripe_size = stripe_size; + local->fctx = fctx; + + if (!stripe_size) { + gf_log (this->name, GF_LOG_DEBUG, + "Wrong stripe size for the file"); + op_errno = EINVAL; + goto err; + } + + while (1) { + fframe = copy_frame(frame); + flocal = mem_get0(this->local_pool); + if (!flocal) { + op_errno = ENOMEM; + goto err; + } + flocal->orig_frame = frame; + fframe->local = flocal; + + idx = (((offset + offset_offset) / + local->stripe_size) % fctx->stripe_count); + + fill_size = (local->stripe_size - + ((offset + offset_offset) % local->stripe_size)); + if (fill_size > remaining_size) + fill_size = remaining_size; + remaining_size -= fill_size; + + local->wind_count++; + if (remaining_size == 0) + local->unwind = 1; + + dest_offset = offset + offset_offset; + if (fctx->stripe_coalesce) + dest_offset = coalesced_offset(dest_offset, + local->stripe_size, + fctx->stripe_count); + + STACK_WIND(fframe, stripe_zerofill_cbk, fctx->xl_array[idx], + fctx->xl_array[idx]->fops->zerofill, fd, + dest_offset, fill_size, xdata); + offset_offset += fill_size; + if (remaining_size == 0) + break; + } + + return 0; err: + if (fframe) + STRIPE_STACK_DESTROY(fframe); + + STRIPE_STACK_UNWIND (zerofill, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + +int32_t +stripe_release (xlator_t *this, fd_t *fd) +{ return 0; } +int +stripe_forget (xlator_t *this, inode_t *inode) +{ + uint64_t tmp_fctx = 0; + stripe_fd_ctx_t *fctx = NULL; + + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (inode, err); + + (void) inode_ctx_del (inode, this, &tmp_fctx); + if (!tmp_fctx) { + goto err; + } + + fctx = (stripe_fd_ctx_t *)(long)tmp_fctx; + + if (!fctx->static_array) + GF_FREE (fctx->xl_array); + + GF_FREE (fctx); +err: + 
return 0; +} int32_t notify (xlator_t *this, int32_t event, void *data, ...) @@ -3923,6 +4327,7 @@ notify (xlator_t *this, int32_t event, void *data, ...) stripe_private_t *priv = NULL; int down_client = 0; int i = 0; + gf_boolean_t heard_from_all_children = _gf_false; if (!this) return 0; @@ -3934,30 +4339,34 @@ notify (xlator_t *this, int32_t event, void *data, ...) switch (event) { case GF_EVENT_CHILD_UP: - case GF_EVENT_CHILD_CONNECTING: { /* get an index number to set */ for (i = 0; i < priv->child_count; i++) { if (data == priv->xl_array[i]) break; } - priv->state[i] = 1; - for (i = 0; i < priv->child_count; i++) { - if (!priv->state[i]) - down_client++; + + if (priv->child_count == i) { + gf_log (this->name, GF_LOG_ERROR, + "got GF_EVENT_CHILD_UP bad subvolume %s", + data? ((xlator_t *)data)->name: NULL); + break; } LOCK (&priv->lock); { - priv->nodes_down = down_client; if (data == FIRST_CHILD (this)) priv->first_child_down = 0; - if (!priv->nodes_down) - default_notify (this, event, data); + priv->last_event[i] = event; } UNLOCK (&priv->lock); } break; + case GF_EVENT_CHILD_CONNECTING: + { + // 'CONNECTING' doesn't ensure its CHILD_UP, so do nothing + goto out; + } case GF_EVENT_CHILD_DOWN: { /* get an index number to set */ @@ -3965,20 +4374,19 @@ notify (xlator_t *this, int32_t event, void *data, ...) if (data == priv->xl_array[i]) break; } - priv->state[i] = 0; - for (i = 0; i < priv->child_count; i++) { - if (!priv->state[i]) - down_client++; + + if (priv->child_count == i) { + gf_log (this->name, GF_LOG_ERROR, + "got GF_EVENT_CHILD_DOWN bad subvolume %s", + data? ((xlator_t *)data)->name: NULL); + break; } LOCK (&priv->lock); { - priv->nodes_down = down_client; - if (data == FIRST_CHILD (this)) priv->first_child_down = 1; - if (priv->nodes_down) - default_notify (this, event, data); + priv->last_event[i] = event; } UNLOCK (&priv->lock); } @@ -3988,79 +4396,252 @@ notify (xlator_t *this, int32_t event, void *data, ...) 
{ /* */ default_notify (this, event, data); + goto out; } break; } + // Consider child as down if it's last_event is not CHILD_UP + for (i = 0, down_client = 0; i < priv->child_count; i++) + if (priv->last_event[i] != GF_EVENT_CHILD_UP) + down_client++; + + LOCK (&priv->lock); + { + priv->nodes_down = down_client; + } + UNLOCK (&priv->lock); + + heard_from_all_children = _gf_true; + for (i = 0; i < priv->child_count; i++) + if (!priv->last_event[i]) + heard_from_all_children = _gf_false; + + if (heard_from_all_children) + default_notify (this, event, data); +out: return 0; } int -set_stripe_block_size (xlator_t *this, stripe_private_t *priv, char *data) +stripe_setxattr_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, dict_t *xdata) { - int ret = -1; - char *tmp_str = NULL; - char *tmp_str1 = NULL; - char *dup_str = NULL; - char *stripe_str = NULL; - char *pattern = NULL; - char *num = NULL; - struct stripe_options *temp_stripeopt = NULL; - struct stripe_options *stripe_opt = NULL; - - if (!this || !priv || !data) - goto out; + int ret = -1; + int call_cnt = 0; + stripe_local_t *local = NULL; - /* Get the pattern for striping. - "option block-size *avi:10MB" etc */ - stripe_str = strtok_r (data, ",", &tmp_str); - while (stripe_str) { - dup_str = gf_strdup (stripe_str); - stripe_opt = CALLOC (1, sizeof (struct stripe_options)); - if (!stripe_opt) { - GF_FREE (dup_str); - goto out; - } + if (!frame || !frame->local || !this) { + gf_log ("", GF_LOG_ERROR, "Possible NULL deref"); + return ret; + } - pattern = strtok_r (dup_str, ":", &tmp_str1); - num = strtok_r (NULL, ":", &tmp_str1); - if (!num) { - num = pattern; - pattern = "*"; - } - if (gf_string2bytesize (num, &stripe_opt->block_size) != 0) { - gf_log (this->name, GF_LOG_ERROR, - "invalid number format \"%s\"", num); - goto out; - } + local = frame->local; - if (stripe_opt->block_size < 512) { - gf_log (this->name, GF_LOG_ERROR, "Invalid Block-size: " - "%s. 
Should be atleast 512 bytes", num); - goto out; + LOCK (&frame->lock); + { + call_cnt = --local->wind_count; + + /** + * We overwrite ->op_* values here for subsequent faliure + * conditions, hence we propogate the last errno down the + * stack. + */ + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; + goto unlock; } - if (stripe_opt->block_size % 512) { - gf_log (this->name, GF_LOG_ERROR, "Block-size: %s should" - " be a multiple of 512 bytes", num); - goto out; + } + + unlock: + UNLOCK (&frame->lock); + + if (!call_cnt) { + STRIPE_STACK_UNWIND (setxattr, frame, local->op_ret, + local->op_errno, xdata); + } + + return 0; +} + +#ifdef HAVE_BD_XLATOR +int +stripe_is_bd (dict_t *this, char *key, data_t *value, void *data) +{ + gf_boolean_t *is_bd = data; + + if (data == NULL) + return 0; + + if (XATTR_IS_BD (key)) + *is_bd = _gf_true; + + return 0; +} + +inline gf_boolean_t +stripe_setxattr_is_bd (dict_t *dict) +{ + gf_boolean_t is_bd = _gf_false; + + if (dict == NULL) + goto out; + + dict_foreach (dict, stripe_is_bd, &is_bd); +out: + return is_bd; +} +#else +#define stripe_setxattr_is_bd(dict) _gf_false +#endif + +int +stripe_setxattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, dict_t *dict, int flags, dict_t *xdata) +{ + int32_t op_errno = EINVAL; + xlator_list_t *trav = NULL; + stripe_private_t *priv = NULL; + stripe_local_t *local = NULL; + int i = 0; + gf_boolean_t is_bd = _gf_false; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + + GF_IF_INTERNAL_XATTR_GOTO ("trusted.*stripe*", dict, + op_errno, err); + + priv = this->private; + trav = this->children; + + local = mem_get0 (this->local_pool); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + frame->local = local; + local->wind_count = priv->child_count; + local->op_ret = local->op_errno = 0; + + is_bd = stripe_setxattr_is_bd (dict); + + /** + * Set xattrs for directories on all 
subvolumes. Additionally + * this power is only given to a special client. Bd xlator + * also needs xattrs for regular files (ie LVs) + */ + if (((frame->root->pid == GF_CLIENT_PID_GSYNCD) && + IA_ISDIR (loc->inode->ia_type)) || is_bd) { + for (i = 0; i < priv->child_count; i++, trav = trav->next) { + STACK_WIND (frame, stripe_setxattr_cbk, + trav->xlator, trav->xlator->fops->setxattr, + loc, dict, flags, xdata); } + } else { + local->wind_count = 1; + STACK_WIND (frame, stripe_setxattr_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, + loc, dict, flags, xdata); + } - memcpy (stripe_opt->path_pattern, pattern, strlen (pattern)); + return 0; +err: + STRIPE_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); + return 0; +} - gf_log (this->name, GF_LOG_DEBUG, - "block-size : pattern %s : size %"PRId64, - stripe_opt->path_pattern, stripe_opt->block_size); - if (!priv->pattern) { - priv->pattern = stripe_opt; - } else { - temp_stripeopt = priv->pattern; - while (temp_stripeopt->next) - temp_stripeopt = temp_stripeopt->next; - temp_stripeopt->next = stripe_opt; +int +stripe_fsetxattr_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, dict_t *xdata) +{ + STRIPE_STACK_UNWIND (fsetxattr, frame, op_ret, op_errno, xdata); + return 0; +} + + +int +stripe_is_special_key (dict_t *this, + char *key, + data_t *value, + void *data) +{ + gf_boolean_t *is_special = NULL; + + if (data == NULL) { + goto out; + } + + is_special = data; + + if (XATTR_IS_LOCKINFO (key) || XATTR_IS_BD (key)) + *is_special = _gf_true; + +out: + return 0; +} + +int32_t +stripe_fsetxattr_everyone_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *xdata) +{ + int call_count = 0; + stripe_local_t *local = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + call_count = --local->wind_count; + + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; } - stripe_str = strtok_r (NULL, ",", 
&tmp_str); - GF_FREE (dup_str); + } + UNLOCK (&frame->lock); + + if (call_count == 0) { + STRIPE_STACK_UNWIND (fsetxattr, frame, local->op_ret, + local->op_errno, NULL); + } + return 0; +} + +int +stripe_fsetxattr_to_everyone (call_frame_t *frame, xlator_t *this, fd_t *fd, + dict_t *dict, int flags, dict_t *xdata) +{ + xlator_list_t *trav = NULL; + stripe_private_t *priv = NULL; + int ret = -1; + stripe_local_t *local = NULL; + + priv = this->private; + + local = mem_get0 (this->local_pool); + if (local == NULL) { + goto out; + } + + frame->local = local; + + local->wind_count = priv->child_count; + + trav = this->children; + + while (trav) { + STACK_WIND (frame, stripe_fsetxattr_everyone_cbk, + trav->xlator, trav->xlator->fops->fsetxattr, + fd, dict, flags, xdata); + trav = trav->next; } ret = 0; @@ -4068,80 +4649,220 @@ out: return ret; } -int32_t -stripe_iatt_merge (struct iatt *from, struct iatt *to) +inline gf_boolean_t +stripe_fsetxattr_is_special (dict_t *dict) { - if (to->ia_size < from->ia_size) - to->ia_size = from->ia_size; - if (to->ia_mtime < from->ia_mtime) - to->ia_mtime = from->ia_mtime; - if (to->ia_ctime < from->ia_ctime) - to->ia_ctime = from->ia_ctime; - if (to->ia_atime < from->ia_atime) - to->ia_atime = from->ia_atime; - return 0; + gf_boolean_t is_spl = _gf_false; + + if (dict == NULL) { + goto out; + } + + dict_foreach (dict, stripe_is_special_key, &is_spl); + +out: + return is_spl; } -int32_t -stripe_readdirp_entry_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf) +int +stripe_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + dict_t *dict, int flags, dict_t *xdata) { - gf_dirent_t *entry = NULL; - stripe_local_t *local = NULL; - int32_t done = 0; + int32_t op_ret = -1, ret = -1, op_errno = EINVAL; + gf_boolean_t is_spl = _gf_false; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + GF_IF_INTERNAL_XATTR_GOTO 
("trusted.*stripe*", dict, + op_errno, err); + + is_spl = stripe_fsetxattr_is_special (dict); + if (is_spl) { + ret = stripe_fsetxattr_to_everyone (frame, this, fd, dict, + flags, xdata); + if (ret < 0) { + op_errno = ENOMEM; + goto err; + } - if (!this || !frame || !frame->local || !cookie) { - gf_log (this->name, GF_LOG_DEBUG, "possible NULL deref"); goto out; } - entry = cookie; + + STACK_WIND (frame, stripe_fsetxattr_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetxattr, + fd, dict, flags, xdata); +out: + return 0; +err: + STRIPE_STACK_UNWIND (fsetxattr, frame, op_ret, op_errno, NULL); + return 0; +} + +int +stripe_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + STRIPE_STACK_UNWIND (removexattr, frame, op_ret, op_errno, xdata); + return 0; +} + +int +stripe_removexattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *name, dict_t *xdata) +{ + int32_t op_errno = EINVAL; + + VALIDATE_OR_GOTO (this, err); + + GF_IF_NATIVE_XATTR_GOTO ("trusted.*stripe*", + name, op_errno, err); + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (loc, err); + + STACK_WIND (frame, stripe_removexattr_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->removexattr, + loc, name, xdata); + return 0; +err: + STRIPE_STACK_UNWIND (removexattr, frame, -1, op_errno, NULL); + return 0; +} + + +int +stripe_fremovexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + STRIPE_STACK_UNWIND (fremovexattr, frame, op_ret, op_errno, xdata); + return 0; +} + +int +stripe_fremovexattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *name, dict_t *xdata) +{ + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + GF_IF_NATIVE_XATTR_GOTO ("trusted.*stripe*", + name, op_errno, err); + + STACK_WIND (frame, stripe_fremovexattr_cbk, + 
FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fremovexattr, + fd, name, xdata); + return 0; + err: + STRIPE_STACK_UNWIND (fremovexattr, frame, op_ret, op_errno, xdata); + return 0; +} + +int32_t +stripe_readdirp_lookup_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, + inode_t *inode, struct iatt *stbuf, + dict_t *xattr, struct iatt *parent) +{ + stripe_local_t *local = NULL; + call_frame_t *main_frame = NULL; + stripe_local_t *main_local = NULL; + gf_dirent_t *entry = NULL; + call_frame_t *prev = NULL; + int done = 0; + local = frame->local; + prev = cookie; + + entry = local->dirent; + + main_frame = local->orig_frame; + main_local = main_frame->local; LOCK (&frame->lock); { - local->wind_count--; - if (!local->wind_count) + local->call_count--; + if (!local->call_count) done = 1; if (op_ret == -1) { local->op_errno = op_errno; local->op_ret = op_ret; goto unlock; } - stripe_iatt_merge (buf, &entry->d_stat); + + if (stripe_ctx_handle(this, prev, local, xattr)) + gf_log(this->name, GF_LOG_ERROR, + "Error getting fctx info from dict."); + + correct_file_size(stbuf, local->fctx, prev); + + stripe_iatt_merge (stbuf, &entry->d_stat); + local->stbuf_blocks += stbuf->ia_blocks; } unlock: UNLOCK(&frame->lock); if (done) { - frame->local = NULL; - STRIPE_STACK_UNWIND (readdir, frame, local->op_ret, - local->op_errno, &local->entries); + inode_ctx_put (entry->inode, this, + (uint64_t) (long)local->fctx); - gf_dirent_free (&local->entries); + done = 0; + LOCK (&main_frame->lock); + { + main_local->wind_count--; + if (!main_local->wind_count) + done = 1; + if (local->op_ret == -1) { + main_local->op_errno = local->op_errno; + main_local->op_ret = local->op_ret; + } + entry->d_stat.ia_blocks = local->stbuf_blocks; + } + UNLOCK (&main_frame->lock); + if (done) { + main_frame->local = NULL; + STRIPE_STACK_UNWIND (readdir, main_frame, + main_local->op_ret, + main_local->op_errno, + &main_local->entries, NULL); + gf_dirent_free 
(&main_local->entries); + stripe_local_wipe (main_local); + mem_put (main_local); + } + frame->local = NULL; stripe_local_wipe (local); - GF_FREE (local); + mem_put (local); + STRIPE_STACK_DESTROY (frame); } -out: - return 0; + return 0; } int32_t stripe_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, gf_dirent_t *orig_entries) + int32_t op_ret, int32_t op_errno, + gf_dirent_t *orig_entries, dict_t *xdata) { stripe_local_t *local = NULL; call_frame_t *prev = NULL; gf_dirent_t *local_entry = NULL; - int32_t ret = -1; gf_dirent_t *tmp_entry = NULL; xlator_list_t *trav = NULL; loc_t loc = {0, }; - inode_t *inode = NULL; - char *path; int32_t count = 0; stripe_private_t *priv = NULL; int32_t subvols = 0; + dict_t *xattrs = NULL; + call_frame_t *local_frame = NULL; + stripe_local_t *local_ent = NULL; if (!this || !frame || !frame->local || !cookie) { gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); @@ -4167,8 +4888,9 @@ stripe_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->op_ret = op_ret; list_splice_init (&orig_entries->list, &local->entries.list); - local->wind_count = op_ret * subvols; + local->wind_count = op_ret; } + } unlock: UNLOCK (&frame->lock); @@ -4176,8 +4898,10 @@ unlock: if (op_ret == -1) goto out; + xattrs = dict_new (); + if (xattrs) + (void) stripe_xattr_request_build (this, xattrs, 0, 0, 0, 0); count = op_ret; - ret = 0; list_for_each_entry_safe (local_entry, tmp_entry, (&local->entries.list), list) { @@ -4186,67 +4910,73 @@ unlock: if (!IA_ISREG (local_entry->d_stat.ia_type)) { LOCK (&frame->lock); { - local->wind_count -= subvols; + local->wind_count--; count = local->wind_count; } UNLOCK (&frame->lock); continue; } - inode = inode_new (local->fd->inode->table); - if (!inode) + local_frame = copy_frame (frame); + + if (!local_frame) { + op_errno = ENOMEM; + op_ret = -1; goto out; + } - loc.ino = inode->ino = local_entry->d_ino; - loc.inode = inode; - loc.parent = 
local->fd->inode; - ret = inode_path (local->fd->inode, local_entry->d_name, &path); - if (ret != -1) { - loc.path = path; - } else if (inode) { - ret = inode_path (inode, NULL, &path); - if (ret != -1) { - loc.path = path; - } else { - goto out; - } + local_ent = mem_get0 (this->local_pool); + if (!local_ent) { + op_errno = ENOMEM; + op_ret = -1; + goto out; } - loc.name = strrchr (loc.path, '/'); - loc.name++; + loc.inode = inode_ref (local_entry->inode); + + uuid_copy (loc.gfid, local_entry->d_stat.ia_gfid); + + local_ent->orig_frame = frame; + + local_ent->call_count = subvols; + + local_ent->dirent = local_entry; + + local_frame->local = local_ent; + trav = this->children; while (trav) { - STACK_WIND_COOKIE (frame, stripe_readdirp_entry_stat_cbk, - local_entry, trav->xlator, - trav->xlator->fops->stat, &loc); + STACK_WIND (local_frame, stripe_readdirp_lookup_cbk, + trav->xlator, trav->xlator->fops->lookup, + &loc, xattrs); trav = trav->next; } - inode_unref (loc.inode); + loc_wipe (&loc); } out: if (!count) { /* all entries are directories */ frame->local = NULL; STRIPE_STACK_UNWIND (readdir, frame, local->op_ret, - local->op_errno, &local->entries); + local->op_errno, &local->entries, NULL); gf_dirent_free (&local->entries); stripe_local_wipe (local); - GF_FREE (local); + mem_put (local); } - + if (xattrs) + dict_unref (xattrs); return 0; } int32_t stripe_readdirp (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t off) + fd_t *fd, size_t size, off_t off, dict_t *xdata) { stripe_local_t *local = NULL; stripe_private_t *priv = NULL; xlator_list_t *trav = NULL; int op_errno = -1; - VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (fd, err); @@ -4260,8 +4990,7 @@ stripe_readdirp (call_frame_t *frame, xlator_t *this, } /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + local = mem_get0 (this->local_pool); if (!local) { op_errno = ENOMEM; goto err; @@ -4281,15 
+5010,16 @@ stripe_readdirp (call_frame_t *frame, xlator_t *this, goto err; STACK_WIND (frame, stripe_readdirp_cbk, trav->xlator, - trav->xlator->fops->readdirp, fd, size, off); + trav->xlator->fops->readdirp, fd, size, off, xdata); return 0; err: op_errno = (op_errno == -1) ? errno : op_errno; - STRIPE_STACK_UNWIND (readdir, frame, -1, op_errno, NULL); + STRIPE_STACK_UNWIND (readdir, frame, -1, op_errno, NULL, NULL); return 0; } + int32_t mem_acct_init (xlator_t *this) { @@ -4310,21 +5040,86 @@ out: return ret; } +static int +clear_pattern_list (stripe_private_t *priv) +{ + struct stripe_options *prev = NULL; + struct stripe_options *trav = NULL; + int ret = -1; + + GF_VALIDATE_OR_GOTO ("stripe", priv, out); + + trav = priv->pattern; + priv->pattern = NULL; + while (trav) { + prev = trav; + trav = trav->next; + GF_FREE (prev); + } + + ret = 0; + out: + return ret; + + +} + int reconfigure (xlator_t *this, dict_t *options) { - stripe_private_t *priv = NULL; - int ret = -1; + stripe_private_t *priv = NULL; + data_t *data = NULL; + int ret = -1; + volume_option_t *opt = NULL; - priv = this->private; + GF_ASSERT (this); + GF_ASSERT (this->private); + + priv = this->private; - GF_OPTION_RECONF ("block-size", priv->block_size, options, size, out); ret = 0; -out: - return ret; + LOCK (&priv->lock); + { + ret = clear_pattern_list (priv); + if (ret) + goto unlock; + + data = dict_get (options, "block-size"); + if (data) { + ret = set_stripe_block_size (this, priv, data->data); + if (ret) + goto unlock; + } else { + opt = xlator_volume_option_get (this, "block-size"); + if (!opt) { + gf_log (this->name, GF_LOG_WARNING, + "option 'block-size' not found"); + ret = -1; + goto unlock; + } + + if (gf_string2bytesize (opt->default_value, &priv->block_size)){ + gf_log (this->name, GF_LOG_ERROR, + "Unable to set default block-size "); + ret = -1; + goto unlock; + } + } + + GF_OPTION_RECONF("coalesce", priv->coalesce, options, bool, + unlock); + } + unlock: + UNLOCK (&priv->lock); + 
if (ret) + goto out; + + ret = 0; + out: + return ret; } @@ -4337,6 +5132,7 @@ int32_t init (xlator_t *this) { stripe_private_t *priv = NULL; + volume_option_t *opt = NULL; xlator_list_t *trav = NULL; data_t *data = NULL; int32_t count = 0; @@ -4380,9 +5176,9 @@ init (xlator_t *this) if (!priv->xl_array) goto out; - priv->state = GF_CALLOC (count, sizeof (int8_t), - gf_stripe_mt_int8_t); - if (!priv->state) + priv->last_event = GF_CALLOC (count, sizeof (int), + gf_stripe_mt_int32_t); + if (!priv->last_event) goto out; priv->child_count = count; @@ -4402,30 +5198,56 @@ init (xlator_t *this) goto out; } - - GF_OPTION_INIT ("block-size", priv->block_size, size, out); - - /* option stripe-pattern *avi:1GB,*pdf:4096 */ - data = dict_get (this->options, "block-size"); - if (data) { - ret = set_stripe_block_size (this, priv, data->data); - if (ret) - goto out; + ret = 0; + LOCK (&priv->lock); + { + opt = xlator_volume_option_get (this, "block-size"); + if (!opt) { + gf_log (this->name, GF_LOG_WARNING, + "option 'block-size' not found"); + ret = -1; + goto unlock; + } + if (gf_string2bytesize (opt->default_value, &priv->block_size)){ + gf_log (this->name, GF_LOG_ERROR, + "Unable to set default block-size "); + ret = -1; + goto unlock; + } + /* option stripe-pattern *avi:1GB,*pdf:16K */ + data = dict_get (this->options, "block-size"); + if (data) { + ret = set_stripe_block_size (this, priv, data->data); + if (ret) + goto unlock; + } } + unlock: + UNLOCK (&priv->lock); + if (ret) + goto out; GF_OPTION_INIT ("use-xattr", priv->xattr_supported, bool, out); - /* notify related */ priv->nodes_down = priv->child_count; - this->private = priv; + GF_OPTION_INIT("coalesce", priv->coalesce, bool, out); + + this->local_pool = mem_pool_new (stripe_local_t, 128); + if (!this->local_pool) { + ret = -1; + gf_log (this->name, GF_LOG_ERROR, + "failed to create local_t's memory pool"); + goto out; + } + + this->private = priv; ret = 0; out: if (ret) { if (priv) { - if (priv->xl_array) - 
GF_FREE (priv->xl_array); + GF_FREE (priv->xl_array); GF_FREE (priv); } } @@ -4449,15 +5271,15 @@ fini (xlator_t *this) priv = this->private; if (priv) { this->private = NULL; - if (priv->xl_array) - GF_FREE (priv->xl_array); + GF_FREE (priv->xl_array); trav = priv->pattern; while (trav) { prev = trav; trav = trav->next; - FREE (prev); + GF_FREE (prev); } + GF_FREE (priv->last_event); LOCK_DESTROY (&priv->lock); GF_FREE (priv); } @@ -4468,17 +5290,50 @@ out: int32_t stripe_getxattr_unwind (call_frame_t *frame, - int op_ret, int op_errno, dict_t *dict) + int op_ret, int op_errno, dict_t *dict, dict_t *xdata) { - STRIPE_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict); + STRIPE_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, xdata); return 0; } +int +stripe_internal_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, + dict_t *xdata) +{ + + char size_key[256] = {0,}; + char index_key[256] = {0,}; + char count_key[256] = {0,}; + char coalesce_key[256] = {0,}; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (frame->local, out); + + if (!xattr || (op_ret == -1)) + goto out; + + sprintf (size_key, "trusted.%s.stripe-size", this->name); + sprintf (count_key, "trusted.%s.stripe-count", this->name); + sprintf (index_key, "trusted.%s.stripe-index", this->name); + sprintf (coalesce_key, "trusted.%s.stripe-coalesce", this->name); + + dict_del (xattr, size_key); + dict_del (xattr, count_key); + dict_del (xattr, index_key); + dict_del (xattr, coalesce_key); + +out: + STRIPE_STACK_UNWIND (getxattr, frame, op_ret, op_errno, xattr, xdata); + + return 0; + +} int stripe_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, dict_t *xattr) + int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) { int call_cnt = 0; stripe_local_t *local = NULL; @@ -4508,92 +5363,39 @@ stripe_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, out: if (!call_cnt) { 
STRIPE_STACK_UNWIND (getxattr, frame, local->op_ret, op_errno, - local->xattr); + local->xattr, xdata); } return 0; } int32_t -stripe_pathinfo_aggregate (char *buffer, stripe_local_t *local, int32_t *total) +stripe_vgetxattr_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) { - int32_t i = 0; - int32_t ret = -1; - int32_t len = 0; - char *sbuf = NULL; - stripe_xattr_sort_t *xattr = NULL; - - if (!buffer || !local || !local->xattr_list) - goto out; - - sbuf = buffer; - - for (i = 0; i < local->nallocs; i++) { - xattr = local->xattr_list + i; - len = xattr->pathinfo_len; - - if (len && xattr && xattr->pathinfo) { - memcpy (buffer, xattr->pathinfo, len); - buffer += len; - *buffer++ = ' '; - } - } - - *--buffer = '\0'; - if (total) - *total = buffer - sbuf; - ret = 0; - - out: - return ret; -} - -int32_t -stripe_free_pathinfo_str (stripe_local_t *local) -{ - int32_t i = 0; - int32_t ret = -1; - stripe_xattr_sort_t *xattr = NULL; - - if (!local || !local->xattr_list) - goto out; - - for (i = 0; i < local->nallocs; i++) { - xattr = local->xattr_list + i; - - if (xattr && xattr->pathinfo) - GF_FREE (xattr->pathinfo); - } - - ret = 0; - out: - return ret; -} - -int32_t -stripe_getxattr_pathinfo_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *dict) { stripe_local_t *local = NULL; int32_t callcnt = 0; int32_t ret = -1; long cky = 0; - char *pathinfo = NULL; - char *pathinfo_serz = NULL; - int32_t padding = 0; - int32_t tlen = 0; - char stripe_size_str[20] = {0,}; + void *xattr_val = NULL; + void *xattr_serz = NULL; stripe_xattr_sort_t *xattr = NULL; dict_t *stripe_xattr = NULL; if (!frame || !frame->local || !this) { - gf_log (this->name, GF_LOG_ERROR, "Possible NULL deref"); + gf_log ("", GF_LOG_ERROR, "Possible NULL deref"); return ret; } local = frame->local; cky = (long) cookie; + if (local->xsel[0] == '\0') { + gf_log (this->name, GF_LOG_ERROR, 
"Empty xattr in cbk"); + return ret; + } + LOCK (&frame->lock); { callcnt = --local->wind_count; @@ -4602,23 +5404,26 @@ stripe_getxattr_pathinfo_cbk (call_frame_t *frame, void *cookie, goto out; if (!local->xattr_list) - local->xattr_list = (stripe_xattr_sort_t *) GF_CALLOC (local->nallocs, - sizeof (stripe_xattr_sort_t), - gf_stripe_mt_xattr_sort_t); + local->xattr_list = (stripe_xattr_sort_t *) + GF_CALLOC (local->nallocs, + sizeof (stripe_xattr_sort_t), + gf_stripe_mt_xattr_sort_t); if (local->xattr_list) { - ret = dict_get_str (dict, GF_XATTR_PATHINFO_KEY, &pathinfo); - if (ret) - goto out; - xattr = local->xattr_list + (int32_t) cky; - pathinfo = gf_strdup (pathinfo); + ret = dict_get_ptr_and_len (dict, local->xsel, + &xattr_val, + &xattr->xattr_len); + if (xattr->xattr_len == 0) + goto out; + xattr->pos = cky; - xattr->pathinfo = pathinfo; - xattr->pathinfo_len = strlen (pathinfo); + xattr->xattr_value = gf_memdup (xattr_val, + xattr->xattr_len); - local->xattr_total_len += strlen (pathinfo) + 1; + if (xattr->xattr_value != NULL) + local->xattr_total_len += xattr->xattr_len + 1; } } out: @@ -4632,41 +5437,36 @@ stripe_getxattr_pathinfo_cbk (call_frame_t *frame, void *cookie, if (!stripe_xattr) goto unwind; - snprintf (stripe_size_str, 20, "%ld", local->stripe_size); - - /* extra bytes for decorations (brackets and <>'s) */ - padding = strlen (this->name) + strlen (STRIPE_PATHINFO_HEADER) - + strlen (stripe_size_str) + 7; - local->xattr_total_len += (padding + 2); - - pathinfo_serz = GF_CALLOC (local->xattr_total_len, sizeof (char), - gf_common_mt_char); - if (!pathinfo_serz) - goto unwind; - - /* xlator info */ - sprintf (pathinfo_serz, "(<"STRIPE_PATHINFO_HEADER"%s:[%s]> ", this->name, stripe_size_str); - - ret = stripe_pathinfo_aggregate (pathinfo_serz + padding, local, &tlen); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Cannot aggregate pathinfo list"); + /* select filler based on ->xsel */ + if (XATTR_IS_PATHINFO (local->xsel)) + ret = 
stripe_fill_pathinfo_xattr (this, local, + (char **)&xattr_serz); + else if (XATTR_IS_LOCKINFO (local->xsel)) { + ret = stripe_fill_lockinfo_xattr (this, local, + &xattr_serz); + } else { + gf_log (this->name, GF_LOG_WARNING, + "Unknown xattr in xattr request"); goto unwind; } - *(pathinfo_serz + padding + tlen) = ')'; - *(pathinfo_serz + padding + tlen + 1) = '\0'; - - ret = dict_set_dynstr (stripe_xattr, GF_XATTR_PATHINFO_KEY, pathinfo_serz); - if (ret) - gf_log (this->name, GF_LOG_ERROR, "Cannot set pathinfo key in dict"); + if (!ret) { + ret = dict_set_dynptr (stripe_xattr, local->xsel, + xattr_serz, + local->xattr_total_len); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "Can't set %s key in dict", + local->xsel); + } unwind: - STRIPE_STACK_UNWIND (getxattr, frame, op_ret, op_errno, stripe_xattr); + STRIPE_STACK_UNWIND (getxattr, frame, op_ret, op_errno, + stripe_xattr, NULL); - ret = stripe_free_pathinfo_str (local); + ret = stripe_free_xattr_str (local); - if (local->xattr_list) - GF_FREE (local->xattr_list); + GF_FREE (local->xattr_list); if (stripe_xattr) dict_unref (stripe_xattr); @@ -4677,14 +5477,15 @@ stripe_getxattr_pathinfo_cbk (call_frame_t *frame, void *cookie, int32_t stripe_getxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name) + loc_t *loc, const char *name, dict_t *xdata) { - stripe_local_t *local = NULL; - xlator_list_t *trav = NULL; - stripe_private_t *priv = NULL; - int32_t op_errno = EINVAL; - int i = 0; - xlator_t **sub_volumes; + stripe_local_t *local = NULL; + xlator_list_t *trav = NULL; + stripe_private_t *priv = NULL; + int32_t op_errno = EINVAL; + int i = 0; + xlator_t **sub_volumes; + int ret = 0; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); @@ -4696,8 +5497,7 @@ stripe_getxattr (call_frame_t *frame, xlator_t *this, trav = this->children; /* Initialization */ - local = GF_CALLOC (1, sizeof (stripe_local_t), - gf_stripe_mt_stripe_local_t); + local = mem_get0 (this->local_pool); if (!local) { 
op_errno = ENOMEM; goto err; @@ -4708,7 +5508,7 @@ stripe_getxattr (call_frame_t *frame, xlator_t *this, if (name && (strcmp (GF_XATTR_MARKER_KEY, name) == 0) - && (-1 == frame->root->pid)) { + && (GF_CLIENT_PID_GSYNCD == frame->root->pid)) { local->marker.call_count = priv->child_count; sub_volumes = alloca ( priv->child_count * @@ -4723,7 +5523,8 @@ stripe_getxattr (call_frame_t *frame, xlator_t *this, if (cluster_getmarkerattr (frame, this, loc, name, local, stripe_getxattr_unwind, sub_volumes, priv->child_count, - MARKER_UUID_TYPE, priv->vol_uuid)) { + MARKER_UUID_TYPE, marker_uuid_default_gauge, + priv->vol_uuid)) { op_errno = EINVAL; goto err; } @@ -4739,25 +5540,39 @@ stripe_getxattr (call_frame_t *frame, xlator_t *this, trav = trav->next) { STACK_WIND (frame, stripe_getxattr_cbk, trav->xlator, trav->xlator->fops->getxattr, - loc, name); + loc, name, xdata); } return 0; } - if (name && (strncmp (name, GF_XATTR_PATHINFO_KEY, - strlen (GF_XATTR_PATHINFO_KEY)) == 0)) { - local->stripe_size = stripe_get_matching_bs (loc->path, - priv->pattern, - priv->block_size); + if (name && + ((strncmp (name, GF_XATTR_PATHINFO_KEY, + strlen (GF_XATTR_PATHINFO_KEY)) == 0))) { + if (IA_ISREG (loc->inode->ia_type)) { + ret = inode_ctx_get (loc->inode, this, + (uint64_t *) &local->fctx); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "stripe size unavailable from fctx" + " relying on pathinfo could lead to" + " wrong results"); + } + local->nallocs = local->wind_count = priv->child_count; + (void) strncpy (local->xsel, name, strlen (name)); + /** + * for xattrs that need info from all childs, fill ->xsel + * as above and call the filler function in cbk based on + * it + */ for (i = 0, trav = this->children; i < priv->child_count; i++, trav = trav->next) { - STACK_WIND_COOKIE (frame, stripe_getxattr_pathinfo_cbk, + STACK_WIND_COOKIE (frame, stripe_vgetxattr_cbk, (void *) (long) i, trav->xlator, trav->xlator->fops->getxattr, - loc, name); + loc, name, xdata); } return 0; @@ 
-4765,42 +5580,125 @@ stripe_getxattr (call_frame_t *frame, xlator_t *this, if (name &&(*priv->vol_uuid)) { if ((match_uuid_local (name, priv->vol_uuid) == 0) - && (-1 == frame->root->pid)) { - local->marker.call_count = priv->child_count; + && (GF_CLIENT_PID_GSYNCD == frame->root->pid)) { + + if (!IA_FILE_OR_DIR (loc->inode->ia_type)) + local->marker.call_count = 1; + else + local->marker.call_count = priv->child_count; - sub_volumes = alloca ( priv->child_count * - sizeof (xlator_t *)); - for (i = 0, trav = this->children; trav ; - trav = trav->next, i++) { + sub_volumes = alloca (local->marker.call_count * + sizeof (xlator_t *)); + for (i = 0, trav = this->children; + i < local->marker.call_count; + i++, trav = trav->next) { *(sub_volumes + i) = trav->xlator; } if (cluster_getmarkerattr (frame, this, loc, name, - local, stripe_getxattr_unwind, + local, + stripe_getxattr_unwind, sub_volumes, - priv->child_count, + local->marker.call_count, MARKER_XTIME_TYPE, + marker_xtime_default_gauge, priv->vol_uuid)) { op_errno = EINVAL; goto err; } + return 0; } } - STACK_WIND (frame, default_getxattr_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->getxattr, loc, name); + STACK_WIND (frame, stripe_internal_getxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getxattr, loc, name, xdata); + + return 0; + +err: + STRIPE_STACK_UNWIND (getxattr, frame, -1, op_errno, NULL, NULL); + return 0; +} + +inline gf_boolean_t +stripe_is_special_xattr (const char *name) +{ + gf_boolean_t is_spl = _gf_false; + + if (!name) { + goto out; + } + + if (!strncmp (name, GF_XATTR_LOCKINFO_KEY, + strlen (GF_XATTR_LOCKINFO_KEY)) + || !strncmp (name, GF_XATTR_PATHINFO_KEY, + strlen (GF_XATTR_PATHINFO_KEY))) + is_spl = _gf_true; +out: + return is_spl; +} + +int32_t +stripe_fgetxattr_from_everyone (call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata) +{ + stripe_local_t *local = NULL; + stripe_private_t *priv = NULL; + int32_t ret = -1, op_errno = 0; + int i = 0; + 
xlator_list_t *trav = NULL; + + priv = this->private; + + local = mem_get0 (this->local_pool); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->op_ret = -1; + frame->local = local; + + strncpy (local->xsel, name, strlen (name)); + local->nallocs = local->wind_count = priv->child_count; + + for (i = 0, trav = this->children; i < priv->child_count; i++, + trav = trav->next) { + STACK_WIND_COOKIE (frame, stripe_vgetxattr_cbk, + (void *) (long) i, trav->xlator, + trav->xlator->fops->fgetxattr, + fd, name, xdata); + } return 0; err: - STRIPE_STACK_UNWIND (getxattr, frame, -1, op_errno, NULL); + STACK_UNWIND_STRICT (fgetxattr, frame, -1, op_errno, NULL, NULL); + return ret; +} + +int32_t +stripe_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata) +{ + if (stripe_is_special_xattr (name)) { + stripe_fgetxattr_from_everyone (frame, this, fd, name, xdata); + goto out; + } + + STACK_WIND (frame, stripe_internal_getxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fgetxattr, fd, name, xdata); + +out: return 0; } + + int32_t stripe_priv_dump (xlator_t *this) { @@ -4849,36 +5747,44 @@ out: } struct xlator_fops fops = { - .stat = stripe_stat, - .unlink = stripe_unlink, - .rename = stripe_rename, - .link = stripe_link, - .truncate = stripe_truncate, - .create = stripe_create, - .open = stripe_open, - .readv = stripe_readv, - .writev = stripe_writev, - .statfs = stripe_statfs, - .flush = stripe_flush, - .fsync = stripe_fsync, - .ftruncate = stripe_ftruncate, - .fstat = stripe_fstat, - .mkdir = stripe_mkdir, - .rmdir = stripe_rmdir, - .lk = stripe_lk, - .opendir = stripe_opendir, - .fsyncdir = stripe_fsyncdir, - .setattr = stripe_setattr, - .fsetattr = stripe_fsetattr, - .lookup = stripe_lookup, - .mknod = stripe_mknod, - - .getxattr = stripe_getxattr, - .readdirp = stripe_readdirp, + .stat = stripe_stat, + .unlink = stripe_unlink, + .rename = stripe_rename, + .link = stripe_link, + .truncate = stripe_truncate, + .create 
= stripe_create, + .open = stripe_open, + .readv = stripe_readv, + .writev = stripe_writev, + .statfs = stripe_statfs, + .flush = stripe_flush, + .fsync = stripe_fsync, + .ftruncate = stripe_ftruncate, + .fstat = stripe_fstat, + .mkdir = stripe_mkdir, + .rmdir = stripe_rmdir, + .lk = stripe_lk, + .opendir = stripe_opendir, + .fsyncdir = stripe_fsyncdir, + .setattr = stripe_setattr, + .fsetattr = stripe_fsetattr, + .lookup = stripe_lookup, + .mknod = stripe_mknod, + .setxattr = stripe_setxattr, + .fsetxattr = stripe_fsetxattr, + .getxattr = stripe_getxattr, + .fgetxattr = stripe_fgetxattr, + .removexattr = stripe_removexattr, + .fremovexattr = stripe_fremovexattr, + .readdirp = stripe_readdirp, + .fallocate = stripe_fallocate, + .discard = stripe_discard, + .zerofill = stripe_zerofill, }; struct xlator_cbks cbks = { .release = stripe_release, + .forget = stripe_forget, }; struct xlator_dumpops dumpops = { @@ -4887,8 +5793,9 @@ struct xlator_dumpops dumpops = { struct volume_options options[] = { { .key = {"block-size"}, - .type = GF_OPTION_TYPE_ANY, + .type = GF_OPTION_TYPE_SIZE_LIST, .default_value = "128KB", + .min = STRIPE_MIN_BLOCK_SIZE, .description = "Size of the stripe unit that would be read " "from or written to the striped servers." }, @@ -4896,5 +5803,12 @@ struct volume_options options[] = { .type = GF_OPTION_TYPE_BOOL, .default_value = "true" }, + { .key = {"coalesce"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "true", + .description = "Enable/Disable coalesce mode to flatten striped " + "files as stored on the server (i.e., eliminate holes " + "caused by the traditional format)." + }, { .key = {NULL} }, }; diff --git a/xlators/cluster/stripe/src/stripe.h b/xlators/cluster/stripe/src/stripe.h index 3ab67d621..5673d18f3 100644 --- a/xlators/cluster/stripe/src/stripe.h +++ b/xlators/cluster/stripe/src/stripe.h @@ -1,20 +1,11 @@ /* - Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. 
<http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ @@ -38,36 +29,53 @@ #include <signal.h> #define STRIPE_PATHINFO_HEADER "STRIPE:" - +#define STRIPE_MIN_BLOCK_SIZE (16*GF_UNIT_KB) #define STRIPE_STACK_UNWIND(fop, frame, params ...) 
do { \ stripe_local_t *__local = NULL; \ - if (frame) { \ - __local = frame->local; \ - frame->local = NULL; \ - } \ - STACK_UNWIND_STRICT (fop, frame, params); \ + if (frame) { \ + __local = frame->local; \ + frame->local = NULL; \ + } \ + STACK_UNWIND_STRICT (fop, frame, params); \ + if (__local) { \ + stripe_local_wipe(__local); \ + mem_put (__local); \ + } \ + } while (0) + +#define STRIPE_STACK_DESTROY(frame) do { \ + stripe_local_t *__local = NULL; \ + __local = frame->local; \ + frame->local = NULL; \ + STACK_DESTROY (frame->root); \ if (__local) { \ - stripe_local_wipe(__local); \ - GF_FREE (__local); \ + stripe_local_wipe (__local); \ + mem_put (__local); \ } \ } while (0) -#define STRIPE_STACK_DESTROY(frame) do { \ - stripe_local_t *__local = NULL; \ - __local = frame->local; \ - frame->local = NULL; \ - STACK_DESTROY (frame->root); \ - if (__local) { \ - stripe_local_wipe (__local); \ - GF_FREE (__local); \ - } \ - } while (0) +#define STRIPE_VALIDATE_FCTX(fctx, label) do { \ + int idx = 0; \ + if (!fctx) { \ + op_errno = EINVAL; \ + goto label; \ + } \ + for (idx = 0; idx < fctx->stripe_count; idx++) { \ + if (!fctx->xl_array[idx]) { \ + gf_log (this->name, GF_LOG_ERROR, \ + "fctx->xl_array[%d] is NULL", \ + idx); \ + op_errno = ESTALE; \ + goto label; \ + } \ + } \ + } while (0) typedef struct stripe_xattr_sort { - int32_t pos; - int32_t pathinfo_len; - char *pathinfo; + int pos; + int xattr_len; + char *xattr_value; } stripe_xattr_sort_t; /** @@ -90,16 +98,17 @@ struct stripe_private { gf_lock_t lock; uint8_t nodes_down; int8_t first_child_down; + int *last_event; int8_t child_count; - int8_t *state; /* Current state of child node */ gf_boolean_t xattr_supported; /* default yes */ + gf_boolean_t coalesce; char vol_uuid[UUID_SIZE + 1]; }; /** - * Used to keep info about the replies received from fops->readv calls + * Used to keep info about the replies received from readv/writev calls */ -struct readv_replies { +struct stripe_replies { struct iovec 
*vector; int32_t count; //count of vector int32_t op_ret; //op_ret of readv @@ -111,6 +120,7 @@ struct readv_replies { typedef struct _stripe_fd_ctx { off_t stripe_size; int stripe_count; + int stripe_coalesce; int static_array; xlator_t **xl_array; } stripe_fd_ctx_t; @@ -146,7 +156,7 @@ struct stripe_local { blkcnt_t preparent_blocks; blkcnt_t postparent_blocks; - struct readv_replies *replies; + struct stripe_replies *replies; struct statvfs statvfs_buf; dir_entry_t *entry; @@ -173,11 +183,12 @@ struct stripe_local { mode_t mode; dev_t rdev; /* For File I/O fops */ - dict_t *dict; + dict_t *xdata; stripe_xattr_sort_t *xattr_list; int32_t xattr_total_len; int32_t nallocs; + char xsel[256]; struct marker_str marker; @@ -194,12 +205,84 @@ struct stripe_local { void *value; struct iobref *iobref; gf_dirent_t entries; + gf_dirent_t *dirent; dict_t *xattr; uuid_t ia_gfid; + + int xflag; + mode_t umask; }; typedef struct stripe_local stripe_local_t; typedef struct stripe_private stripe_private_t; +/* + * Determine the stripe index of a particular frame based on the translator. 
+ */ +static inline int32_t stripe_get_frame_index(stripe_fd_ctx_t *fctx, + call_frame_t *prev) +{ + int32_t i, idx = -1; + + for (i = 0; i < fctx->stripe_count; i++) { + if (fctx->xl_array[i] == prev->this) { + idx = i; + break; + } + } + + return idx; +} + +static inline void stripe_copy_xl_array(xlator_t **dst, xlator_t **src, + int count) +{ + int i; + + for (i = 0; i < count; i++) + dst[i] = src[i]; +} + +void stripe_local_wipe (stripe_local_t *local); +int32_t stripe_ctx_handle (xlator_t *this, call_frame_t *prev, + stripe_local_t *local, dict_t *dict); +void stripe_aggregate_xattr (dict_t *dst, dict_t *src); +int32_t stripe_xattr_request_build (xlator_t *this, dict_t *dict, + uint64_t stripe_size, uint32_t stripe_count, + uint32_t stripe_index, + uint32_t stripe_coalesce); +int32_t stripe_get_matching_bs (const char *path, stripe_private_t *priv); +int set_stripe_block_size (xlator_t *this, stripe_private_t *priv, char *data); +int32_t stripe_iatt_merge (struct iatt *from, struct iatt *to); +int32_t stripe_fill_pathinfo_xattr (xlator_t *this, stripe_local_t *local, + char **xattr_serz); +int32_t stripe_free_xattr_str (stripe_local_t *local); +int32_t stripe_xattr_aggregate (char *buffer, stripe_local_t *local, + int32_t *total); +off_t coalesced_offset(off_t offset, uint64_t stripe_size, int stripe_count); +off_t uncoalesced_size(off_t size, uint64_t stripe_size, int stripe_count, + int stripe_index); +int32_t +stripe_fill_lockinfo_xattr (xlator_t *this, stripe_local_t *local, + void **xattr_serz); + +/* + * Adjust the size attribute for files if coalesce is enabled. 
+ */ +static inline void correct_file_size(struct iatt *buf, stripe_fd_ctx_t *fctx, + call_frame_t *prev) +{ + int index; + + if (!IA_ISREG(buf->ia_type)) + return; + + if (!fctx || !fctx->stripe_coalesce) + return; + + index = stripe_get_frame_index(fctx, prev); + buf->ia_size = uncoalesced_size(buf->ia_size, fctx->stripe_size, + fctx->stripe_count, index); +} #endif /* _STRIPE_H_ */ diff --git a/xlators/cluster/unify/Makefile.am b/xlators/cluster/unify/Makefile.am deleted file mode 100644 index d471a3f92..000000000 --- a/xlators/cluster/unify/Makefile.am +++ /dev/null @@ -1,3 +0,0 @@ -SUBDIRS = src - -CLEANFILES = diff --git a/xlators/cluster/unify/src/Makefile.am b/xlators/cluster/unify/src/Makefile.am deleted file mode 100644 index 2a1fe8372..000000000 --- a/xlators/cluster/unify/src/Makefile.am +++ /dev/null @@ -1,16 +0,0 @@ - -xlator_LTLIBRARIES = unify.la -xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/legacy/cluster - -unify_la_LDFLAGS = -module -avoidversion - -unify_la_SOURCES = unify.c unify-self-heal.c -unify_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la - -noinst_HEADERS = unify.h - -AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ - -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) - -CLEANFILES = - diff --git a/xlators/cluster/unify/src/unify-mem-types.h b/xlators/cluster/unify/src/unify-mem-types.h deleted file mode 100644 index 13c9cc1f7..000000000 --- a/xlators/cluster/unify/src/unify-mem-types.h +++ /dev/null @@ -1,41 +0,0 @@ - -/* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. 
- - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ - - -#ifndef __UNIFY_MEM_TYPES_H__ -#define __UNIFY_MEM_TYPES_H__ - -#include "mem-types.h" - -enum gf_unify_mem_types_ { - gf_unify_mt_char = gf_common_mt_end + 1, - gf_unify_mt_int16_t, - gf_unify_mt_xlator_t, - gf_unify_mt_unify_private_t, - gf_unify_mt_xlator_list_t, - gf_unify_mt_dir_entry_t, - gf_unify_mt_off_t, - gf_unify_mt_int, - gf_unify_mt_unify_self_heal_struct, - gf_unify_mt_unify_local_t, - gf_unify_mt_end -}; -#endif - diff --git a/xlators/cluster/unify/src/unify-self-heal.c b/xlators/cluster/unify/src/unify-self-heal.c deleted file mode 100644 index f99e4c7c3..000000000 --- a/xlators/cluster/unify/src/unify-self-heal.c +++ /dev/null @@ -1,1239 +0,0 @@ -/* - Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. 
-*/ - -/** - * unify-self-heal.c : - * This file implements few functions which enables 'unify' translator - * to be consistent in its behaviour when - * > a node fails, - * > a node gets added, - * > a failed node comes back - * > a new namespace server is added (ie, an fresh namespace server). - * - * This functionality of 'unify' will enable glusterfs to support storage - * system failure, and maintain consistancy. This works both ways, ie, when - * an entry (either file or directory) is found on namespace server, and not - * on storage nodes, its created in storage nodes and vica-versa. - * - * The two fops, where it can be implemented are 'getdents ()' and 'lookup ()' - * - */ - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "unify.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "common-utils.h" - -int32_t -unify_sh_getdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dir_entry_t *entry, - int32_t count); - -int32_t -unify_sh_ns_getdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dir_entry_t *entry, - int32_t count); - -int32_t -unify_bgsh_getdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dir_entry_t *entry, - int32_t count); - -int32_t -unify_bgsh_ns_getdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dir_entry_t *entry, - int32_t count); - -/** - * unify_local_wipe - free all the extra allocation of local->* here. 
- */ -static void -unify_local_wipe (unify_local_t *local) -{ - /* Free the strdup'd variables in the local structure */ - if (local->name) { - GF_FREE (local->name); - } - - if (local->sh_struct) { - if (local->sh_struct->offset_list) - GF_FREE (local->sh_struct->offset_list); - - if (local->sh_struct->entry_list) - GF_FREE (local->sh_struct->entry_list); - - if (local->sh_struct->count_list) - GF_FREE (local->sh_struct->count_list); - - GF_FREE (local->sh_struct); - } - - loc_wipe (&local->loc1); - loc_wipe (&local->loc2); -} - -int32_t -unify_sh_setdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - int32_t callcnt = -1; - unify_local_t *local = frame->local; - inode_t *inode = NULL; - dict_t *tmp_dict = NULL; - dir_entry_t *prev, *entry, *trav; - - LOCK (&frame->lock); - { - /* if local->call_count == 0, that means, setdents on - * storagenodes is still pending. - */ - if (local->call_count) - callcnt = --local->call_count; - } - UNLOCK (&frame->lock); - - if (callcnt == 0) { - if (local->sh_struct->entry_list[0]) { - prev = entry = local->sh_struct->entry_list[0]; - if (!entry) - return 0; - trav = entry->next; - while (trav) { - prev->next = trav->next; - GF_FREE (trav->name); - if (IA_ISLNK (trav->buf.ia_type)) - GF_FREE (trav->link); - GF_FREE (trav); - trav = prev->next; - } - GF_FREE (entry); - } - - if (!local->flags) { - if (local->sh_struct->count_list[0] >= - UNIFY_SELF_HEAL_GETDENTS_COUNT) { - /* count == size, that means, there are more entries - to read from */ - //local->call_count = 0; - local->sh_struct->offset_list[0] += - UNIFY_SELF_HEAL_GETDENTS_COUNT; - STACK_WIND (frame, - unify_sh_ns_getdents_cbk, - NS(this), - NS(this)->fops->getdents, - local->fd, - UNIFY_SELF_HEAL_GETDENTS_COUNT, - local->sh_struct->offset_list[0], - GF_GET_DIR_ONLY); - } - } else { - inode = local->loc1.inode; - fd_unref (local->fd); - tmp_dict = local->dict; - - unify_local_wipe (local); - - STACK_UNWIND (frame, 
local->op_ret, local->op_errno, - inode, &local->stbuf, local->dict, - &local->oldpostparent); - if (tmp_dict) - dict_unref (tmp_dict); - } - } - - return 0; -} - - -int32_t -unify_sh_ns_getdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dir_entry_t *entry, - int32_t count) -{ - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - long index = 0; - unsigned long final = 0; - dir_entry_t *tmp = GF_CALLOC (1, sizeof (dir_entry_t), - gf_unify_mt_dir_entry_t); - - local->sh_struct->entry_list[0] = tmp; - local->sh_struct->count_list[0] = count; - if (entry) { - tmp->next = entry->next; - entry->next = NULL; - } - - if ((count < UNIFY_SELF_HEAL_GETDENTS_COUNT) || !entry) { - final = 1; - } - - LOCK (&frame->lock); - { - /* local->call_count will be '0' till now. make it 1 so, it - can be UNWIND'ed for the last call. */ - local->call_count = priv->child_count; - if (final) - local->flags = 1; - } - UNLOCK (&frame->lock); - - for (index = 0; index < priv->child_count; index++) - { - STACK_WIND_COOKIE (frame, - unify_sh_setdents_cbk, - (void *)index, - priv->xl_array[index], - priv->xl_array[index]->fops->setdents, - local->fd, GF_SET_DIR_ONLY, - local->sh_struct->entry_list[0], count); - } - - return 0; -} - -int32_t -unify_sh_ns_setdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - int32_t callcnt = -1; - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - long index = (long)cookie; - dir_entry_t *prev, *entry, *trav; - - LOCK (&frame->lock); - { - if (local->sh_struct->entry_list[index]) { - prev = entry = local->sh_struct->entry_list[index]; - trav = entry->next; - while (trav) { - prev->next = trav->next; - GF_FREE (trav->name); - if (IA_ISLNK (trav->buf.ia_type)) - GF_FREE (trav->link); - GF_FREE (trav); - trav = prev->next; - } - GF_FREE (entry); - } - } - UNLOCK (&frame->lock); - - if 
(local->sh_struct->count_list[index] < - UNIFY_SELF_HEAL_GETDENTS_COUNT) { - LOCK (&frame->lock); - { - callcnt = --local->call_count; - } - UNLOCK (&frame->lock); - } else { - /* count == size, that means, there are more entries - to read from */ - local->sh_struct->offset_list[index] += - UNIFY_SELF_HEAL_GETDENTS_COUNT; - STACK_WIND_COOKIE (frame, - unify_sh_getdents_cbk, - cookie, - priv->xl_array[index], - priv->xl_array[index]->fops->getdents, - local->fd, - UNIFY_SELF_HEAL_GETDENTS_COUNT, - local->sh_struct->offset_list[index], - GF_GET_ALL); - - gf_log (this->name, GF_LOG_DEBUG, - "readdir on (%s) with offset %"PRId64"", - priv->xl_array[index]->name, - local->sh_struct->offset_list[index]); - } - - if (!callcnt) { - /* All storage nodes have done unified setdents on NS node. - * Now, do getdents from NS and do setdents on storage nodes. - */ - - /* sh_struct->offset_list is no longer required for - storage nodes now */ - local->sh_struct->offset_list[0] = 0; /* reset */ - - STACK_WIND (frame, - unify_sh_ns_getdents_cbk, - NS(this), - NS(this)->fops->getdents, - local->fd, - UNIFY_SELF_HEAL_GETDENTS_COUNT, - 0, /* In this call, do send '0' as offset */ - GF_GET_DIR_ONLY); - } - - return 0; -} - - -/** - * unify_sh_getdents_cbk - - */ -int32_t -unify_sh_getdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dir_entry_t *entry, - int32_t count) -{ - int32_t callcnt = -1; - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - long index = (long)cookie; - dir_entry_t *tmp = NULL; - - if (op_ret >= 0 && count > 0) { - /* There is some dentry found, just send the dentry to NS */ - tmp = GF_CALLOC (1, sizeof (dir_entry_t), - gf_unify_mt_dir_entry_t); - local->sh_struct->entry_list[index] = tmp; - local->sh_struct->count_list[index] = count; - if (entry) { - tmp->next = entry->next; - entry->next = NULL; - } - STACK_WIND_COOKIE (frame, - unify_sh_ns_setdents_cbk, - cookie, - NS(this), - 
NS(this)->fops->setdents, - local->fd, - GF_SET_IF_NOT_PRESENT, - local->sh_struct->entry_list[index], - count); - return 0; - } - - if (count < UNIFY_SELF_HEAL_GETDENTS_COUNT) { - LOCK (&frame->lock); - { - callcnt = --local->call_count; - } - UNLOCK (&frame->lock); - } else { - /* count == size, that means, there are more entries - to read from */ - local->sh_struct->offset_list[index] += - UNIFY_SELF_HEAL_GETDENTS_COUNT; - STACK_WIND_COOKIE (frame, - unify_sh_getdents_cbk, - cookie, - priv->xl_array[index], - priv->xl_array[index]->fops->getdents, - local->fd, - UNIFY_SELF_HEAL_GETDENTS_COUNT, - local->sh_struct->offset_list[index], - GF_GET_ALL); - - gf_log (this->name, GF_LOG_DEBUG, - "readdir on (%s) with offset %"PRId64"", - priv->xl_array[index]->name, - local->sh_struct->offset_list[index]); - } - - if (!callcnt) { - /* All storage nodes have done unified setdents on NS node. - * Now, do getdents from NS and do setdents on storage nodes. - */ - - /* sh_struct->offset_list is no longer required for - storage nodes now */ - local->sh_struct->offset_list[0] = 0; /* reset */ - - STACK_WIND (frame, - unify_sh_ns_getdents_cbk, - NS(this), - NS(this)->fops->getdents, - local->fd, - UNIFY_SELF_HEAL_GETDENTS_COUNT, - 0, /* In this call, do send '0' as offset */ - GF_GET_DIR_ONLY); - } - - return 0; -} - -/** - * unify_sh_opendir_cbk - - * - * @cookie: - */ -int32_t -unify_sh_opendir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - fd_t *fd) -{ - int32_t callcnt = 0; - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - int16_t index = 0; - inode_t *inode = NULL; - dict_t *tmp_dict = NULL; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret >= 0) { - local->op_ret = op_ret; - } else { - gf_log (this->name, GF_LOG_WARNING, "failed"); - local->failed = 1; - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - local->call_count = priv->child_count + 1; - - if 
(!local->failed) { - /* send getdents() namespace after finishing - storage nodes */ - local->call_count--; - - fd_bind (fd); - - if (local->call_count) { - /* Used as the offset index. This list keeps - * track of offset sent to each node during - * STACK_WIND. - */ - local->sh_struct->offset_list = - GF_CALLOC (priv->child_count, - sizeof (off_t), - gf_unify_mt_off_t); - ERR_ABORT (local->sh_struct->offset_list); - - local->sh_struct->entry_list = - GF_CALLOC (priv->child_count, - sizeof (dir_entry_t *), - gf_unify_mt_dir_entry_t); - ERR_ABORT (local->sh_struct->entry_list); - - local->sh_struct->count_list = - GF_CALLOC (priv->child_count, - sizeof (int), - gf_unify_mt_int); - ERR_ABORT (local->sh_struct->count_list); - - /* Send getdents on all the fds */ - for (index = 0; - index < priv->child_count; index++) { - STACK_WIND_COOKIE (frame, - unify_sh_getdents_cbk, - (void *)(long)index, - priv->xl_array[index], - priv->xl_array[index]->fops->getdents, - local->fd, - UNIFY_SELF_HEAL_GETDENTS_COUNT, - 0, /* In this call, do send '0' as offset */ - GF_GET_ALL); - } - - /* did stack wind, so no need to unwind here */ - return 0; - } /* (local->call_count) */ - } /* (!local->failed) */ - - /* Opendir failed on one node. */ - inode = local->loc1.inode; - fd_unref (local->fd); - tmp_dict = local->dict; - - unify_local_wipe (local); - /* Only 'self-heal' failed, lookup() was successful. */ - local->op_ret = 0; - - /* This is lookup_cbk ()'s UNWIND. */ - STACK_UNWIND (frame, local->op_ret, local->op_errno, inode, - &local->stbuf, local->dict, &local->oldpostparent); - if (tmp_dict) - dict_unref (tmp_dict); - } - - return 0; -} - -/** - * gf_sh_checksum_cbk - - * - * @frame: frame used in lookup. get a copy of it, and use that copy. - * @this: pointer to unify xlator. - * @inode: pointer to inode, for which the consistency check is required. 
- * - */ -int32_t -unify_sh_checksum_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - uint8_t *file_checksum, - uint8_t *dir_checksum) -{ - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - int16_t index = 0; - int32_t callcnt = 0; - inode_t *inode = NULL; - dict_t *tmp_dict = NULL; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - if (op_ret >= 0) { - if (NS(this) == (xlator_t *)cookie) { - memcpy (local->sh_struct->ns_file_checksum, - file_checksum, NAME_MAX); - memcpy (local->sh_struct->ns_dir_checksum, - dir_checksum, NAME_MAX); - } else { - if (local->entry_count == 0) { - /* Initialize the dir_checksum to be - * used for comparision with other - * storage nodes. Should be done for - * the first successful call *only*. - */ - /* Using 'entry_count' as a flag */ - local->entry_count = 1; - memcpy (local->sh_struct->dir_checksum, - dir_checksum, NAME_MAX); - } - - /* Reply from the storage nodes */ - for (index = 0; - index < NAME_MAX; index++) { - /* Files should be present in - only one node */ - local->sh_struct->file_checksum[index] ^= file_checksum[index]; - - /* directory structure should be - same accross */ - if (local->sh_struct->dir_checksum[index] != dir_checksum[index]) - local->failed = 1; - } - } - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - for (index = 0; index < NAME_MAX ; index++) { - if (local->sh_struct->file_checksum[index] != - local->sh_struct->ns_file_checksum[index]) { - local->failed = 1; - break; - } - if (local->sh_struct->dir_checksum[index] != - local->sh_struct->ns_dir_checksum[index]) { - local->failed = 1; - break; - } - } - - if (local->failed) { - /* Log it, it should be a rare event */ - gf_log (this->name, GF_LOG_WARNING, - "Self-heal triggered on directory %s", - local->loc1.path); - - /* Any self heal will be done at directory level */ - local->call_count = 0; - local->op_ret = -1; - local->failed = 0; - - local->fd = 
fd_create (local->loc1.inode, - frame->root->pid); - - local->call_count = priv->child_count + 1; - - for (index = 0; - index < (priv->child_count + 1); index++) { - STACK_WIND_COOKIE (frame, - unify_sh_opendir_cbk, - priv->xl_array[index]->name, - priv->xl_array[index], - priv->xl_array[index]->fops->opendir, - &local->loc1, - local->fd); - } - /* opendir can be done on the directory */ - return 0; - } - - /* no mismatch */ - inode = local->loc1.inode; - tmp_dict = local->dict; - - unify_local_wipe (local); - - /* This is lookup_cbk ()'s UNWIND. */ - STACK_UNWIND (frame, - local->op_ret, - local->op_errno, - inode, - &local->stbuf, - local->dict, &local->oldpostparent); - if (tmp_dict) - dict_unref (tmp_dict); - } - - return 0; -} - -/* Foreground self-heal part over */ - -/* Background self-heal part */ - -int32_t -unify_bgsh_setdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - int32_t callcnt = -1; - unify_local_t *local = frame->local; - dir_entry_t *prev, *entry, *trav; - - LOCK (&frame->lock); - { - /* if local->call_count == 0, that means, setdents - on storagenodes is still pending. 
*/ - if (local->call_count) - callcnt = --local->call_count; - } - UNLOCK (&frame->lock); - - - if (callcnt == 0) { - if (local->sh_struct->entry_list[0]) { - prev = entry = local->sh_struct->entry_list[0]; - trav = entry->next; - while (trav) { - prev->next = trav->next; - GF_FREE (trav->name); - if (IA_ISLNK (trav->buf.ia_type)) - GF_FREE (trav->link); - GF_FREE (trav); - trav = prev->next; - } - GF_FREE (entry); - } - - if (!local->flags) { - if (local->sh_struct->count_list[0] >= - UNIFY_SELF_HEAL_GETDENTS_COUNT) { - /* count == size, that means, there are more - entries to read from */ - //local->call_count = 0; - local->sh_struct->offset_list[0] += - UNIFY_SELF_HEAL_GETDENTS_COUNT; - STACK_WIND (frame, - unify_bgsh_ns_getdents_cbk, - NS(this), - NS(this)->fops->getdents, - local->fd, - UNIFY_SELF_HEAL_GETDENTS_COUNT, - local->sh_struct->offset_list[0], - GF_GET_DIR_ONLY); - } - } else { - fd_unref (local->fd); - unify_local_wipe (local); - STACK_DESTROY (frame->root); - } - } - - return 0; -} - - -int32_t -unify_bgsh_ns_getdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dir_entry_t *entry, - int32_t count) -{ - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - long index = 0; - unsigned long final = 0; - dir_entry_t *tmp = GF_CALLOC (1, sizeof (dir_entry_t), - gf_unify_mt_dir_entry_t); - - local->sh_struct->entry_list[0] = tmp; - local->sh_struct->count_list[0] = count; - if (entry) { - tmp->next = entry->next; - entry->next = NULL; - } - - if ((count < UNIFY_SELF_HEAL_GETDENTS_COUNT) || !entry) { - final = 1; - } - - LOCK (&frame->lock); - { - /* local->call_count will be '0' till now. make it 1 so, - it can be UNWIND'ed for the last call. 
*/ - local->call_count = priv->child_count; - if (final) - local->flags = 1; - } - UNLOCK (&frame->lock); - - for (index = 0; index < priv->child_count; index++) - { - STACK_WIND_COOKIE (frame, - unify_bgsh_setdents_cbk, - (void *)index, - priv->xl_array[index], - priv->xl_array[index]->fops->setdents, - local->fd, GF_SET_DIR_ONLY, - local->sh_struct->entry_list[0], count); - } - - return 0; -} - -int32_t -unify_bgsh_ns_setdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - int32_t callcnt = -1; - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - long index = (long)cookie; - dir_entry_t *prev, *entry, *trav; - - if (local->sh_struct->entry_list[index]) { - prev = entry = local->sh_struct->entry_list[index]; - if (!entry) - return 0; - trav = entry->next; - while (trav) { - prev->next = trav->next; - GF_FREE (trav->name); - if (IA_ISLNK (trav->buf.ia_type)) - GF_FREE (trav->link); - GF_FREE (trav); - trav = prev->next; - } - GF_FREE (entry); - } - - if (local->sh_struct->count_list[index] < - UNIFY_SELF_HEAL_GETDENTS_COUNT) { - LOCK (&frame->lock); - { - callcnt = --local->call_count; - } - UNLOCK (&frame->lock); - } else { - /* count == size, that means, there are more entries - to read from */ - local->sh_struct->offset_list[index] += - UNIFY_SELF_HEAL_GETDENTS_COUNT; - STACK_WIND_COOKIE (frame, - unify_bgsh_getdents_cbk, - cookie, - priv->xl_array[index], - priv->xl_array[index]->fops->getdents, - local->fd, - UNIFY_SELF_HEAL_GETDENTS_COUNT, - local->sh_struct->offset_list[index], - GF_GET_ALL); - - gf_log (this->name, GF_LOG_DEBUG, - "readdir on (%s) with offset %"PRId64"", - priv->xl_array[index]->name, - local->sh_struct->offset_list[index]); - } - - if (!callcnt) { - /* All storage nodes have done unified setdents on NS node. - * Now, do getdents from NS and do setdents on storage nodes. 
- */ - - /* sh_struct->offset_list is no longer required for - storage nodes now */ - local->sh_struct->offset_list[0] = 0; /* reset */ - - STACK_WIND (frame, - unify_bgsh_ns_getdents_cbk, - NS(this), - NS(this)->fops->getdents, - local->fd, - UNIFY_SELF_HEAL_GETDENTS_COUNT, - 0, /* In this call, do send '0' as offset */ - GF_GET_DIR_ONLY); - } - - return 0; -} - - -/** - * unify_bgsh_getdents_cbk - - */ -int32_t -unify_bgsh_getdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dir_entry_t *entry, - int32_t count) -{ - int32_t callcnt = -1; - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - long index = (long)cookie; - dir_entry_t *tmp = NULL; - - if (op_ret >= 0 && count > 0) { - /* There is some dentry found, just send the dentry to NS */ - tmp = GF_CALLOC (1, sizeof (dir_entry_t), - gf_unify_mt_dir_entry_t); - local->sh_struct->entry_list[index] = tmp; - local->sh_struct->count_list[index] = count; - if (entry) { - tmp->next = entry->next; - entry->next = NULL; - } - STACK_WIND_COOKIE (frame, - unify_bgsh_ns_setdents_cbk, - cookie, - NS(this), - NS(this)->fops->setdents, - local->fd, - GF_SET_IF_NOT_PRESENT, - local->sh_struct->entry_list[index], - count); - return 0; - } - - if (count < UNIFY_SELF_HEAL_GETDENTS_COUNT) { - LOCK (&frame->lock); - { - callcnt = --local->call_count; - } - UNLOCK (&frame->lock); - } else { - /* count == size, that means, there are more entries to read from */ - local->sh_struct->offset_list[index] += - UNIFY_SELF_HEAL_GETDENTS_COUNT; - - STACK_WIND_COOKIE (frame, - unify_bgsh_getdents_cbk, - cookie, - priv->xl_array[index], - priv->xl_array[index]->fops->getdents, - local->fd, - UNIFY_SELF_HEAL_GETDENTS_COUNT, - local->sh_struct->offset_list[index], - GF_GET_ALL); - - gf_log (this->name, GF_LOG_DEBUG, - "readdir on (%s) with offset %"PRId64"", - priv->xl_array[index]->name, - local->sh_struct->offset_list[index]); - } - - if (!callcnt) { - /* All 
storage nodes have done unified setdents on NS node. - * Now, do getdents from NS and do setdents on storage nodes. - */ - - /* sh_struct->offset_list is no longer required for - storage nodes now */ - local->sh_struct->offset_list[0] = 0; /* reset */ - - STACK_WIND (frame, - unify_bgsh_ns_getdents_cbk, - NS(this), - NS(this)->fops->getdents, - local->fd, - UNIFY_SELF_HEAL_GETDENTS_COUNT, - 0, /* In this call, do send '0' as offset */ - GF_GET_DIR_ONLY); - } - - return 0; -} - -/** - * unify_bgsh_opendir_cbk - - * - * @cookie: - */ -int32_t -unify_bgsh_opendir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - fd_t *fd) -{ - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - int32_t callcnt = 0; - int16_t index = 0; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret >= 0) { - local->op_ret = op_ret; - } else { - local->failed = 1; - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - local->call_count = priv->child_count + 1; - - if (!local->failed) { - /* send getdents() namespace after finishing - storage nodes */ - local->call_count--; - callcnt = local->call_count; - - fd_bind (fd); - - if (local->call_count) { - /* Used as the offset index. This list keeps - track of offset sent to each node during - STACK_WIND. 
*/ - local->sh_struct->offset_list = - GF_CALLOC (priv->child_count, - sizeof (off_t), - gf_unify_mt_off_t); - ERR_ABORT (local->sh_struct->offset_list); - - local->sh_struct->entry_list = - GF_CALLOC (priv->child_count, - sizeof (dir_entry_t *), - gf_unify_mt_dir_entry_t); - ERR_ABORT (local->sh_struct->entry_list); - - local->sh_struct->count_list = - GF_CALLOC (priv->child_count, - sizeof (int), - gf_unify_mt_int); - ERR_ABORT (local->sh_struct->count_list); - - /* Send getdents on all the fds */ - for (index = 0; - index < priv->child_count; index++) { - STACK_WIND_COOKIE (frame, - unify_bgsh_getdents_cbk, - (void *)(long)index, - priv->xl_array[index], - priv->xl_array[index]->fops->getdents, - local->fd, - UNIFY_SELF_HEAL_GETDENTS_COUNT, - 0, /* In this call, do send '0' as offset */ - GF_GET_ALL); - } - /* did a stack wind, so no need to unwind here */ - return 0; - } /* (local->call_count) */ - } /* (!local->failed) */ - - /* Opendir failed on one node. */ - fd_unref (local->fd); - - unify_local_wipe (local); - STACK_DESTROY (frame->root); - } - - return 0; -} - -/** - * gf_bgsh_checksum_cbk - - * - * @frame: frame used in lookup. get a copy of it, and use that copy. - * @this: pointer to unify xlator. - * @inode: pointer to inode, for which the consistency check is required. 
- * - */ -int32_t -unify_bgsh_checksum_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - uint8_t *file_checksum, - uint8_t *dir_checksum) -{ - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - int16_t index = 0; - int32_t callcnt = 0; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - if (op_ret >= 0) { - if (NS(this) == (xlator_t *)cookie) { - memcpy (local->sh_struct->ns_file_checksum, - file_checksum, NAME_MAX); - memcpy (local->sh_struct->ns_dir_checksum, - dir_checksum, NAME_MAX); - } else { - if (local->entry_count == 0) { - /* Initialize the dir_checksum to be - * used for comparision with other - * storage nodes. Should be done for - * the first successful call *only*. - */ - /* Using 'entry_count' as a flag */ - local->entry_count = 1; - memcpy (local->sh_struct->dir_checksum, - dir_checksum, NAME_MAX); - } - - /* Reply from the storage nodes */ - for (index = 0; - index < NAME_MAX; index++) { - /* Files should be present in only - one node */ - local->sh_struct->file_checksum[index] ^= file_checksum[index]; - - /* directory structure should be same - accross */ - if (local->sh_struct->dir_checksum[index] != dir_checksum[index]) - local->failed = 1; - } - } - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - for (index = 0; index < NAME_MAX ; index++) { - if (local->sh_struct->file_checksum[index] != - local->sh_struct->ns_file_checksum[index]) { - local->failed = 1; - break; - } - if (local->sh_struct->dir_checksum[index] != - local->sh_struct->ns_dir_checksum[index]) { - local->failed = 1; - break; - } - } - - if (local->failed) { - /* Log it, it should be a rare event */ - gf_log (this->name, GF_LOG_WARNING, - "Self-heal triggered on directory %s", - local->loc1.path); - - /* Any self heal will be done at the directory level */ - local->op_ret = -1; - local->failed = 0; - - local->fd = fd_create (local->loc1.inode, - frame->root->pid); - local->call_count = 
priv->child_count + 1; - - for (index = 0; - index < (priv->child_count + 1); index++) { - STACK_WIND_COOKIE (frame, - unify_bgsh_opendir_cbk, - priv->xl_array[index]->name, - priv->xl_array[index], - priv->xl_array[index]->fops->opendir, - &local->loc1, - local->fd); - } - - /* opendir can be done on the directory */ - return 0; - } - - /* no mismatch */ - unify_local_wipe (local); - STACK_DESTROY (frame->root); - } - - return 0; -} - -/* Background self-heal part over */ - - - - -/** - * zr_unify_self_heal - - * - * @frame: frame used in lookup. get a copy of it, and use that copy. - * @this: pointer to unify xlator. - * @inode: pointer to inode, for which the consistency check is required. - * - */ -int32_t -zr_unify_self_heal (call_frame_t *frame, - xlator_t *this, - unify_local_t *local) -{ - unify_private_t *priv = this->private; - call_frame_t *bg_frame = NULL; - unify_local_t *bg_local = NULL; - inode_t *tmp_inode = NULL; - dict_t *tmp_dict = NULL; - int16_t index = 0; - - if (local->inode_generation < priv->inode_generation) { - /* Any self heal will be done at the directory level */ - /* Update the inode's generation to the current generation - value. 
*/ - local->inode_generation = priv->inode_generation; - inode_ctx_put (local->loc1.inode, this, - (uint64_t)(long)local->inode_generation); - - if (priv->self_heal == ZR_UNIFY_FG_SELF_HEAL) { - local->op_ret = 0; - local->failed = 0; - local->call_count = priv->child_count + 1; - local->sh_struct = - GF_CALLOC (1, sizeof (struct unify_self_heal_struct), - gf_unify_mt_unify_self_heal_struct); - - /* +1 is for NS */ - for (index = 0; - index < (priv->child_count + 1); index++) { - STACK_WIND_COOKIE (frame, - unify_sh_checksum_cbk, - priv->xl_array[index], - priv->xl_array[index], - priv->xl_array[index]->fops->checksum, - &local->loc1, - 0); - } - - /* Self-heal in foreground, hence no need - to UNWIND here */ - return 0; - } - - /* Self Heal done in background */ - bg_frame = copy_frame (frame); - INIT_LOCAL (bg_frame, bg_local); - loc_copy (&bg_local->loc1, &local->loc1); - bg_local->op_ret = 0; - bg_local->failed = 0; - bg_local->call_count = priv->child_count + 1; - bg_local->sh_struct = - GF_CALLOC (1, sizeof (struct unify_self_heal_struct), - gf_unify_mt_unify_self_heal_struct); - - /* +1 is for NS */ - for (index = 0; index < (priv->child_count + 1); index++) { - STACK_WIND_COOKIE (bg_frame, - unify_bgsh_checksum_cbk, - priv->xl_array[index], - priv->xl_array[index], - priv->xl_array[index]->fops->checksum, - &bg_local->loc1, - 0); - } - } - - /* generation number matches, self heal already done or - * self heal done in background: just do STACK_UNWIND - */ - tmp_inode = local->loc1.inode; - tmp_dict = local->dict; - - unify_local_wipe (local); - - /* This is lookup_cbk ()'s UNWIND. 
*/ - STACK_UNWIND (frame, - local->op_ret, - local->op_errno, - tmp_inode, - &local->stbuf, - local->dict, - &local->oldpostparent); - - if (tmp_dict) - dict_unref (tmp_dict); - - return 0; -} - diff --git a/xlators/cluster/unify/src/unify.c b/xlators/cluster/unify/src/unify.c deleted file mode 100644 index 6dc93083d..000000000 --- a/xlators/cluster/unify/src/unify.c +++ /dev/null @@ -1,4589 +0,0 @@ -/* - Copyright (c) 2006-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ - -/** - * xlators/cluster/unify: - * - This xlator is one of the main translator in GlusterFS, which - * actually does the clustering work of the file system. One need to - * understand that, unify assumes file to be existing in only one of - * the child node, and directories to be present on all the nodes. - * - * NOTE: - * Now, unify has support for global namespace, which is used to keep a - * global view of fs's namespace tree. The stat for directories are taken - * just from the namespace, where as for files, just 'ia_ino' is taken from - * Namespace node, and other stat info is taken from the actual storage node. - * Also Namespace node helps to keep consistant inode for files across - * glusterfs (re-)mounts. 
- */ - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "unify.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "defaults.h" -#include "common-utils.h" -#include <signal.h> -#include <libgen.h> -#include "compat-errno.h" -#include "compat.h" - -#define UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR(_loc) do { \ - if (!(_loc && _loc->inode)) { \ - STACK_UNWIND (frame, -1, EINVAL, NULL, NULL, NULL); \ - return 0; \ - } \ -} while(0) - - -#define UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR(_fd) do { \ - if (!(_fd && !fd_ctx_get (_fd, this, NULL))) { \ - STACK_UNWIND (frame, -1, EBADFD, NULL, NULL); \ - return 0; \ - } \ -} while(0) - -#define UNIFY_CHECK_FD_AND_UNWIND_ON_ERR(_fd) do { \ - if (!_fd) { \ - STACK_UNWIND (frame, -1, EBADFD, NULL, NULL); \ - return 0; \ - } \ -} while(0) - -/** - * unify_local_wipe - free all the extra allocation of local->* here. - */ -static void -unify_local_wipe (unify_local_t *local) -{ - /* Free the strdup'd variables in the local structure */ - if (local->name) { - GF_FREE (local->name); - } - loc_wipe (&local->loc1); - loc_wipe (&local->loc2); -} - - - -/* - * unify_normalize_stats - - */ -void -unify_normalize_stats (struct statvfs *buf, - unsigned long bsize, - unsigned long frsize) -{ - double factor; - - if (buf->f_bsize != bsize) { - factor = ((double) buf->f_bsize) / bsize; - buf->f_bsize = bsize; - buf->f_bfree = (fsblkcnt_t) (factor * buf->f_bfree); - buf->f_bavail = (fsblkcnt_t) (factor * buf->f_bavail); - } - - if (buf->f_frsize != frsize) { - factor = ((double) buf->f_frsize) / frsize; - buf->f_frsize = frsize; - buf->f_blocks = (fsblkcnt_t) (factor * buf->f_blocks); - } -} - - -xlator_t * -unify_loc_subvol (loc_t *loc, xlator_t *this) -{ - unify_private_t *priv = NULL; - xlator_t *subvol = NULL; - int16_t *list = NULL; - long index = 0; - xlator_t *subvol_i = NULL; - int ret = 0; - uint64_t tmp_list = 0; - - priv = 
this->private; - subvol = NS (this); - - if (!IA_ISDIR (loc->inode->ia_type)) { - ret = inode_ctx_get (loc->inode, this, &tmp_list); - list = (int16_t *)(long)tmp_list; - if (!list) - goto out; - - for (index = 0; list[index] != -1; index++) { - subvol_i = priv->xl_array[list[index]]; - if (subvol_i != NS (this)) { - subvol = subvol_i; - break; - } - } - } -out: - return subvol; -} - - - -/** - * unify_statfs_cbk - - */ -int32_t -unify_statfs_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct statvfs *stbuf) -{ - int32_t callcnt = 0; - struct statvfs *dict_buf = NULL; - unsigned long bsize; - unsigned long frsize; - unify_local_t *local = (unify_local_t *)frame->local; - call_frame_t *prev_frame = cookie; - - LOCK (&frame->lock); - { - if (op_ret >= 0) { - /* when a call is successfull, add it to local->dict */ - dict_buf = &local->statvfs_buf; - - if (dict_buf->f_bsize != 0) { - bsize = max (dict_buf->f_bsize, - stbuf->f_bsize); - - frsize = max (dict_buf->f_frsize, - stbuf->f_frsize); - unify_normalize_stats(dict_buf, bsize, frsize); - unify_normalize_stats(stbuf, bsize, frsize); - } else { - dict_buf->f_bsize = stbuf->f_bsize; - dict_buf->f_frsize = stbuf->f_frsize; - } - - dict_buf->f_blocks += stbuf->f_blocks; - dict_buf->f_bfree += stbuf->f_bfree; - dict_buf->f_bavail += stbuf->f_bavail; - dict_buf->f_files += stbuf->f_files; - dict_buf->f_ffree += stbuf->f_ffree; - dict_buf->f_favail += stbuf->f_favail; - dict_buf->f_fsid = stbuf->f_fsid; - dict_buf->f_flag = stbuf->f_flag; - dict_buf->f_namemax = stbuf->f_namemax; - local->op_ret = op_ret; - } else { - /* fop on storage node has failed due to some error */ - if (op_errno != ENOTCONN) { - gf_log (this->name, GF_LOG_ERROR, - "child(%s): %s", - prev_frame->this->name, - strerror (op_errno)); - } - local->op_errno = op_errno; - } - callcnt = --local->call_count; - } - UNLOCK (&frame->lock); - - if (!callcnt) { - STACK_UNWIND (frame, local->op_ret, 
local->op_errno, - &local->statvfs_buf); - } - - return 0; -} - -/** - * unify_statfs - - */ -int32_t -unify_statfs (call_frame_t *frame, - xlator_t *this, - loc_t *loc) -{ - unify_local_t *local = NULL; - xlator_list_t *trav = this->children; - - INIT_LOCAL (frame, local); - local->call_count = ((unify_private_t *)this->private)->child_count; - - while(trav) { - STACK_WIND (frame, - unify_statfs_cbk, - trav->xlator, - trav->xlator->fops->statfs, - loc); - trav = trav->next; - } - - return 0; -} - -/** - * unify_buf_cbk - - */ -int32_t -unify_buf_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *buf) -{ - int32_t callcnt = 0; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - call_frame_t *prev_frame = cookie; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "%s(): child(%s): path(%s): %s", - gf_fop_list[frame->root->op], - prev_frame->this->name, - (local->loc1.path)?local->loc1.path:"", - strerror (op_errno)); - - local->op_errno = op_errno; - if ((op_errno == ENOENT) && priv->optimist) - local->op_ret = 0; - } - - if (op_ret >= 0) { - local->op_ret = 0; - - if (NS (this) == prev_frame->this) { - local->ia_ino = buf->ia_ino; - /* If the entry is directory, get the stat - from NS node */ - if (IA_ISDIR (buf->ia_type) || - !local->stbuf.ia_blksize) { - local->stbuf = *buf; - } - } - - if ((!IA_ISDIR (buf->ia_type)) && - (NS (this) != prev_frame->this)) { - /* If file, take the stat info from Storage - node. 
*/ - local->stbuf = *buf; - } - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - /* If the inode number is not filled, operation should - fail */ - if (!local->ia_ino) - local->op_ret = -1; - - local->stbuf.ia_ino = local->ia_ino; - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, local->op_errno, - &local->stbuf); - } - - return 0; -} - -#define check_if_dht_linkfile(s) \ - ((st_mode_from_ia (s->ia_prot, s->ia_type) & ~S_IFMT) == S_ISVTX) - -/** - * unify_lookup_cbk - - */ -int32_t -unify_lookup_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - dict_t *dict, - struct iatt *postparent) -{ - int32_t callcnt = 0; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - inode_t *tmp_inode = NULL; - dict_t *local_dict = NULL; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret == -1) { - if (local->revalidate && - (op_errno == ESTALE)) { - /* ESTALE takes priority */ - local->op_errno = op_errno; - local->failed = 1; - } - - if ((op_errno != ENOTCONN) - && (op_errno != ENOENT) - && (local->op_errno != ESTALE)) { - /* if local->op_errno is already ESTALE, then - * ESTALE has to propogated to the parent first. - * do not enter here. - */ - gf_log (this->name, GF_LOG_ERROR, - "child(%s): path(%s): %s", - priv->xl_array[(long)cookie]->name, - local->loc1.path, strerror (op_errno)); - local->op_errno = op_errno; - local->failed = 1; - - } else if (local->revalidate && - (local->op_errno != ESTALE) && - !(priv->optimist && (op_errno == ENOENT))) { - - gf_log (this->name, - (op_errno == ENOTCONN) ? 
- GF_LOG_DEBUG:GF_LOG_ERROR, - "child(%s): path(%s): %s", - priv->xl_array[(long)cookie]->name, - local->loc1.path, strerror (op_errno)); - local->op_errno = op_errno; - local->failed = 1; - } - } - - if (op_ret == 0) { - local->op_ret = 0; - - if (check_if_dht_linkfile(buf)) { - gf_log (this->name, GF_LOG_CRITICAL, - "file %s may be DHT link file on %s, " - "make sure the backend is not shared " - "between unify and DHT", - local->loc1.path, - priv->xl_array[(long)cookie]->name); - } - - if (local->stbuf.ia_type && local->stbuf.ia_blksize) { - /* make sure we already have a stbuf - stored in local->stbuf */ - if (IA_ISDIR (local->stbuf.ia_type) && - !IA_ISDIR (buf->ia_type)) { - gf_log (this->name, GF_LOG_CRITICAL, - "[CRITICAL] '%s' is directory " - "on namespace, non-directory " - "on node '%s', returning EIO", - local->loc1.path, - priv->xl_array[(long)cookie]->name); - local->return_eio = 1; - } - if (!IA_ISDIR (local->stbuf.ia_type) && - IA_ISDIR (buf->ia_type)) { - gf_log (this->name, GF_LOG_CRITICAL, - "[CRITICAL] '%s' is directory " - "on node '%s', non-directory " - "on namespace, returning EIO", - local->loc1.path, - priv->xl_array[(long)cookie]->name); - local->return_eio = 1; - } - } - - if (!local->revalidate && !IA_ISDIR (buf->ia_type)) { - /* This is the first time lookup on file*/ - if (!local->list) { - /* list is not allocated, allocate - the max possible range */ - local->list = GF_CALLOC (1, 2 * (priv->child_count + 2), - gf_unify_mt_int16_t); - if (!local->list) { - gf_log (this->name, - GF_LOG_CRITICAL, - "Not enough memory"); - STACK_UNWIND (frame, -1, - ENOMEM, inode, - NULL, NULL, NULL); - return 0; - } - } - /* update the index of the list */ - local->list [local->index++] = - (int16_t)(long)cookie; - } - - if (!local->revalidate && IA_ISDIR (buf->ia_type)) { - /* fresh lookup of a directory */ - inode_ctx_put (local->loc1.inode, this, - priv->inode_generation); - } - - if ((!local->dict) && dict && - (priv->xl_array[(long)cookie] != 
NS(this))) { - local->dict = dict_ref (dict); - } - - /* index of NS node is == total child count */ - if (priv->child_count == (int16_t)(long)cookie) { - /* Take the inode number from namespace */ - local->ia_ino = buf->ia_ino; - if (IA_ISDIR (buf->ia_type) || - !(local->stbuf.ia_blksize)) { - local->stbuf = *buf; - local->oldpostparent = *postparent; - } - } else if (!IA_ISDIR (buf->ia_type)) { - /* If file, then get the stat from - storage node */ - local->stbuf = *buf; - } - - if (local->ia_nlink < buf->ia_nlink) { - local->ia_nlink = buf->ia_nlink; - } - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - local_dict = local->dict; - if (local->return_eio) { - gf_log (this->name, GF_LOG_CRITICAL, - "[CRITICAL] Unable to fix the path (%s) with " - "self-heal, try manual verification. " - "returning EIO.", local->loc1.path); - unify_local_wipe (local); - STACK_UNWIND (frame, -1, EIO, inode, NULL, NULL); - if (local_dict) { - dict_unref (local_dict); - } - return 0; - } - - if (!local->stbuf.ia_blksize) { - /* Inode not present */ - local->op_ret = -1; - } else { - if (!local->revalidate && - !IA_ISDIR (local->stbuf.ia_type)) { - /* If its a file, big array is useless, - allocate the smaller one */ - int16_t *list = NULL; - list = GF_CALLOC (1, 2 * (local->index + 1), - gf_unify_mt_int16_t); - ERR_ABORT (list); - memcpy (list, local->list, 2 * local->index); - /* Make the end of the list as -1 */ - GF_FREE (local->list); - local->list = list; - local->list [local->index] = -1; - /* Update the inode's ctx with proper array */ - /* TODO: log on failure */ - inode_ctx_put (local->loc1.inode, this, - (uint64_t)(long)local->list); - } - - if (IA_ISDIR(local->loc1.inode->ia_type)) { - /* lookup is done for directory */ - if (local->failed && priv->self_heal) { - /* Triggering self-heal */ - /* means, self-heal required for this - inode */ - local->inode_generation = 0; - priv->inode_generation++; - } - } else { - local->stbuf.ia_ino = local->ia_ino; - } - - 
local->stbuf.ia_nlink = local->ia_nlink; - } - if (local->op_ret == -1) { - if (!local->revalidate && local->list) - GF_FREE (local->list); - } - - if ((local->op_ret >= 0) && local->failed && - local->revalidate) { - /* Done revalidate, but it failed */ - if ((op_errno != ENOTCONN) - && (local->op_errno != ESTALE)) { - gf_log (this->name, GF_LOG_ERROR, - "Revalidate failed for path(%s): %s", - local->loc1.path, strerror (op_errno)); - } - local->op_ret = -1; - } - - if ((priv->self_heal && !priv->optimist) && - (!local->revalidate && (local->op_ret == 0) && - IA_ISDIR(local->stbuf.ia_type))) { - /* Let the self heal be done here */ - zr_unify_self_heal (frame, this, local); - local_dict = NULL; - } else { - if (local->failed) { - /* NOTE: directory lookup is sent to all - * subvolumes and success from a subvolume - * might set local->op_ret to 0 (zero) */ - local->op_ret = -1; - } - - /* either no self heal, or op_ret == -1 (failure) */ - tmp_inode = local->loc1.inode; - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, local->op_errno, - tmp_inode, &local->stbuf, local->dict, - &local->oldpostparent); - } - if (local_dict) { - dict_unref (local_dict); - } - } - - return 0; -} - -/** - * unify_lookup - - */ -int32_t -unify_lookup (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - dict_t *xattr_req) -{ - unify_local_t *local = NULL; - unify_private_t *priv = this->private; - int16_t *list = NULL; - long index = 0; - - if (!(loc && loc->inode)) { - gf_log (this->name, GF_LOG_ERROR, - "%s: Argument not right", loc?loc->path:"(null)"); - STACK_UNWIND (frame, -1, EINVAL, NULL, NULL, NULL, NULL); - return 0; - } - - /* Initialization */ - INIT_LOCAL (frame, local); - loc_copy (&local->loc1, loc); - if (local->loc1.path == NULL) { - gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O"); - STACK_UNWIND (frame, -1, ENOMEM, loc->inode, NULL, NULL, NULL); - return 0; - } - - if (inode_ctx_get (loc->inode, this, NULL) - && IA_ISDIR 
(loc->inode->ia_type)) { - local->revalidate = 1; - } - - if (!inode_ctx_get (loc->inode, this, NULL) && - loc->inode->ia_type && - !IA_ISDIR (loc->inode->ia_type)) { - uint64_t tmp_list = 0; - /* check if revalidate or fresh lookup */ - inode_ctx_get (loc->inode, this, &tmp_list); - local->list = (int16_t *)(long)tmp_list; - } - - if (local->list) { - list = local->list; - for (index = 0; list[index] != -1; index++); - if (index != 2) { - if (index < 2) { - gf_log (this->name, GF_LOG_ERROR, - "returning ESTALE for %s: file " - "count is %ld", loc->path, index); - /* Print where all the file is present */ - for (index = 0; - local->list[index] != -1; index++) { - gf_log (this->name, GF_LOG_ERROR, - "%s: found on %s", loc->path, - priv->xl_array[list[index]]->name); - } - unify_local_wipe (local); - STACK_UNWIND (frame, -1, ESTALE, - NULL, NULL, NULL, NULL); - return 0; - } else { - /* There are more than 2 presences */ - /* Just log and continue */ - gf_log (this->name, GF_LOG_ERROR, - "%s: file count is %ld", - loc->path, index); - /* Print where all the file is present */ - for (index = 0; - local->list[index] != -1; index++) { - gf_log (this->name, GF_LOG_ERROR, - "%s: found on %s", loc->path, - priv->xl_array[list[index]]->name); - } - } - } - - /* is revalidate */ - local->revalidate = 1; - - for (index = 0; list[index] != -1; index++) - local->call_count++; - - for (index = 0; list[index] != -1; index++) { - char need_break = (list[index+1] == -1); - STACK_WIND_COOKIE (frame, - unify_lookup_cbk, - (void *)(long)list[index], //cookie - priv->xl_array [list[index]], - priv->xl_array [list[index]]->fops->lookup, - loc, - xattr_req); - if (need_break) - break; - } - } else { - if (loc->inode->ia_type) { - if (inode_ctx_get (loc->inode, this, NULL)) { - inode_ctx_get (loc->inode, this, - &local->inode_generation); - } - } - /* This is first call, there is no list */ - /* call count should be all child + 1 namespace */ - local->call_count = priv->child_count + 1; - 
- for (index = 0; index <= priv->child_count; index++) { - STACK_WIND_COOKIE (frame, - unify_lookup_cbk, - (void *)index, //cookie - priv->xl_array[index], - priv->xl_array[index]->fops->lookup, - loc, - xattr_req); - } - } - - return 0; -} - -/** - * unify_stat - if directory, get the stat directly from NameSpace child. - * if file, check for a hint and send it only there (also to NS). - * if its a fresh stat, then do it on all the nodes. - * - * NOTE: for all the call, sending cookie as xlator pointer, which will be - * used in cbk. - */ -int32_t -unify_stat (call_frame_t *frame, - xlator_t *this, - loc_t *loc) -{ - unify_local_t *local = NULL; - unify_private_t *priv = this->private; - int16_t index = 0; - int16_t *list = NULL; - uint64_t tmp_list = 0; - - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); - - /* Initialization */ - INIT_LOCAL (frame, local); - loc_copy (&local->loc1, loc); - if (local->loc1.path == NULL) { - gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O"); - STACK_UNWIND (frame, -1, ENOMEM, NULL); - return 0; - } - local->ia_ino = loc->inode->ino; - if (IA_ISDIR (loc->inode->ia_type)) { - /* Directory */ - local->call_count = 1; - STACK_WIND (frame, unify_buf_cbk, NS(this), - NS(this)->fops->stat, loc); - } else { - /* File */ - inode_ctx_get (loc->inode, this, &tmp_list); - list = (int16_t *)(long)tmp_list; - - for (index = 0; list[index] != -1; index++) - local->call_count++; - - for (index = 0; list[index] != -1; index++) { - char need_break = (list[index+1] == -1); - STACK_WIND (frame, - unify_buf_cbk, - priv->xl_array[list[index]], - priv->xl_array[list[index]]->fops->stat, - loc); - if (need_break) - break; - } - } - - return 0; -} - -/** - * unify_access_cbk - - */ -int32_t -unify_access_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - STACK_UNWIND (frame, op_ret, op_errno); - return 0; -} - - -/** - * unify_access - Send request to only namespace, which has all the - * 
attributes set for the file. - */ -int32_t -unify_access (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - int32_t mask) -{ - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); - - STACK_WIND (frame, - unify_access_cbk, - NS(this), - NS(this)->fops->access, - loc, - mask); - - return 0; -} - -int32_t -unify_mkdir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) -{ - int32_t callcnt = 0; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - inode_t *tmp_inode = NULL; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if ((op_ret == -1) && !(priv->optimist && - (op_errno == ENOENT || - op_errno == EEXIST))) { - /* TODO: Decrement the inode_generation of - * this->inode's parent inode, hence the missing - * directory is created properly by self-heal. - * Currently, there is no way to get the parent - * inode directly. 
- */ - gf_log (this->name, GF_LOG_ERROR, - "child(%s): path(%s): %s", - priv->xl_array[(long)cookie]->name, - local->loc1.path, strerror (op_errno)); - if (op_errno != EEXIST) - local->failed = 1; - local->op_errno = op_errno; - } - - if (op_ret >= 0) - local->op_ret = 0; - - } - UNLOCK (&frame->lock); - - if (!callcnt) { - if (!local->failed) { - inode_ctx_put (local->loc1.inode, this, - priv->inode_generation); - } - - tmp_inode = local->loc1.inode; - unify_local_wipe (local); - - STACK_UNWIND (frame, local->op_ret, local->op_errno, - tmp_inode, &local->stbuf, - &local->oldpreparent, &local->oldpostparent); - } - - return 0; -} - -/** - * unify_ns_mkdir_cbk - - */ -int32_t -unify_ns_mkdir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) -{ - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - long index = 0; - - if (op_ret == -1) { - /* No need to send mkdir request to other servers, - * as namespace action failed - */ - gf_log (this->name, GF_LOG_ERROR, - "namespace: path(%s): %s", - local->name, strerror (op_errno)); - unify_local_wipe (local); - STACK_UNWIND (frame, op_ret, op_errno, inode, NULL, - NULL, NULL); - return 0; - } - - /* Create one inode for this entry */ - local->op_ret = 0; - local->stbuf = *buf; - - local->oldpreparent = *preparent; - local->oldpostparent = *postparent; - - local->call_count = priv->child_count; - - /* Send mkdir request to all the nodes now */ - for (index = 0; index < priv->child_count; index++) { - STACK_WIND_COOKIE (frame, - unify_mkdir_cbk, - (void *)index, //cookie - priv->xl_array[index], - priv->xl_array[index]->fops->mkdir, - &local->loc1, - local->mode); - } - - return 0; -} - - -/** - * unify_mkdir - - */ -int32_t -unify_mkdir (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - mode_t mode) -{ - unify_local_t *local = NULL; - - /* Initialization */ 
- INIT_LOCAL (frame, local); - local->mode = mode; - - loc_copy (&local->loc1, loc); - - if (local->loc1.path == NULL) { - gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O"); - STACK_UNWIND (frame, -1, ENOMEM, NULL, NULL); - return 0; - } - - STACK_WIND (frame, - unify_ns_mkdir_cbk, - NS(this), - NS(this)->fops->mkdir, - loc, - mode); - return 0; -} - -/** - * unify_rmdir_cbk - - */ -int32_t -unify_rmdir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *preparent, - struct iatt *postparent) -{ - int32_t callcnt = 0; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - if (op_ret == 0 || (priv->optimist && (op_errno == ENOENT))) - local->op_ret = 0; - if (op_ret == -1) - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - if (!callcnt) { - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, local->op_errno, - &local->oldpreparent, &local->oldpostparent); - } - - return 0; -} - -/** - * unify_ns_rmdir_cbk - - */ -int32_t -unify_ns_rmdir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *preparent, - struct iatt *postparent) -{ - int16_t index = 0; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - - if (op_ret == -1) { - /* No need to send rmdir request to other servers, - * as namespace action failed - */ - gf_log (this->name, - ((op_errno != ENOTEMPTY) ? 
- GF_LOG_ERROR : GF_LOG_DEBUG), - "namespace: path(%s): %s", - local->loc1.path, strerror (op_errno)); - unify_local_wipe (local); - STACK_UNWIND (frame, op_ret, op_errno, NULL, NULL); - return 0; - } - - local->call_count = priv->child_count; - - local->oldpreparent = *preparent; - local->oldpostparent = *postparent; - - for (index = 0; index < priv->child_count; index++) { - STACK_WIND (frame, - unify_rmdir_cbk, - priv->xl_array[index], - priv->xl_array[index]->fops->rmdir, - &local->loc1); - } - - return 0; -} - -/** - * unify_rmdir - - */ -int32_t -unify_rmdir (call_frame_t *frame, - xlator_t *this, - loc_t *loc) -{ - unify_local_t *local = NULL; - - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); - - /* Initialization */ - INIT_LOCAL (frame, local); - - loc_copy (&local->loc1, loc); - if (local->loc1.path == NULL) { - gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O"); - STACK_UNWIND (frame, -1, ENOMEM, NULL, NULL); - return 0; - } - - STACK_WIND (frame, - unify_ns_rmdir_cbk, - NS(this), - NS(this)->fops->rmdir, - loc); - - return 0; -} - -/** - * unify_open_cbk - - */ -int32_t -unify_open_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - fd_t *fd) -{ - int32_t callcnt = 0; - unify_local_t *local = frame->local; - - LOCK (&frame->lock); - { - if (op_ret >= 0) { - local->op_ret = op_ret; - if (NS(this) != (xlator_t *)cookie) { - /* Store child node's ptr, used in - all the f*** / FileIO calls */ - fd_ctx_set (fd, this, (uint64_t)(long)cookie); - } - } - if (op_ret == -1) { - local->op_errno = op_errno; - local->failed = 1; - } - callcnt = --local->call_count; - } - UNLOCK (&frame->lock); - - if (!callcnt) { - if ((local->failed == 1) && (local->op_ret >= 0)) { - local->call_count = 1; - /* return -1 to user */ - local->op_ret = -1; - //local->op_errno = EIO; - - if (!fd_ctx_get (local->fd, this, NULL)) { - gf_log (this->name, GF_LOG_ERROR, - "Open success on child node, " - "failed on namespace"); - } 
else { - gf_log (this->name, GF_LOG_ERROR, - "Open success on namespace, " - "failed on child node"); - } - } - - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, - local->op_errno, local->fd); - } - - return 0; -} - -#ifdef GF_DARWIN_HOST_OS -/** - * unify_create_lookup_cbk - - */ -int32_t -unify_open_lookup_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - dict_t *dict, - struct iatt *postparent) -{ - int32_t callcnt = 0; - int16_t index = 0; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - if ((op_ret == -1) && (op_errno != ENOENT)) { - gf_log (this->name, GF_LOG_ERROR, - "child(%s): path(%s): %s", - priv->xl_array[(long)cookie]->name, - local->loc1.path, strerror (op_errno)); - local->op_errno = op_errno; - } - - if (op_ret >= 0) { - local->op_ret = op_ret; - local->index++; - if (NS(this) == priv->xl_array[(long)cookie]) { - local->list[0] = (int16_t)(long)cookie; - } else { - local->list[1] = (int16_t)(long)cookie; - } - if (IA_ISDIR (buf->ia_type)) - local->failed = 1; - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - int16_t file_list[3] = {0,}; - local->op_ret = -1; - - file_list[0] = local->list[0]; - file_list[1] = local->list[1]; - file_list[2] = -1; - - if (local->index != 2) { - /* Lookup failed, can't do open */ - gf_log (this->name, GF_LOG_ERROR, - "%s: present on %d nodes", - local->name, local->index); - - if (local->index < 2) { - unify_local_wipe (local); - gf_log (this->name, GF_LOG_ERROR, - "returning as file found on less " - "than 2 nodes"); - STACK_UNWIND (frame, local->op_ret, - local->op_errno, local->fd); - return 0; - } - } - - if (local->failed) { - /* Open on directory, return EISDIR */ - unify_local_wipe (local); - STACK_UNWIND (frame, -1, EISDIR, local->fd); - return 0; - } - - /* Everything is perfect :) */ - local->call_count = 2; 
- - for (index = 0; file_list[index] != -1; index++) { - char need_break = (file_list[index+1] == -1); - STACK_WIND_COOKIE (frame, - unify_open_cbk, - priv->xl_array[file_list[index]], - priv->xl_array[file_list[index]], - priv->xl_array[file_list[index]]->fops->open, - &local->loc1, - local->flags, - local->fd, local->wbflags); - if (need_break) - break; - } - } - - return 0; -} - - -int32_t -unify_open_readlink_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - const char *path, - struct iatt *sbuf) -{ - int16_t index = 0; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - - if (op_ret == -1) { - STACK_UNWIND (frame, -1, ENOENT); - return 0; - } - - if (path[0] == '/') { - local->name = gf_strdup (path); - ERR_ABORT (local->name); - } else { - char *tmp_str = gf_strdup (local->loc1.path); - char *tmp_base = dirname (tmp_str); - local->name = GF_CALLOC (1, ZR_PATH_MAX, gf_unify_mt_char); - strcpy (local->name, tmp_base); - strncat (local->name, "/", 1); - strcat (local->name, path); - GF_FREE (tmp_str); - } - - local->list = GF_CALLOC (1, sizeof (int16_t) * 3, - gf_unify_mt_int16_t); - ERR_ABORT (local->list); - local->call_count = priv->child_count + 1; - local->op_ret = -1; - for (index = 0; index <= priv->child_count; index++) { - /* Send the lookup to all the nodes including namespace */ - STACK_WIND_COOKIE (frame, - unify_open_lookup_cbk, - (void *)(long)index, - priv->xl_array[index], - priv->xl_array[index]->fops->lookup, - &local->loc1, - NULL); - } - - return 0; -} -#endif /* GF_DARWIN_HOST_OS */ - -/** - * unify_open - - */ -int32_t -unify_open (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - int32_t flags, - fd_t *fd, - int32_t wbflags) -{ - unify_private_t *priv = this->private; - unify_local_t *local = NULL; - int16_t *list = NULL; - int16_t index = 0; - int16_t file_list[3] = {0,}; - uint64_t tmp_list = 0; - - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); - - /* 
Init */ - INIT_LOCAL (frame, local); - loc_copy (&local->loc1, loc); - local->fd = fd; - local->flags = flags; - local->wbflags = wbflags; - inode_ctx_get (loc->inode, this, &tmp_list); - list = (int16_t *)(long)tmp_list; - - local->list = list; - file_list[0] = priv->child_count; /* Thats namespace */ - file_list[2] = -1; - for (index = 0; list[index] != -1; index++) { - local->call_count++; - if (list[index] != priv->child_count) - file_list[1] = list[index]; - } - - if (local->call_count != 2) { - /* If the lookup was done for file */ - gf_log (this->name, GF_LOG_ERROR, - "%s: entry_count is %d", - loc->path, local->call_count); - for (index = 0; local->list[index] != -1; index++) - gf_log (this->name, GF_LOG_ERROR, "%s: found on %s", - loc->path, priv->xl_array[list[index]]->name); - - if (local->call_count < 2) { - gf_log (this->name, GF_LOG_ERROR, - "returning EIO as file found on onlyone node"); - STACK_UNWIND (frame, -1, EIO, fd); - return 0; - } - } - -#ifdef GF_DARWIN_HOST_OS - /* Handle symlink here */ - if (IA_ISLNK (loc->inode->ia_type)) { - /* Callcount doesn't matter here */ - STACK_WIND (frame, - unify_open_readlink_cbk, - NS(this), - NS(this)->fops->readlink, - loc, ZR_PATH_MAX); - return 0; - } -#endif /* GF_DARWIN_HOST_OS */ - - local->call_count = 2; - for (index = 0; file_list[index] != -1; index++) { - char need_break = (file_list[index+1] == -1); - STACK_WIND_COOKIE (frame, - unify_open_cbk, - priv->xl_array[file_list[index]], //cookie - priv->xl_array[file_list[index]], - priv->xl_array[file_list[index]]->fops->open, - loc, - flags, - fd, wbflags); - if (need_break) - break; - } - - return 0; -} - - -int32_t -unify_create_unlink_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *preparent, - struct iatt *postparent) -{ - unify_local_t *local = frame->local; - inode_t *inode = local->loc1.inode; - - unify_local_wipe (local); - - STACK_UNWIND (frame, local->op_ret, local->op_errno, 
local->fd, - inode, &local->stbuf, - &local->oldpreparent, &local->oldpostparent); - - return 0; -} - -/** - * unify_create_open_cbk - - */ -int32_t -unify_create_open_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - fd_t *fd) -{ - int ret = 0; - int32_t callcnt = 0; - unify_local_t *local = frame->local; - inode_t *inode = NULL; - xlator_t *child = NULL; - uint64_t tmp_value = 0; - - LOCK (&frame->lock); - { - if (op_ret >= 0) { - local->op_ret = op_ret; - if (NS(this) != (xlator_t *)cookie) { - /* Store child node's ptr, used in all - the f*** / FileIO calls */ - /* TODO: log on failure */ - ret = fd_ctx_get (fd, this, &tmp_value); - cookie = (void *)(long)tmp_value; - } else { - /* NOTE: open successful on namespace. - * fd's ctx can be used to identify open - * failure on storage subvolume. cool - * ide ;) */ - local->failed = 0; - } - } else { - gf_log (this->name, GF_LOG_ERROR, - "child(%s): path(%s): %s", - ((xlator_t *)cookie)->name, - local->loc1.path, strerror (op_errno)); - local->op_errno = op_errno; - local->failed = 1; - } - callcnt = --local->call_count; - } - UNLOCK (&frame->lock); - - if (!callcnt) { - if (local->failed == 1 && (local->op_ret >= 0)) { - local->call_count = 1; - /* return -1 to user */ - local->op_ret = -1; - local->op_errno = EIO; - local->fd = fd; - local->call_count = 1; - - if (!fd_ctx_get (local->fd, this, &tmp_value)) { - child = (xlator_t *)(long)tmp_value; - - gf_log (this->name, GF_LOG_ERROR, - "Create success on child node, " - "failed on namespace"); - - STACK_WIND (frame, - unify_create_unlink_cbk, - child, - child->fops->unlink, - &local->loc1); - } else { - gf_log (this->name, GF_LOG_ERROR, - "Create success on namespace, " - "failed on child node"); - - STACK_WIND (frame, - unify_create_unlink_cbk, - NS(this), - NS(this)->fops->unlink, - &local->loc1); - } - return 0; - } - inode = local->loc1.inode; - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, 
local->op_errno, fd, - inode, &local->stbuf, - &local->oldpreparent, &local->oldpostparent); - } - return 0; -} - -/** - * unify_create_lookup_cbk - - */ -int32_t -unify_create_lookup_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - dict_t *dict, - struct iatt *postparent) -{ - int32_t callcnt = 0; - int16_t index = 0; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "child(%s): path(%s): %s", - priv->xl_array[(long)cookie]->name, - local->loc1.path, strerror (op_errno)); - local->op_errno = op_errno; - local->failed = 1; - } - - if (op_ret >= 0) { - local->op_ret = op_ret; - local->list[local->index++] = (int16_t)(long)cookie; - if (NS(this) == priv->xl_array[(long)cookie]) { - local->ia_ino = buf->ia_ino; - } else { - local->stbuf = *buf; - } - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - int16_t *list = local->list; - int16_t file_list[3] = {0,}; - local->op_ret = -1; - - local->list [local->index] = -1; - file_list[0] = list[0]; - file_list[1] = list[1]; - file_list[2] = -1; - - local->stbuf.ia_ino = local->ia_ino; - /* TODO: log on failure */ - inode_ctx_put (local->loc1.inode, this, - (uint64_t)(long)local->list); - - if (local->index != 2) { - /* Lookup failed, can't do open */ - gf_log (this->name, GF_LOG_ERROR, - "%s: present on %d nodes", - local->loc1.path, local->index); - file_list[0] = priv->child_count; - for (index = 0; list[index] != -1; index++) { - gf_log (this->name, GF_LOG_ERROR, - "%s: found on %s", local->loc1.path, - priv->xl_array[list[index]]->name); - if (list[index] != priv->child_count) - file_list[1] = list[index]; - } - - if (local->index < 2) { - unify_local_wipe (local); - gf_log (this->name, GF_LOG_ERROR, - "returning EIO as file found on " - "only one node"); - STACK_UNWIND (frame, -1, 
EIO, - local->fd, inode, NULL, - NULL, NULL); - return 0; - } - } - /* Everything is perfect :) */ - local->call_count = 2; - - for (index = 0; file_list[index] != -1; index++) { - char need_break = (file_list[index+1] == -1); - STACK_WIND_COOKIE (frame, - unify_create_open_cbk, - priv->xl_array[file_list[index]], - priv->xl_array[file_list[index]], - priv->xl_array[file_list[index]]->fops->open, - &local->loc1, - local->flags, - local->fd, 0); - if (need_break) - break; - } - } - - return 0; -} - - -/** - * unify_create_cbk - - */ -int32_t -unify_create_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - fd_t *fd, - inode_t *inode, - struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) -{ - int ret = 0; - unify_local_t *local = frame->local; - call_frame_t *prev_frame = cookie; - inode_t *tmp_inode = NULL; - - if (op_ret == -1) { - /* send unlink () on Namespace */ - local->op_errno = op_errno; - local->op_ret = -1; - local->call_count = 1; - gf_log (this->name, GF_LOG_ERROR, - "create failed on %s (file %s, error %s), " - "sending unlink to namespace", - prev_frame->this->name, - local->loc1.path, strerror (op_errno)); - - STACK_WIND (frame, - unify_create_unlink_cbk, - NS(this), - NS(this)->fops->unlink, - &local->loc1); - - return 0; - } - - if (op_ret >= 0) { - local->op_ret = op_ret; - local->stbuf = *buf; - /* Just inode number should be from NS node */ - local->stbuf.ia_ino = local->ia_ino; - - /* TODO: log on failure */ - ret = fd_ctx_set (fd, this, (uint64_t)(long)prev_frame->this); - } - - tmp_inode = local->loc1.inode; - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, local->op_errno, local->fd, - tmp_inode, &local->stbuf, - &local->oldpreparent, &local->oldpostparent); - - return 0; -} - -/** - * unify_ns_create_cbk - - * - */ -int32_t -unify_ns_create_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - fd_t *fd, - inode_t 
*inode, - struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) -{ - struct sched_ops *sched_ops = NULL; - xlator_t *sched_xl = NULL; - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - int16_t *list = NULL; - int16_t index = 0; - - if (op_ret == -1) { - /* No need to send create request to other servers, as - namespace action failed. Handle exclusive create here. */ - if ((op_errno != EEXIST) || - ((op_errno == EEXIST) && - ((local->flags & O_EXCL) == O_EXCL))) { - /* If its just a create call without O_EXCL, - don't do this */ - gf_log (this->name, GF_LOG_ERROR, - "namespace: path(%s): %s", - local->loc1.path, strerror (op_errno)); - unify_local_wipe (local); - STACK_UNWIND (frame, op_ret, op_errno, fd, inode, buf, - preparent, postparent); - return 0; - } - } - - if (op_ret >= 0) { - /* Get the inode number from the NS node */ - local->ia_ino = buf->ia_ino; - - local->oldpreparent = *preparent; - local->oldpostparent = *postparent; - - local->op_ret = -1; - - /* Start the mapping list */ - list = GF_CALLOC (1, sizeof (int16_t) * 3, - gf_unify_mt_int16_t); - ERR_ABORT (list); - inode_ctx_put (inode, this, (uint64_t)(long)list); - list[0] = priv->child_count; - list[2] = -1; - - /* This means, file doesn't exist anywhere in the Filesystem */ - sched_ops = priv->sched_ops; - - /* Send create request to the scheduled node now */ - sched_xl = sched_ops->schedule (this, local->loc1.path); - if (sched_xl == NULL) - { - /* send unlink () on Namespace */ - local->op_errno = ENOTCONN; - local->op_ret = -1; - local->call_count = 1; - gf_log (this->name, GF_LOG_ERROR, - "no node online to schedule create:(file %s) " - "sending unlink to namespace", - (local->loc1.path)?local->loc1.path:""); - - STACK_WIND (frame, - unify_create_unlink_cbk, - NS(this), - NS(this)->fops->unlink, - &local->loc1); - - return 0; - } - - for (index = 0; index < priv->child_count; index++) - if (sched_xl == priv->xl_array[index]) - break; - list[1] = 
index; - - STACK_WIND (frame, unify_create_cbk, - sched_xl, sched_xl->fops->create, - &local->loc1, local->flags, local->mode, fd); - } else { - /* File already exists, and there is no O_EXCL flag */ - - gf_log (this->name, GF_LOG_DEBUG, - "File(%s) already exists on namespace, sending " - "open instead", local->loc1.path); - - local->list = GF_CALLOC (1, sizeof (int16_t) * 3, - gf_unify_mt_int16_t); - ERR_ABORT (local->list); - local->call_count = priv->child_count + 1; - local->op_ret = -1; - for (index = 0; index <= priv->child_count; index++) { - /* Send lookup() to all nodes including namespace */ - STACK_WIND_COOKIE (frame, - unify_create_lookup_cbk, - (void *)(long)index, - priv->xl_array[index], - priv->xl_array[index]->fops->lookup, - &local->loc1, - NULL); - } - } - return 0; -} - -/** - * unify_create - create a file in global namespace first, so other - * clients can see them. Create the file in storage nodes in background. - */ -int32_t -unify_create (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - int32_t flags, - mode_t mode, - fd_t *fd) -{ - unify_local_t *local = NULL; - - /* Initialization */ - INIT_LOCAL (frame, local); - local->mode = mode; - local->flags = flags; - local->fd = fd; - - loc_copy (&local->loc1, loc); - if (local->loc1.path == NULL) { - gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O"); - STACK_UNWIND (frame, -1, ENOMEM, fd, loc->inode, NULL, - NULL, NULL); - return 0; - } - - STACK_WIND (frame, - unify_ns_create_cbk, - NS(this), - NS(this)->fops->create, - loc, - flags | O_EXCL, - mode, - fd); - - return 0; -} - - -/** - * unify_opendir_cbk - - */ -int32_t -unify_opendir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - fd_t *fd) -{ - STACK_UNWIND (frame, op_ret, op_errno, fd); - - return 0; -} - -/** - * unify_opendir - - */ -int32_t -unify_opendir (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - fd_t *fd) -{ - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR 
(loc); - - STACK_WIND (frame, unify_opendir_cbk, - NS(this), NS(this)->fops->opendir, loc, fd); - - return 0; -} - - -int32_t -unify_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *statpre, - struct iatt *statpost) -{ - int32_t callcnt = 0; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - call_frame_t *prev_frame = cookie; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "%s(): child(%s): path(%s): %s", - gf_fop_list[frame->root->op], - prev_frame->this->name, - (local->loc1.path)?local->loc1.path:"", - strerror (op_errno)); - - local->op_errno = op_errno; - if ((op_errno == ENOENT) && priv->optimist) - local->op_ret = 0; - } - - if (op_ret >= 0) { - local->op_ret = 0; - - if (NS (this) == prev_frame->this) { - local->ia_ino = statpost->ia_ino; - /* If the entry is directory, get the stat - from NS node */ - if (IA_ISDIR (statpost->ia_type) || - !local->stpost.ia_blksize) { - local->stpre = *statpre; - local->stpost = *statpost; - } - } - - if ((!IA_ISDIR (statpost->ia_type)) && - (NS (this) != prev_frame->this)) { - /* If file, take the stat info from Storage - node. 
*/ - local->stpre = *statpre; - local->stpost = *statpost; - } - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - /* If the inode number is not filled, operation should - fail */ - if (!local->ia_ino) - local->op_ret = -1; - - local->stpre.ia_ino = local->ia_ino; - local->stpost.ia_ino = local->ia_ino; - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, local->op_errno, - &local->stpre, &local->stpost); - } - - return 0; -} - - -int32_t -unify_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - struct iatt *stbuf, int32_t valid) -{ - unify_local_t *local = NULL; - unify_private_t *priv = this->private; - int32_t index = 0; - int32_t callcnt = 0; - uint64_t tmp_list = 0; - - if (!(loc && loc->inode)) { - STACK_UNWIND (frame, -1, EINVAL, NULL, NULL); - return 0; - } - - /* Initialization */ - INIT_LOCAL (frame, local); - loc_copy (&local->loc1, loc); - - if (IA_ISDIR (loc->inode->ia_type)) { - local->call_count = 1; - - STACK_WIND (frame, - unify_setattr_cbk, - NS (this), - NS (this)->fops->setattr, - loc, stbuf, valid); - } else { - inode_ctx_get (loc->inode, this, &tmp_list); - local->list = (int16_t *)(long)tmp_list; - - for (index = 0; local->list[index] != -1; index++) { - local->call_count++; - callcnt++; - } - - for (index = 0; local->list[index] != -1; index++) { - STACK_WIND (frame, - unify_setattr_cbk, - priv->xl_array[local->list[index]], - priv->xl_array[local->list[index]]->fops->setattr, - loc, stbuf, valid); - - if (!--callcnt) - break; - } - } - - return 0; -} - - -int32_t -unify_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, - struct iatt *stbuf, int32_t valid) -{ - unify_local_t *local = NULL; - xlator_t *child = NULL; - uint64_t tmp_child = 0; - - UNIFY_CHECK_FD_AND_UNWIND_ON_ERR(fd); - - /* Initialization */ - INIT_LOCAL (frame, local); - - if (!fd_ctx_get (fd, this, &tmp_child)) { - /* If its set, then its file */ - child = (xlator_t *)(long)tmp_child; - - local->call_count = 2; - - STACK_WIND (frame, 
unify_setattr_cbk, child, - child->fops->fsetattr, fd, stbuf, valid); - - STACK_WIND (frame, unify_setattr_cbk, NS(this), - NS(this)->fops->fsetattr, fd, stbuf, valid); - } else { - local->call_count = 1; - - STACK_WIND (frame, unify_setattr_cbk, - NS(this), NS(this)->fops->fsetattr, - fd, stbuf, valid); - } - - return 0; -} - - -/** - * unify_truncate_cbk - - */ -int32_t -unify_truncate_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *prebuf, - struct iatt *postbuf) -{ - int32_t callcnt = 0; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - call_frame_t *prev_frame = cookie; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "child(%s): path(%s): %s", - prev_frame->this->name, - (local->loc1.path)?local->loc1.path:"", - strerror (op_errno)); - local->op_errno = op_errno; - if (!((op_errno == ENOENT) && priv->optimist)) - local->op_ret = -1; - } - - if (op_ret >= 0) { - if (NS (this) == prev_frame->this) { - local->ia_ino = postbuf->ia_ino; - /* If the entry is directory, get the - stat from NS node */ - if (IA_ISDIR (postbuf->ia_type) || - !local->stbuf.ia_blksize) { - local->stbuf = *prebuf; - local->poststbuf = *postbuf; - } - } - - if ((!IA_ISDIR (postbuf->ia_type)) && - (NS (this) != prev_frame->this)) { - /* If file, take the stat info from - Storage node. 
*/ - local->stbuf = *prebuf; - local->poststbuf = *postbuf; - } - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - if (local->ia_ino) { - local->stbuf.ia_ino = local->ia_ino; - local->poststbuf.ia_ino = local->ia_ino; - } else { - local->op_ret = -1; - } - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, local->op_errno, - &local->stbuf, &local->poststbuf); - } - - return 0; -} - - -/** - * unify_truncate - - */ -int32_t -unify_truncate (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - off_t offset) -{ - unify_local_t *local = NULL; - unify_private_t *priv = this->private; - int32_t index = 0; - int32_t callcnt = 0; - uint64_t tmp_list = 0; - - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); - - /* Initialization */ - INIT_LOCAL (frame, local); - loc_copy (&local->loc1, loc); - local->ia_ino = loc->inode->ino; - - if (IA_ISDIR (loc->inode->ia_type)) { - local->call_count = 1; - - STACK_WIND (frame, - unify_truncate_cbk, - NS(this), - NS(this)->fops->truncate, - loc, - 0); - } else { - local->op_ret = 0; - inode_ctx_get (loc->inode, this, &tmp_list); - local->list = (int16_t *)(long)tmp_list; - - for (index = 0; local->list[index] != -1; index++) { - local->call_count++; - callcnt++; - } - - /* Don't send offset to NS truncate */ - STACK_WIND (frame, unify_truncate_cbk, NS(this), - NS(this)->fops->truncate, loc, 0); - callcnt--; - - for (index = 0; local->list[index] != -1; index++) { - if (NS(this) != priv->xl_array[local->list[index]]) { - STACK_WIND (frame, - unify_truncate_cbk, - priv->xl_array[local->list[index]], - priv->xl_array[local->list[index]]->fops->truncate, - loc, - offset); - if (!--callcnt) - break; - } - } - } - - return 0; -} - -/** - * unify_readlink_cbk - - */ -int32_t -unify_readlink_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - const char *path, - struct iatt *sbuf) -{ - STACK_UNWIND (frame, op_ret, op_errno, path, sbuf); - return 0; -} - -/** - * unify_readlink - 
Read the link only from the storage node. - */ -int32_t -unify_readlink (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - size_t size) -{ - unify_private_t *priv = this->private; - int32_t entry_count = 0; - int16_t *list = NULL; - int16_t index = 0; - uint64_t tmp_list = 0; - - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); - - inode_ctx_get (loc->inode, this, &tmp_list); - list = (int16_t *)(long)tmp_list; - - for (index = 0; list[index] != -1; index++) - entry_count++; - - if (entry_count >= 2) { - for (index = 0; list[index] != -1; index++) { - if (priv->xl_array[list[index]] != NS(this)) { - STACK_WIND (frame, - unify_readlink_cbk, - priv->xl_array[list[index]], - priv->xl_array[list[index]]->fops->readlink, - loc, - size); - break; - } - } - } else { - gf_log (this->name, GF_LOG_ERROR, - "returning ENOENT, no softlink files found " - "on storage node"); - STACK_UNWIND (frame, -1, ENOENT, NULL); - } - - return 0; -} - - -/** - * unify_unlink_cbk - - */ -int32_t -unify_unlink_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *preparent, - struct iatt *postparent) -{ - int32_t callcnt = 0; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - if (op_ret == 0 || ((op_errno == ENOENT) && priv->optimist)) - local->op_ret = 0; - if (op_ret == -1) - local->op_errno = op_errno; - - if (((call_frame_t *)cookie)->this == NS(this)) { - local->oldpreparent = *preparent; - local->oldpostparent = *postparent; - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, local->op_errno, - &local->oldpreparent, &local->oldpostparent); - } - - return 0; -} - - -/** - * unify_unlink - - */ -int32_t -unify_unlink (call_frame_t *frame, - xlator_t *this, - loc_t *loc) -{ - unify_private_t *priv = this->private; - unify_local_t *local = NULL; - int16_t *list = NULL; - 
int16_t index = 0; - uint64_t tmp_list = 0; - - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); - - /* Initialization */ - INIT_LOCAL (frame, local); - loc_copy (&local->loc1, loc); - - inode_ctx_get (loc->inode, this, &tmp_list); - list = (int16_t *)(long)tmp_list; - - for (index = 0; list[index] != -1; index++) - local->call_count++; - - if (local->call_count) { - for (index = 0; list[index] != -1; index++) { - char need_break = (list[index+1] == -1); - STACK_WIND (frame, - unify_unlink_cbk, - priv->xl_array[list[index]], - priv->xl_array[list[index]]->fops->unlink, - loc); - if (need_break) - break; - } - } else { - gf_log (this->name, GF_LOG_ERROR, - "%s: returning ENOENT", loc->path); - STACK_UNWIND (frame, -1, ENOENT, NULL, NULL); - } - - return 0; -} - - -/** - * unify_readv_cbk - - */ -int32_t -unify_readv_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iovec *vector, - int32_t count, - struct iatt *stbuf, - struct iobref *iobref) -{ - STACK_UNWIND (frame, op_ret, op_errno, vector, count, stbuf, iobref); - return 0; -} - -/** - * unify_readv - - */ -int32_t -unify_readv (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - size_t size, - off_t offset) -{ - UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd); - xlator_t *child = NULL; - uint64_t tmp_child = 0; - - fd_ctx_get (fd, this, &tmp_child); - child = (xlator_t *)(long)tmp_child; - - STACK_WIND (frame, - unify_readv_cbk, - child, - child->fops->readv, - fd, - size, - offset); - - - return 0; -} - -/** - * unify_writev_cbk - - */ -int32_t -unify_writev_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *prebuf, - struct iatt *postbuf) -{ - unify_local_t *local = NULL; - - local = frame->local; - - local->stbuf = *prebuf; - local->stbuf.ia_ino = local->ia_ino; - - local->poststbuf = *postbuf; - local->poststbuf.ia_ino = local->ia_ino; - - STACK_UNWIND (frame, op_ret, op_errno, - &local->stbuf, 
&local->poststbuf); - return 0; -} - -/** - * unify_writev - - */ -int32_t -unify_writev (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - struct iovec *vector, - int32_t count, - off_t off, - struct iobref *iobref) -{ - UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd); - xlator_t *child = NULL; - uint64_t tmp_child = 0; - unify_local_t *local = NULL; - - INIT_LOCAL (frame, local); - local->ia_ino = fd->inode->ino; - - fd_ctx_get (fd, this, &tmp_child); - child = (xlator_t *)(long)tmp_child; - - STACK_WIND (frame, - unify_writev_cbk, - child, - child->fops->writev, - fd, - vector, - count, - off, - iobref); - - return 0; -} - -/** - * unify_ftruncate - - */ -int32_t -unify_ftruncate (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - off_t offset) -{ - xlator_t *child = NULL; - unify_local_t *local = NULL; - uint64_t tmp_child = 0; - - UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR(fd); - - /* Initialization */ - INIT_LOCAL (frame, local); - local->op_ret = 0; - - fd_ctx_get (fd, this, &tmp_child); - child = (xlator_t *)(long)tmp_child; - - local->call_count = 2; - - STACK_WIND (frame, unify_truncate_cbk, - child, child->fops->ftruncate, - fd, offset); - - STACK_WIND (frame, unify_truncate_cbk, - NS(this), NS(this)->fops->ftruncate, - fd, 0); - - return 0; -} - - -/** - * unify_flush_cbk - - */ -int32_t -unify_flush_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - STACK_UNWIND (frame, op_ret, op_errno); - return 0; -} - -/** - * unify_flush - - */ -int32_t -unify_flush (call_frame_t *frame, - xlator_t *this, - fd_t *fd) -{ - UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd); - xlator_t *child = NULL; - uint64_t tmp_child = 0; - - fd_ctx_get (fd, this, &tmp_child); - child = (xlator_t *)(long)tmp_child; - - STACK_WIND (frame, unify_flush_cbk, child, - child->fops->flush, fd); - - return 0; -} - - -/** - * unify_fsync_cbk - - */ -int32_t -unify_fsync_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - 
int32_t op_errno, - struct iatt *prebuf, - struct iatt *postbuf) -{ - STACK_UNWIND (frame, op_ret, op_errno, prebuf, postbuf); - return 0; -} - -/** - * unify_fsync - - */ -int32_t -unify_fsync (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - int32_t flags) -{ - UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd); - xlator_t *child = NULL; - uint64_t tmp_child = 0; - - fd_ctx_get (fd, this, &tmp_child); - child = (xlator_t *)(long)tmp_child; - - STACK_WIND (frame, unify_fsync_cbk, child, - child->fops->fsync, fd, flags); - - return 0; -} - -/** - * unify_fstat - Send fstat FOP to Namespace only if its directory, and to - * both namespace and the storage node if its a file. - */ -int32_t -unify_fstat (call_frame_t *frame, - xlator_t *this, - fd_t *fd) -{ - unify_local_t *local = NULL; - xlator_t *child = NULL; - uint64_t tmp_child = 0; - - UNIFY_CHECK_FD_AND_UNWIND_ON_ERR(fd); - - INIT_LOCAL (frame, local); - local->ia_ino = fd->inode->ino; - - if (!fd_ctx_get (fd, this, &tmp_child)) { - /* If its set, then its file */ - child = (xlator_t *)(long)tmp_child; - local->call_count = 2; - - STACK_WIND (frame, unify_buf_cbk, child, - child->fops->fstat, fd); - - STACK_WIND (frame, unify_buf_cbk, NS(this), - NS(this)->fops->fstat, fd); - - } else { - /* this is an directory */ - local->call_count = 1; - STACK_WIND (frame, unify_buf_cbk, NS(this), - NS(this)->fops->fstat, fd); - } - - return 0; -} - -/** - * unify_getdents_cbk - - */ -int32_t -unify_getdents_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dir_entry_t *entry, - int32_t count) -{ - STACK_UNWIND (frame, op_ret, op_errno, entry, count); - return 0; -} - -/** - * unify_getdents - send the FOP request to all the nodes. 
- */ -int32_t -unify_getdents (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - size_t size, - off_t offset, - int32_t flag) -{ - UNIFY_CHECK_FD_AND_UNWIND_ON_ERR (fd); - - STACK_WIND (frame, unify_getdents_cbk, NS(this), - NS(this)->fops->getdents, fd, size, offset, flag); - - return 0; -} - - -/** - * unify_readdir_cbk - - */ -int32_t -unify_readdir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - gf_dirent_t *buf) -{ - STACK_UNWIND (frame, op_ret, op_errno, buf); - - return 0; -} - -/** - * unify_readdir - send the FOP request to all the nodes. - */ -int32_t -unify_readdir (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - size_t size, - off_t offset) -{ - UNIFY_CHECK_FD_AND_UNWIND_ON_ERR (fd); - - STACK_WIND (frame, unify_readdir_cbk, NS(this), - NS(this)->fops->readdir, fd, size, offset); - - return 0; -} - - -int32_t -unify_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, gf_dirent_t *buf) -{ - STACK_UNWIND (frame, op_ret, op_errno, buf); - - return 0; -} - - -int32_t -unify_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset) -{ - UNIFY_CHECK_FD_AND_UNWIND_ON_ERR (fd); - - STACK_WIND (frame, unify_readdirp_cbk, NS(this), - NS(this)->fops->readdirp, fd, size, offset); - - return 0; -} - - -/** - * unify_fsyncdir_cbk - - */ -int32_t -unify_fsyncdir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - STACK_UNWIND (frame, op_ret, op_errno); - - return 0; -} - -/** - * unify_fsyncdir - - */ -int32_t -unify_fsyncdir (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - int32_t flags) -{ - UNIFY_CHECK_FD_AND_UNWIND_ON_ERR (fd); - - STACK_WIND (frame, unify_fsyncdir_cbk, - NS(this), NS(this)->fops->fsyncdir, fd, flags); - - return 0; -} - -/** - * unify_lk_cbk - UNWIND frame with the proper return arguments. 
- */ -int32_t -unify_lk_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct gf_flock *lock) -{ - STACK_UNWIND (frame, op_ret, op_errno, lock); - return 0; -} - -/** - * unify_lk - Send it to all the storage nodes, (should be 1) which has file. - */ -int32_t -unify_lk (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - int32_t cmd, - struct gf_flock *lock) -{ - UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd); - xlator_t *child = NULL; - uint64_t tmp_child = 0; - - fd_ctx_get (fd, this, &tmp_child); - child = (xlator_t *)(long)tmp_child; - - STACK_WIND (frame, unify_lk_cbk, child, - child->fops->lk, fd, cmd, lock); - - return 0; -} - - -int32_t -unify_setxattr_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno); - -static int32_t -unify_setxattr_file_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - unify_private_t *private = this->private; - unify_local_t *local = frame->local; - xlator_t *sched_xl = NULL; - struct sched_ops *sched_ops = NULL; - - if (op_ret == -1) { - if (!ENOTSUP) - gf_log (this->name, GF_LOG_ERROR, - "setxattr with XATTR_CREATE on ns: " - "path(%s) key(%s): %s", - local->loc1.path, local->name, - strerror (op_errno)); - unify_local_wipe (local); - STACK_UNWIND (frame, op_ret, op_errno); - return 0; - } - - LOCK (&frame->lock); - { - local->failed = 0; - local->op_ret = 0; - local->op_errno = 0; - local->call_count = 1; - } - UNLOCK (&frame->lock); - - /* schedule XATTR_CREATE on one of the child node */ - sched_ops = private->sched_ops; - - /* Send create request to the scheduled node now */ - sched_xl = sched_ops->schedule (this, local->name); - if (!sched_xl) { - STACK_UNWIND (frame, -1, ENOTCONN); - return 0; - } - - STACK_WIND (frame, - unify_setxattr_cbk, - sched_xl, - sched_xl->fops->setxattr, - &local->loc1, - local->dict, - local->flags); - return 0; -} - -/** - * unify_setxattr_cbk - When 
all the child nodes return, UNWIND frame. - */ -int32_t -unify_setxattr_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - int32_t callcnt = 0; - unify_local_t *local = frame->local; - call_frame_t *prev_frame = cookie; - dict_t *dict = NULL; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret == -1) { - gf_log (this->name, (((op_errno == ENOENT) || - (op_errno == ENOTSUP))? - GF_LOG_DEBUG : GF_LOG_ERROR), - "child(%s): path(%s): %s", - prev_frame->this->name, - (local->loc1.path)?local->loc1.path:"", - strerror (op_errno)); - if (local->failed == -1) { - local->failed = 1; - } - local->op_errno = op_errno; - } else { - local->failed = 0; - local->op_ret = op_ret; - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - if (local->failed && local->name && - ZR_FILE_CONTENT_REQUEST(local->name)) { - dict = get_new_dict (); - dict_set (dict, local->dict->members_list->key, - data_from_dynptr(NULL, 0)); - dict_ref (dict); - - local->call_count = 1; - - STACK_WIND (frame, - unify_setxattr_file_cbk, - NS(this), - NS(this)->fops->setxattr, - &local->loc1, - dict, - XATTR_CREATE); - - dict_unref (dict); - return 0; - } - - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, local->op_errno); - } - - return 0; -} - -/** - * unify_sexattr - This function should be sent to all the storage nodes, - * which contains the file, (excluding namespace). 
- */ -int32_t -unify_setxattr (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - dict_t *dict, - int32_t flags) -{ - unify_private_t *priv = this->private; - unify_local_t *local = NULL; - int16_t *list = NULL; - int16_t index = 0; - int32_t call_count = 0; - uint64_t tmp_list = 0; - data_pair_t *trav = dict->members_list; - - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); - - /* Initialization */ - INIT_LOCAL (frame, local); - local->failed = -1; - loc_copy (&local->loc1, loc); - - if (IA_ISDIR (loc->inode->ia_type)) { - - if (trav && trav->key && ZR_FILE_CONTENT_REQUEST(trav->key)) { - /* direct the storage xlators to change file - content only if file exists */ - local->flags = flags; - local->dict = dict; - local->name = gf_strdup (trav->key); - flags |= XATTR_REPLACE; - } - - local->call_count = priv->child_count; - for (index = 0; index < priv->child_count; index++) { - STACK_WIND (frame, - unify_setxattr_cbk, - priv->xl_array[index], - priv->xl_array[index]->fops->setxattr, - loc, dict, flags); - } - return 0; - } - - inode_ctx_get (loc->inode, this, &tmp_list); - list = (int16_t *)(long)tmp_list; - - for (index = 0; list[index] != -1; index++) { - if (NS(this) != priv->xl_array[list[index]]) { - local->call_count++; - call_count++; - } - } - - if (local->call_count) { - for (index = 0; list[index] != -1; index++) { - if (priv->xl_array[list[index]] != NS(this)) { - STACK_WIND (frame, - unify_setxattr_cbk, - priv->xl_array[list[index]], - priv->xl_array[list[index]]->fops->setxattr, - loc, - dict, - flags); - if (!--call_count) - break; - } - } - return 0; - } - - /* No entry in storage nodes */ - gf_log (this->name, GF_LOG_DEBUG, - "returning ENOENT, file not found on storage node."); - STACK_UNWIND (frame, -1, ENOENT); - - return 0; -} - - -/** - * unify_getxattr_cbk - This function is called from only one child, so, no - * need of any lock or anything else, just send it to above layer - */ -int32_t -unify_getxattr_cbk (call_frame_t *frame, - void 
*cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dict_t *value) -{ - int32_t callcnt = 0; - dict_t *local_value = NULL; - unify_local_t *local = frame->local; - call_frame_t *prev_frame = cookie; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret == -1) { - local->op_errno = op_errno; - gf_log (this->name, - (((op_errno == ENOENT) || - (op_errno == ENODATA) || - (op_errno == ENOTSUP)) ? - GF_LOG_DEBUG : GF_LOG_ERROR), - "child(%s): path(%s): %s", - prev_frame->this->name, - (local->loc1.path)?local->loc1.path:"", - strerror (op_errno)); - } else { - if (!local->dict) - local->dict = dict_ref (value); - local->op_ret = op_ret; - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - local_value = local->dict; - local->dict = NULL; - - STACK_UNWIND (frame, local->op_ret, local->op_errno, - local_value); - - if (local_value) - dict_unref (local_value); - } - - return 0; -} - - -/** - * unify_getxattr - This FOP is sent to only the storage node. - */ -int32_t -unify_getxattr (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - const char *name) -{ - unify_private_t *priv = this->private; - int16_t *list = NULL; - int16_t index = 0; - int16_t count = 0; - unify_local_t *local = NULL; - uint64_t tmp_list = 0; - - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); - INIT_LOCAL (frame, local); - - if (IA_ISDIR (loc->inode->ia_type)) { - local->call_count = priv->child_count; - for (index = 0; index < priv->child_count; index++) - STACK_WIND (frame, - unify_getxattr_cbk, - priv->xl_array[index], - priv->xl_array[index]->fops->getxattr, - loc, - name); - return 0; - } - - inode_ctx_get (loc->inode, this, &tmp_list); - list = (int16_t *)(long)tmp_list; - - for (index = 0; list[index] != -1; index++) { - if (NS(this) != priv->xl_array[list[index]]) { - local->call_count++; - count++; - } - } - - if (count) { - for (index = 0; list[index] != -1; index++) { - if (priv->xl_array[list[index]] != NS(this)) { - STACK_WIND (frame, - 
unify_getxattr_cbk, - priv->xl_array[list[index]], - priv->xl_array[list[index]]->fops->getxattr, - loc, - name); - if (!--count) - break; - } - } - } else { - dict_t *tmp_dict = get_new_dict (); - gf_log (this->name, GF_LOG_DEBUG, - "%s: returning ENODATA, no file found on storage node", - loc->path); - STACK_UNWIND (frame, -1, ENODATA, tmp_dict); - dict_destroy (tmp_dict); - } - - return 0; -} - -/** - * unify_removexattr_cbk - Wait till all the child node returns the call - * and then UNWIND to above layer. - */ -int32_t -unify_removexattr_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - int32_t callcnt = 0; - unify_local_t *local = frame->local; - call_frame_t *prev_frame = cookie; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - if (op_ret == -1) { - local->op_errno = op_errno; - if (op_errno != ENOTSUP) - gf_log (this->name, GF_LOG_ERROR, - "child(%s): path(%s): %s", - prev_frame->this->name, - local->loc1.path, strerror (op_errno)); - } else { - local->op_ret = op_ret; - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - STACK_UNWIND (frame, local->op_ret, local->op_errno); - } - - return 0; -} - -/** - * unify_removexattr - Send it to all the child nodes which has the files. 
- */ -int32_t -unify_removexattr (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - const char *name) -{ - unify_private_t *priv = this->private; - unify_local_t *local = NULL; - int16_t *list = NULL; - int16_t index = 0; - int32_t call_count = 0; - uint64_t tmp_list = 0; - - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); - - /* Initialization */ - INIT_LOCAL (frame, local); - - if (IA_ISDIR (loc->inode->ia_type)) { - local->call_count = priv->child_count; - for (index = 0; index < priv->child_count; index++) - STACK_WIND (frame, - unify_removexattr_cbk, - priv->xl_array[index], - priv->xl_array[index]->fops->removexattr, - loc, - name); - - return 0; - } - - inode_ctx_get (loc->inode, this, &tmp_list); - list = (int16_t *)(long)tmp_list; - - for (index = 0; list[index] != -1; index++) { - if (NS(this) != priv->xl_array[list[index]]) { - local->call_count++; - call_count++; - } - } - - if (local->call_count) { - for (index = 0; list[index] != -1; index++) { - if (priv->xl_array[list[index]] != NS(this)) { - STACK_WIND (frame, - unify_removexattr_cbk, - priv->xl_array[list[index]], - priv->xl_array[list[index]]->fops->removexattr, - loc, - name); - if (!--call_count) - break; - } - } - return 0; - } - - gf_log (this->name, GF_LOG_DEBUG, - "%s: returning ENOENT, not found on storage node.", loc->path); - STACK_UNWIND (frame, -1, ENOENT); - - return 0; -} - - -int32_t -unify_mknod_unlink_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *preparent, - struct iatt *postparent) -{ - unify_local_t *local = frame->local; - - if (op_ret == -1) - gf_log (this->name, GF_LOG_ERROR, - "%s: %s", local->loc1.path, strerror (op_errno)); - - unify_local_wipe (local); - /* No log required here as this -1 is for mknod call */ - STACK_UNWIND (frame, -1, local->op_errno, NULL, NULL); - return 0; -} - -/** - * unify_mknod_cbk - - */ -int32_t -unify_mknod_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - 
int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) -{ - unify_local_t *local = frame->local; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "mknod failed on storage node, sending unlink to " - "namespace"); - local->op_errno = op_errno; - STACK_WIND (frame, - unify_mknod_unlink_cbk, - NS(this), - NS(this)->fops->unlink, - &local->loc1); - return 0; - } - - local->stbuf = *buf; - local->stbuf.ia_ino = local->ia_ino; - unify_local_wipe (local); - STACK_UNWIND (frame, op_ret, op_errno, inode, &local->stbuf, - &local->oldpreparent, &local->oldpostparent); - return 0; -} - -/** - * unify_ns_mknod_cbk - - */ -int32_t -unify_ns_mknod_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) -{ - struct sched_ops *sched_ops = NULL; - xlator_t *sched_xl = NULL; - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - int16_t *list = NULL; - int16_t index = 0; - call_frame_t *prev_frame = cookie; - - if (op_ret == -1) { - /* No need to send mknod request to other servers, - * as namespace action failed - */ - gf_log (this->name, GF_LOG_ERROR, - "child(%s): path(%s): %s", - prev_frame->this->name, local->loc1.path, - strerror (op_errno)); - unify_local_wipe (local); - STACK_UNWIND (frame, op_ret, op_errno, inode, buf, - preparent, postparent); - return 0; - } - - /* Create one inode for this entry */ - local->op_ret = 0; - local->stbuf = *buf; - local->ia_ino = buf->ia_ino; - - local->oldpreparent = *preparent; - local->oldpostparent = *postparent; - - list = GF_CALLOC (1, sizeof (int16_t) * 3, gf_unify_mt_int16_t); - ERR_ABORT (list); - list[0] = priv->child_count; - list[2] = -1; - inode_ctx_put (inode, this, (uint64_t)(long)list); - - sched_ops = priv->sched_ops; - - /* Send mknod request to scheduled node now */ - sched_xl = 
sched_ops->schedule (this, local->loc1.path); - if (!sched_xl) { - gf_log (this->name, GF_LOG_ERROR, - "mknod failed on storage node, no node online " - "at the moment, sending unlink to NS"); - local->op_errno = ENOTCONN; - STACK_WIND (frame, - unify_mknod_unlink_cbk, - NS(this), - NS(this)->fops->unlink, - &local->loc1); - - return 0; - } - - for (index = 0; index < priv->child_count; index++) - if (sched_xl == priv->xl_array[index]) - break; - list[1] = index; - - STACK_WIND (frame, unify_mknod_cbk, - sched_xl, sched_xl->fops->mknod, - &local->loc1, local->mode, local->dev); - - return 0; -} - -/** - * unify_mknod - Create a device on namespace first, and later create on - * the storage node. - */ -int32_t -unify_mknod (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - mode_t mode, - dev_t rdev) -{ - unify_local_t *local = NULL; - - /* Initialization */ - INIT_LOCAL (frame, local); - local->mode = mode; - local->dev = rdev; - loc_copy (&local->loc1, loc); - if (local->loc1.path == NULL) { - gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O"); - STACK_UNWIND (frame, -1, ENOMEM, loc->inode, NULL); - return 0; - } - - STACK_WIND (frame, - unify_ns_mknod_cbk, - NS(this), - NS(this)->fops->mknod, - loc, - mode, - rdev); - - return 0; -} - -int32_t -unify_symlink_unlink_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *preparent, - struct iatt *postparent) -{ - unify_local_t *local = frame->local; - if (op_ret == -1) - gf_log (this->name, GF_LOG_ERROR, - "%s: %s", local->loc1.path, strerror (op_errno)); - - unify_local_wipe (local); - STACK_UNWIND (frame, -1, local->op_errno, NULL, NULL); - return 0; -} - -/** - * unify_symlink_cbk - - */ -int32_t -unify_symlink_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) -{ - unify_local_t *local = frame->local; - - 
if (op_ret == -1) { - /* Symlink on storage node failed, hence send unlink - to the NS node */ - local->op_errno = op_errno; - gf_log (this->name, GF_LOG_ERROR, - "symlink on storage node failed, sending unlink " - "to namespace"); - - STACK_WIND (frame, - unify_symlink_unlink_cbk, - NS(this), - NS(this)->fops->unlink, - &local->loc1); - - return 0; - } - - local->stbuf = *buf; - local->stbuf.ia_ino = local->ia_ino; - unify_local_wipe (local); - STACK_UNWIND (frame, op_ret, op_errno, inode, &local->stbuf, - &local->oldpreparent, &local->oldpostparent); - - return 0; -} - -/** - * unify_ns_symlink_cbk - - */ -int32_t -unify_ns_symlink_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) -{ - - struct sched_ops *sched_ops = NULL; - xlator_t *sched_xl = NULL; - int16_t *list = NULL; - unify_local_t *local = frame->local; - unify_private_t *priv = this->private; - int16_t index = 0; - - if (op_ret == -1) { - /* No need to send symlink request to other servers, - * as namespace action failed - */ - gf_log (this->name, GF_LOG_ERROR, - "namespace: path(%s): %s", - local->loc1.path, strerror (op_errno)); - unify_local_wipe (local); - STACK_UNWIND (frame, op_ret, op_errno, NULL, buf, - preparent, postparent); - return 0; - } - - /* Create one inode for this entry */ - local->op_ret = 0; - local->ia_ino = buf->ia_ino; - - local->oldpreparent = *preparent; - local->oldpostparent = *postparent; - - /* Start the mapping list */ - - list = GF_CALLOC (1, sizeof (int16_t) * 3, gf_unify_mt_int16_t); - ERR_ABORT (list); - list[0] = priv->child_count; //namespace's index - list[2] = -1; - inode_ctx_put (inode, this, (uint64_t)(long)list); - - sched_ops = priv->sched_ops; - - /* Send symlink request to all the nodes now */ - sched_xl = sched_ops->schedule (this, local->loc1.path); - if (!sched_xl) { - /* Symlink on storage node failed, hence send 
unlink - to the NS node */ - local->op_errno = ENOTCONN; - gf_log (this->name, GF_LOG_ERROR, - "symlink on storage node failed, no node online, " - "sending unlink to namespace"); - - STACK_WIND (frame, - unify_symlink_unlink_cbk, - NS(this), - NS(this)->fops->unlink, - &local->loc1); - - return 0; - } - - for (index = 0; index < priv->child_count; index++) - if (sched_xl == priv->xl_array[index]) - break; - list[1] = index; - - STACK_WIND (frame, - unify_symlink_cbk, - sched_xl, - sched_xl->fops->symlink, - local->name, - &local->loc1); - - return 0; -} - -/** - * unify_symlink - - */ -int32_t -unify_symlink (call_frame_t *frame, - xlator_t *this, - const char *linkpath, - loc_t *loc) -{ - unify_local_t *local = NULL; - - /* Initialization */ - INIT_LOCAL (frame, local); - loc_copy (&local->loc1, loc); - local->name = gf_strdup (linkpath); - - if ((local->name == NULL) || - (local->loc1.path == NULL)) { - gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O"); - STACK_UNWIND (frame, -1, ENOMEM, loc->inode, NULL); - return 0; - } - - STACK_WIND (frame, - unify_ns_symlink_cbk, - NS(this), - NS(this)->fops->symlink, - linkpath, - loc); - - return 0; -} - - -int32_t -unify_rename_unlink_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *preparent, - struct iatt *postparent) -{ - int32_t callcnt = 0; - unify_local_t *local = frame->local; - call_frame_t *prev_frame = cookie; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "child(%s): path(%s -> %s): %s", - prev_frame->this->name, - local->loc1.path, local->loc2.path, - strerror (op_errno)); - - } - LOCK (&frame->lock); - { - callcnt = --local->call_count; - } - UNLOCK (&frame->lock); - - if (!callcnt) { - local->stbuf.ia_ino = local->ia_ino; - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, local->op_errno, - &local->stbuf); - } - return 0; -} - -int32_t -unify_ns_rename_undo_cbk (call_frame_t *frame, - void *cookie, - 
xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *buf, - struct iatt *preoldparent, - struct iatt *postoldparent, - struct iatt *prenewparent, - struct iatt *postnewparent) -{ - unify_local_t *local = frame->local; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "namespace: path(%s -> %s): %s", - local->loc1.path, local->loc2.path, - strerror (op_errno)); - } - - local->stbuf.ia_ino = local->ia_ino; - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, local->op_errno, &local->stbuf); - return 0; -} - -int32_t -unify_rename_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *buf, - struct iatt *preoldparent, - struct iatt *postoldparent, - struct iatt *prenewparent, - struct iatt *postnewparent) -{ - int32_t index = 0; - int32_t callcnt = 0; - int16_t *list = NULL; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - call_frame_t *prev_frame = cookie; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - if (op_ret >= 0) { - if (!IA_ISDIR (buf->ia_type)) - local->stbuf = *buf; - local->op_ret = op_ret; - } else { - gf_log (this->name, GF_LOG_ERROR, - "child(%s): path(%s -> %s): %s", - prev_frame->this->name, - local->loc1.path, local->loc2.path, - strerror (op_errno)); - local->op_errno = op_errno; - } - } - UNLOCK (&frame->lock); - - if (!callcnt) { - local->stbuf.ia_ino = local->ia_ino; - if (IA_ISDIR (local->loc1.inode->ia_type)) { - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, local->op_errno, - &local->stbuf, &local->oldpreparent, - &local->oldpostparent, &local->newpreparent, - &local->newpostparent); - return 0; - } - - if (local->op_ret == -1) { - /* TODO: check this logic */ - - /* Rename failed in storage node, successful on NS, - * hence, rename back the entries in NS */ - /* NOTE: this will be done only if the destination - * doesn't exists, if the destination exists, the - * job of 
correcting NS is left to self-heal - */ - if (!local->index) { - loc_t tmp_oldloc = { - /* its actual 'newloc->path' */ - .path = local->loc2.path, - .inode = local->loc1.inode, - .parent = local->loc2.parent - }; - - loc_t tmp_newloc = { - /* Actual 'oldloc->path' */ - .path = local->loc1.path, - .parent = local->loc1.parent - }; - - gf_log (this->name, GF_LOG_ERROR, - "rename succussful on namespace, on " - "stroage node failed, reverting back"); - - STACK_WIND (frame, - unify_ns_rename_undo_cbk, - NS(this), - NS(this)->fops->rename, - &tmp_oldloc, - &tmp_newloc); - return 0; - } - } else { - /* Rename successful on storage nodes */ - - int32_t idx = 0; - int16_t *tmp_list = NULL; - uint64_t tmp_list_int64 = 0; - if (local->loc2.inode) { - inode_ctx_get (local->loc2.inode, - this, &tmp_list_int64); - list = (int16_t *)(long)tmp_list_int64; - - } - - if (list) { - for (index = 0; list[index] != -1; index++); - tmp_list = GF_CALLOC (1, index * 2, - gf_unify_mt_int16_t); - memcpy (tmp_list, list, index * 2); - - for (index = 0; list[index] != -1; index++) { - /* TODO: Check this logic. 
*/ - /* If the destination file exists in - * the same storage node where we sent - * 'rename' call, no need to send - * unlink - */ - for (idx = 0; - local->list[idx] != -1; idx++) { - if (tmp_list[index] == local->list[idx]) { - tmp_list[index] = priv->child_count; - continue; - } - } - - if (NS(this) != priv->xl_array[tmp_list[index]]) { - local->call_count++; - callcnt++; - } - } - - if (local->call_count) { - if (callcnt > 1) - gf_log (this->name, - GF_LOG_ERROR, - "%s->%s: more (%d) " - "subvolumes have the " - "newloc entry", - local->loc1.path, - local->loc2.path, - callcnt); - - for (index=0; - tmp_list[index] != -1; index++) { - if (NS(this) != priv->xl_array[tmp_list[index]]) { - STACK_WIND (frame, - unify_rename_unlink_cbk, - priv->xl_array[tmp_list[index]], - priv->xl_array[tmp_list[index]]->fops->unlink, - &local->loc2); - if (!--callcnt) - break; - } - } - - GF_FREE (tmp_list); - return 0; - } - if (tmp_list) - GF_FREE (tmp_list); - } - } - - /* Need not send 'unlink' to storage node */ - unify_local_wipe (local); - STACK_UNWIND (frame, local->op_ret, - local->op_errno, &local->stbuf, - &local->oldpreparent, &local->oldpostparent, - &local->newpreparent, &local->newpostparent); - } - - return 0; -} - -int32_t -unify_ns_rename_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct iatt *buf, - struct iatt *preoldparent, - struct iatt *postoldparent, - struct iatt *prenewparent, - struct iatt *postnewparent) -{ - int32_t index = 0; - int32_t callcnt = 0; - int16_t *list = NULL; - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - - if (op_ret == -1) { - /* Free local->new_inode */ - gf_log (this->name, GF_LOG_ERROR, - "namespace: path(%s -> %s): %s", - local->loc1.path, local->loc2.path, - strerror (op_errno)); - - unify_local_wipe (local); - STACK_UNWIND (frame, op_ret, op_errno, buf, - preoldparent, postoldparent, - prenewparent, postnewparent); - return 0; - } - - 
local->stbuf = *buf; - local->ia_ino = buf->ia_ino; - - local->oldpreparent = *preoldparent; - local->oldpostparent = *postoldparent; - local->newpreparent = *prenewparent; - local->newpostparent = *postnewparent; - - /* Everything is fine. */ - if (IA_ISDIR (buf->ia_type)) { - local->call_count = priv->child_count; - for (index=0; index < priv->child_count; index++) { - STACK_WIND (frame, - unify_rename_cbk, - priv->xl_array[index], - priv->xl_array[index]->fops->rename, - &local->loc1, - &local->loc2); - } - - return 0; - } - - local->call_count = 0; - /* send rename */ - list = local->list; - for (index=0; list[index] != -1; index++) { - if (NS(this) != priv->xl_array[list[index]]) { - local->call_count++; - callcnt++; - } - } - - if (local->call_count) { - for (index=0; list[index] != -1; index++) { - if (NS(this) != priv->xl_array[list[index]]) { - STACK_WIND (frame, - unify_rename_cbk, - priv->xl_array[list[index]], - priv->xl_array[list[index]]->fops->rename, - &local->loc1, - &local->loc2); - if (!--callcnt) - break; - } - } - } else { - /* file doesn't seem to be present in storage nodes */ - gf_log (this->name, GF_LOG_CRITICAL, - "CRITICAL: source file not in storage node, " - "rename successful on namespace :O"); - unify_local_wipe (local); - STACK_UNWIND (frame, -1, EIO, NULL, - NULL, NULL, /* preoldparent, postoldparent */ - NULL, NULL); /* prenewparent, postnewparent */ - } - return 0; -} - - -/** - * unify_rename - One of the tricky function. 
The deadliest of all :O - */ -int32_t -unify_rename (call_frame_t *frame, - xlator_t *this, - loc_t *oldloc, - loc_t *newloc) -{ - unify_local_t *local = NULL; - uint64_t tmp_list = 0; - - /* Initialization */ - INIT_LOCAL (frame, local); - loc_copy (&local->loc1, oldloc); - loc_copy (&local->loc2, newloc); - - if ((local->loc1.path == NULL) || - (local->loc2.path == NULL)) { - gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O"); - STACK_UNWIND (frame, -1, ENOMEM, NULL, - NULL, NULL, /* preoldparent, postoldparent */ - NULL, NULL); /* prenewparent, postnewparent */ - return 0; - } - - inode_ctx_get (oldloc->inode, this, &tmp_list); - local->list = (int16_t *)(long)tmp_list; - - STACK_WIND (frame, - unify_ns_rename_cbk, - NS(this), - NS(this)->fops->rename, - oldloc, - newloc); - return 0; -} - -/** - * unify_link_cbk - - */ -int32_t -unify_link_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) -{ - unify_local_t *local = frame->local; - - if (op_ret >= 0) - local->stbuf = *buf; - local->stbuf.ia_ino = local->ia_ino; - - unify_local_wipe (local); - STACK_UNWIND (frame, op_ret, op_errno, inode, &local->stbuf, - &local->oldpreparent, &local->oldpostparent); - - return 0; -} - -/** - * unify_ns_link_cbk - - */ -int32_t -unify_ns_link_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - inode_t *inode, - struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) -{ - unify_private_t *priv = this->private; - unify_local_t *local = frame->local; - int16_t *list = local->list; - int16_t index = 0; - - if (op_ret == -1) { - /* No need to send link request to other servers, - * as namespace action failed - */ - gf_log (this->name, GF_LOG_ERROR, - "namespace: path(%s -> %s): %s", - local->loc1.path, local->loc2.path, - strerror (op_errno)); - unify_local_wipe (local); - 
STACK_UNWIND (frame, op_ret, op_errno, inode, buf, - preparent, postparent); - return 0; - } - - /* Update inode for this entry */ - local->op_ret = 0; - local->ia_ino = buf->ia_ino; - - local->oldpreparent = *preparent; - local->oldpostparent = *postparent; - - /* Send link request to the node now */ - for (index = 0; list[index] != -1; index++) { - char need_break = (list[index+1] == -1); - if (priv->xl_array[list[index]] != NS (this)) { - STACK_WIND (frame, - unify_link_cbk, - priv->xl_array[list[index]], - priv->xl_array[list[index]]->fops->link, - &local->loc1, - &local->loc2); - break; - } - if (need_break) - break; - } - - return 0; -} - -/** - * unify_link - - */ -int32_t -unify_link (call_frame_t *frame, - xlator_t *this, - loc_t *oldloc, - loc_t *newloc) -{ - unify_local_t *local = NULL; - uint64_t tmp_list = 0; - - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (oldloc); - UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (newloc); - - /* Initialization */ - INIT_LOCAL (frame, local); - - loc_copy (&local->loc1, oldloc); - loc_copy (&local->loc2, newloc); - - inode_ctx_get (oldloc->inode, this, &tmp_list); - local->list = (int16_t *)(long)tmp_list; - - STACK_WIND (frame, - unify_ns_link_cbk, - NS(this), - NS(this)->fops->link, - oldloc, - newloc); - - return 0; -} - - -/** - * unify_checksum_cbk - - */ -int32_t -unify_checksum_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - uint8_t *fchecksum, - uint8_t *dchecksum) -{ - STACK_UNWIND (frame, op_ret, op_errno, fchecksum, dchecksum); - - return 0; -} - -/** - * unify_checksum - - */ -int32_t -unify_checksum (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - int32_t flag) -{ - STACK_WIND (frame, - unify_checksum_cbk, - NS(this), - NS(this)->fops->checksum, - loc, - flag); - - return 0; -} - - -/** - * unify_finodelk_cbk - - */ -int -unify_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - STACK_UNWIND (frame, 
op_ret, op_errno); - return 0; -} - -/** - * unify_finodelk - */ -int -unify_finodelk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, int cmd, struct gf_flock *flock) -{ - UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd); - xlator_t *child = NULL; - uint64_t tmp_child = 0; - - fd_ctx_get (fd, this, &tmp_child); - child = (xlator_t *)(long)tmp_child; - - STACK_WIND (frame, unify_finodelk_cbk, - child, child->fops->finodelk, - volume, fd, cmd, flock); - - return 0; -} - - - -/** - * unify_fentrylk_cbk - - */ -int -unify_fentrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - STACK_UNWIND (frame, op_ret, op_errno); - return 0; -} - -/** - * unify_fentrylk - */ -int -unify_fentrylk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, const char *basename, - entrylk_cmd cmd, entrylk_type type) - -{ - UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd); - xlator_t *child = NULL; - uint64_t tmp_child = 0; - - fd_ctx_get (fd, this, &tmp_child); - child = (xlator_t *)(long)tmp_child; - - STACK_WIND (frame, unify_fentrylk_cbk, - child, child->fops->fentrylk, - volume, fd, basename, cmd, type); - - return 0; -} - - - -/** - * unify_fxattrop_cbk - - */ -int -unify_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xattr) -{ - STACK_UNWIND (frame, op_ret, op_errno, xattr); - return 0; -} - -/** - * unify_fxattrop - */ -int -unify_fxattrop (call_frame_t *frame, xlator_t *this, - fd_t *fd, gf_xattrop_flags_t optype, dict_t *xattr) -{ - UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd); - xlator_t *child = NULL; - uint64_t tmp_child = 0; - - fd_ctx_get (fd, this, &tmp_child); - child = (xlator_t *)(long)tmp_child; - - STACK_WIND (frame, unify_fxattrop_cbk, - child, child->fops->fxattrop, - fd, optype, xattr); - - return 0; -} - - -/** - * unify_inodelk_cbk - - */ -int -unify_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, 
int32_t op_errno) -{ - STACK_UNWIND (frame, op_ret, op_errno); - return 0; -} - - -/** - * unify_inodelk - */ -int -unify_inodelk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, int cmd, struct gf_flock *flock) -{ - xlator_t *child = NULL; - - child = unify_loc_subvol (loc, this); - - STACK_WIND (frame, unify_inodelk_cbk, - child, child->fops->inodelk, - volume, loc, cmd, flock); - - return 0; -} - - - -/** - * unify_entrylk_cbk - - */ -int -unify_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - STACK_UNWIND (frame, op_ret, op_errno); - return 0; -} - -/** - * unify_entrylk - */ -int -unify_entrylk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, const char *basename, - entrylk_cmd cmd, entrylk_type type) - -{ - xlator_t *child = NULL; - - child = unify_loc_subvol (loc, this); - - STACK_WIND (frame, unify_entrylk_cbk, - child, child->fops->entrylk, - volume, loc, basename, cmd, type); - - return 0; -} - - - -/** - * unify_xattrop_cbk - - */ -int -unify_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xattr) -{ - STACK_UNWIND (frame, op_ret, op_errno, xattr); - return 0; -} - -/** - * unify_xattrop - */ -int -unify_xattrop (call_frame_t *frame, xlator_t *this, - loc_t *loc, gf_xattrop_flags_t optype, dict_t *xattr) -{ - xlator_t *child = NULL; - - child = unify_loc_subvol (loc, this); - - STACK_WIND (frame, unify_xattrop_cbk, - child, child->fops->xattrop, - loc, optype, xattr); - - return 0; -} - -int -unify_forget (xlator_t *this, - inode_t *inode) -{ - int16_t *list = NULL; - uint64_t tmp_list = 0; - - if (inode->ia_type && (!IA_ISDIR(inode->ia_type))) { - inode_ctx_get (inode, this, &tmp_list); - if (tmp_list) { - list = (int16_t *)(long)tmp_list; - GF_FREE (list); - } - } - - return 0; -} - -/** - * notify - */ -int32_t -notify (xlator_t *this, - int32_t event, - void *data, - ...) 
-{ - unify_private_t *priv = this->private; - struct sched_ops *sched = NULL; - - if (!priv) { - return 0; - } - - sched = priv->sched_ops; - if (!sched) { - gf_log (this->name, GF_LOG_CRITICAL, "No scheduler :O"); - raise (SIGTERM); - return 0; - } - if (priv->namespace == data) { - if (event == GF_EVENT_CHILD_UP) { - sched->notify (this, event, data); - } - return 0; - } - - switch (event) - { - case GF_EVENT_CHILD_UP: - { - /* Call scheduler's update () to enable it for scheduling */ - sched->notify (this, event, data); - - LOCK (&priv->lock); - { - /* Increment the inode's generation, which is - used for self_heal */ - ++priv->inode_generation; - ++priv->num_child_up; - } - UNLOCK (&priv->lock); - - if (!priv->is_up) { - default_notify (this, event, data); - priv->is_up = 1; - } - } - break; - case GF_EVENT_CHILD_DOWN: - { - /* Call scheduler's update () to disable the child node - * for scheduling - */ - sched->notify (this, event, data); - LOCK (&priv->lock); - { - --priv->num_child_up; - } - UNLOCK (&priv->lock); - - if (priv->num_child_up == 0) { - /* Send CHILD_DOWN to upper layer */ - default_notify (this, event, data); - priv->is_up = 0; - } - } - break; - - default: - { - default_notify (this, event, data); - } - break; - } - - return 0; -} - -int32_t -mem_acct_init (xlator_t *this) -{ - int ret = -1; - - if (!this) - return ret; - - ret = xlator_mem_acct_init (this, gf_unify_mt_end + 1); - - if (ret != 0) { - gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" - "failed"); - return ret; - } - - return ret; -} - -/** - * init - This function is called first in the xlator, while initializing. - * All the config file options are checked and appropriate flags are set. 
- * - * @this - - */ -int32_t -init (xlator_t *this) -{ - int32_t ret = 0; - int32_t count = 0; - data_t *scheduler = NULL; - data_t *data = NULL; - xlator_t *ns_xl = NULL; - xlator_list_t *trav = NULL; - xlator_list_t *xlparent = NULL; - xlator_list_t *parent = NULL; - unify_private_t *_private = NULL; - - - /* Check for number of child nodes, if there is no child nodes, exit */ - if (!this->children) { - gf_log (this->name, GF_LOG_ERROR, - "No child nodes specified. check \"subvolumes \" " - "option in volfile"); - return -1; - } - - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. check volfile "); - } - - /* Check for 'scheduler' in volume */ - scheduler = dict_get (this->options, "scheduler"); - if (!scheduler) { - gf_log (this->name, GF_LOG_ERROR, - "\"option scheduler <x>\" is missing in volfile"); - return -1; - } - - /* Setting "option namespace <node>" */ - data = dict_get (this->options, "namespace"); - if(!data) { - gf_log (this->name, GF_LOG_CRITICAL, - "namespace option not specified, Exiting"); - return -1; - } - /* Search namespace in the child node, if found, exit */ - trav = this->children; - while (trav) { - if (strcmp (trav->xlator->name, data->data) == 0) - break; - trav = trav->next; - } - if (trav) { - gf_log (this->name, GF_LOG_CRITICAL, - "namespace node used as a subvolume, Exiting"); - return -1; - } - - /* Search for the namespace node, if found, continue */ - ns_xl = this->next; - while (ns_xl) { - if (strcmp (ns_xl->name, data->data) == 0) - break; - ns_xl = ns_xl->next; - } - if (!ns_xl) { - gf_log (this->name, GF_LOG_CRITICAL, - "namespace node not found in volfile, Exiting"); - return -1; - } - - gf_log (this->name, GF_LOG_DEBUG, - "namespace node specified as %s", data->data); - - _private = GF_CALLOC (1, sizeof (*_private), - gf_unify_mt_unify_private_t); - ERR_ABORT (_private); - _private->sched_ops = get_scheduler (this, scheduler->data); - if (!_private->sched_ops) { - gf_log (this->name, 
GF_LOG_CRITICAL, - "Error while loading scheduler. Exiting"); - GF_FREE (_private); - return -1; - } - - if (ns_xl->parents) { - gf_log (this->name, GF_LOG_CRITICAL, - "Namespace node should not be a child of any other node. Exiting"); - GF_FREE (_private); - return -1; - } - - _private->namespace = ns_xl; - - /* update _private structure */ - { - count = 0; - trav = this->children; - /* Get the number of child count */ - while (trav) { - count++; - trav = trav->next; - } - - gf_log (this->name, GF_LOG_DEBUG, - "Child node count is %d", count); - - _private->child_count = count; - if (count == 1) { - /* TODO: Should I error out here? */ - gf_log (this->name, GF_LOG_CRITICAL, - "WARNING: You have defined only one " - "\"subvolumes\" for unify volume. It may not " - "be the desired config, review your volume " - "volfile. If this is how you are testing it," - " you may hit some performance penalty"); - } - - _private->xl_array = GF_CALLOC (1, - sizeof (xlator_t) * (count + 1), - gf_unify_mt_xlator_t); - ERR_ABORT (_private->xl_array); - - count = 0; - trav = this->children; - while (trav) { - _private->xl_array[count++] = trav->xlator; - trav = trav->next; - } - _private->xl_array[count] = _private->namespace; - - /* self-heal part, start with generation '1' */ - _private->inode_generation = 1; - /* Because, Foreground part is tested well */ - _private->self_heal = ZR_UNIFY_FG_SELF_HEAL; - data = dict_get (this->options, "self-heal"); - if (data) { - if (strcasecmp (data->data, "off") == 0) - _private->self_heal = ZR_UNIFY_SELF_HEAL_OFF; - - if (strcasecmp (data->data, "foreground") == 0) - _private->self_heal = ZR_UNIFY_FG_SELF_HEAL; - - if (strcasecmp (data->data, "background") == 0) - _private->self_heal = ZR_UNIFY_BG_SELF_HEAL; - } - - /* optimist - ask bulde for more about it */ - data = dict_get (this->options, "optimist"); - if (data) { - if (gf_string2boolean (data->data, - &_private->optimist) == -1) { - gf_log (this->name, GF_LOG_ERROR, - "optimist excepts 
only boolean " - "options"); - } - } - - LOCK_INIT (&_private->lock); - } - - /* Now that everything is fine. */ - this->private = (void *)_private; - { - ret = _private->sched_ops->mem_acct_init (this); - - if (ret == -1) { - return -1; - } - - /* Initialize scheduler, if everything else is successful */ - ret = _private->sched_ops->init (this); - if (ret == -1) { - gf_log (this->name, GF_LOG_CRITICAL, - "Initializing scheduler failed, Exiting"); - GF_FREE (_private); - return -1; - } - - - ret = 0; - - /* This section is required because some fops may look - * for 'xl->parent' variable - */ - xlparent = GF_CALLOC (1, sizeof (*xlparent), - gf_unify_mt_xlator_list_t); - xlparent->xlator = this; - if (!ns_xl->parents) { - ns_xl->parents = xlparent; - } else { - parent = ns_xl->parents; - while (parent->next) - parent = parent->next; - parent->next = xlparent; - } - } - - /* Tell namespace node that init is done */ - xlator_notify (ns_xl, GF_EVENT_PARENT_UP, this); - - return 0; -} - -/** - * fini - Free all the allocated memory - */ -void -fini (xlator_t *this) -{ - unify_private_t *priv = this->private; - priv->sched_ops->fini (this); - this->private = NULL; - LOCK_DESTROY (&priv->lock); - GF_FREE (priv->xl_array); - GF_FREE (priv); - return; -} - - -struct xlator_fops fops = { - .stat = unify_stat, - .readlink = unify_readlink, - .mknod = unify_mknod, - .mkdir = unify_mkdir, - .unlink = unify_unlink, - .rmdir = unify_rmdir, - .symlink = unify_symlink, - .rename = unify_rename, - .link = unify_link, - .truncate = unify_truncate, - .create = unify_create, - .open = unify_open, - .readv = unify_readv, - .writev = unify_writev, - .statfs = unify_statfs, - .flush = unify_flush, - .fsync = unify_fsync, - .setxattr = unify_setxattr, - .getxattr = unify_getxattr, - .removexattr = unify_removexattr, - .opendir = unify_opendir, - .readdir = unify_readdir, - .readdirp = unify_readdirp, - .fsyncdir = unify_fsyncdir, - .access = unify_access, - .ftruncate = unify_ftruncate, - 
.fstat = unify_fstat, - .lk = unify_lk, - .lookup = unify_lookup, - .getdents = unify_getdents, - .checksum = unify_checksum, - .inodelk = unify_inodelk, - .finodelk = unify_finodelk, - .entrylk = unify_entrylk, - .fentrylk = unify_fentrylk, - .xattrop = unify_xattrop, - .fxattrop = unify_fxattrop, - .setattr = unify_setattr, - .fsetattr = unify_fsetattr, -}; - - -struct xlator_cbks cbks = { - .forget = unify_forget, -}; - -struct volume_options options[] = { - { .key = { "namespace" }, - .type = GF_OPTION_TYPE_XLATOR - }, - { .key = { "scheduler" }, - .value = { "alu", "rr", "random", "nufa", "switch" }, - .type = GF_OPTION_TYPE_STR - }, - { .key = {"self-heal"}, - .value = { "foreground", "background", "off" }, - .type = GF_OPTION_TYPE_STR - }, - /* TODO: remove it some time later */ - { .key = {"optimist"}, - .type = GF_OPTION_TYPE_BOOL - }, - - { .key = {NULL} }, -}; diff --git a/xlators/cluster/unify/src/unify.h b/xlators/cluster/unify/src/unify.h deleted file mode 100644 index dbd5e44a2..000000000 --- a/xlators/cluster/unify/src/unify.h +++ /dev/null @@ -1,146 +0,0 @@ -/* - Copyright (c) 2006-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. 
-*/ - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#ifndef _UNIFY_H -#define _UNIFY_H - -#include "scheduler.h" -#include "list.h" -#include "unify-mem-types.h" - -#define MAX_DIR_ENTRY_STRING (32 * 1024) - -#define ZR_UNIFY_SELF_HEAL_OFF 0 -#define ZR_UNIFY_FG_SELF_HEAL 1 -#define ZR_UNIFY_BG_SELF_HEAL 2 - -/* Sometimes one should use completely random numbers.. its good :p */ -#define UNIFY_SELF_HEAL_GETDENTS_COUNT 512 - -#define NS(xl) (((unify_private_t *)xl->private)->namespace) - -/* This is used to allocate memory for local structure */ -#define INIT_LOCAL(fr, loc) \ -do { \ - loc = GF_CALLOC (1, sizeof (unify_local_t), gf_unify_mt_unify_local_t); \ - ERR_ABORT (loc); \ - if (!loc) { \ - STACK_UNWIND (fr, -1, ENOMEM); \ - return 0; \ - } \ - fr->local = loc; \ - loc->op_ret = -1; \ - loc->op_errno = ENOENT; \ -} while (0) - - - -struct unify_private { - /* Update this structure depending on requirement */ - void *scheduler; /* THIS SHOULD BE THE FIRST VARIABLE, - if xlator is using scheduler */ - struct sched_ops *sched_ops; /* Scheduler options */ - xlator_t *namespace; /* ptr to namespace xlator */ - xlator_t **xl_array; - gf_boolean_t optimist; - int16_t child_count; - int16_t num_child_up; - uint8_t self_heal; - uint8_t is_up; - uint64_t inode_generation; - gf_lock_t lock; -}; -typedef struct unify_private unify_private_t; - -struct unify_self_heal_struct { - uint8_t dir_checksum[NAME_MAX]; - uint8_t ns_dir_checksum[NAME_MAX]; - uint8_t file_checksum[NAME_MAX]; - uint8_t ns_file_checksum[NAME_MAX]; - off_t *offset_list; - int *count_list; - dir_entry_t **entry_list; -}; - - -struct _unify_local_t { - int32_t call_count; - int32_t op_ret; - int32_t op_errno; - mode_t mode; - off_t offset; - dev_t dev; - uid_t uid; - gid_t gid; - int32_t flags; - int32_t entry_count; - int32_t count; // dir_entry_t count; - fd_t *fd; - struct iatt stbuf; - struct iatt stpre; - struct iatt stpost; - struct statvfs statvfs_buf; - struct timespec 
tv[2]; - char *name; - int32_t revalidate; - - ino_t ia_ino; - nlink_t ia_nlink; - - dict_t *dict; - - int16_t *list; - int16_t *new_list; /* Used only in case of rename */ - int16_t index; - - int32_t failed; - int32_t return_eio; /* Used in case of different st-mode - present for a given path */ - - uint64_t inode_generation; /* used to store the per directory - * inode_generation. Got from inode's ctx - * of directory inodes - */ - - struct unify_self_heal_struct *sh_struct; - loc_t loc1, loc2; - - struct iatt poststbuf; - /* When not used for rename, old* - * are used as the attrs for the current - * parent directory. - */ - struct iatt oldpreparent; - struct iatt oldpostparent; - struct iatt newpreparent; - struct iatt newpostparent; - int32_t wbflags; -}; -typedef struct _unify_local_t unify_local_t; - -int32_t zr_unify_self_heal (call_frame_t *frame, - xlator_t *this, - unify_local_t *local); - -#endif /* _UNIFY_H */ |
