diff options
Diffstat (limited to 'xlators')
458 files changed, 190984 insertions, 114681 deletions
diff --git a/xlators/Makefile.am b/xlators/Makefile.am index 4c94f5e44..f60fa85ce 100644 --- a/xlators/Makefile.am +++ b/xlators/Makefile.am @@ -1,3 +1,4 @@ -SUBDIRS = cluster storage protocol performance debug features encryption mount nfs mgmt +SUBDIRS = cluster storage protocol performance debug features encryption mount nfs mgmt system \ + playground CLEANFILES = diff --git a/xlators/bindings/python/src/Makefile.am b/xlators/bindings/python/src/Makefile.am index c0b9141c6..90370d861 100644 --- a/xlators/bindings/python/src/Makefile.am +++ b/xlators/bindings/python/src/Makefile.am @@ -9,7 +9,7 @@ pythondir = $(xlatordir)/python python_so_SOURCES = python.c -AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall \ +AM_CFLAGS = -fPIC $(GF_CPPFLAGS) -Wall \ -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles \ $(PYTHON_CPPLAGS) -DGLUSTER_PYTHON_PATH=\"$(pythondir)\" diff --git a/xlators/bindings/python/src/gluster.py b/xlators/bindings/python/src/gluster.py index ee0eb1310..337c983ec 100644 --- a/xlators/bindings/python/src/gluster.py +++ b/xlators/bindings/python/src/gluster.py @@ -1,19 +1,12 @@ -# Copyright (c) 2007 Chris AtLee <chris@atlee.ca> -# This file is part of GlusterFS. -# -# GlusterFS is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published -# by the Free Software Foundation; either version 3 of the License, -# or (at your option) any later version. -# -# GlusterFS is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. + +# Copyright (c) 2007-2012 Red Hat, Inc. <http://www.redhat.com> +# This file is part of GlusterFS. # -# You should have received a copy of the GNU General Public License -# along with this program. If not, see -# <http://www.gnu.org/licenses/>. +# This file is licensed to you under your choice of the GNU Lesser +# General Public License, version 3 or any later version (LGPLv3 or +# later), or the GNU General Public License, version 2 (GPLv2), in all +# cases as published by the Free Software Foundation. + from ctypes import * from glustertypes import * from glusterstack import * diff --git a/xlators/bindings/python/src/glusterstack.py b/xlators/bindings/python/src/glusterstack.py index ba24c8165..0c071ae98 100644 --- a/xlators/bindings/python/src/glusterstack.py +++ b/xlators/bindings/python/src/glusterstack.py @@ -1,19 +1,12 @@ -# Copyright (c) 2007 Chris AtLee <chris@atlee.ca> -# This file is part of GlusterFS. -# -# GlusterFS is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published -# by the Free Software Foundation; either version 3 of the License, -# or (at your option) any later version. -# -# GlusterFS is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. + +# Copyright (c) 2007-2012 Red Hat, Inc. <http://www.redhat.com> +# This file is part of GlusterFS. # -# You should have received a copy of the GNU General Public License -# along with this program. If not, see -# <http://www.gnu.org/licenses/>. +# This file is licensed to you under your choice of the GNU Lesser +# General Public License, version 3 or any later version (LGPLv3 or +# later), or the GNU General Public License, version 2 (GPLv2), in all +# cases as published by the Free Software Foundation. + from ctypes import * from glustertypes import * diff --git a/xlators/bindings/python/src/glustertypes.py b/xlators/bindings/python/src/glustertypes.py index e9069d07c..98437d22e 100644 --- a/xlators/bindings/python/src/glustertypes.py +++ b/xlators/bindings/python/src/glustertypes.py @@ -1,19 +1,12 @@ -# Copyright (c) 2007 Chris AtLee <chris@atlee.ca> -# This file is part of GlusterFS. -# -# GlusterFS is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published -# by the Free Software Foundation; either version 3 of the License, -# or (at your option) any later version. -# -# GlusterFS is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. + +# Copyright (c) 2007-2012 Red Hat, Inc. <http://www.redhat.com> +# This file is part of GlusterFS. # -# You should have received a copy of the GNU General Public License -# along with this program. If not, see -# <http://www.gnu.org/licenses/>. +# This file is licensed to you under your choice of the GNU Lesser +# General Public License, version 3 or any later version (LGPLv3 or +# later), or the GNU General Public License, version 2 (GPLv2), in all +# cases as published by the Free Software Foundation. + from ctypes import * import collections diff --git a/xlators/bindings/python/src/python.c b/xlators/bindings/python/src/python.c index c11323cda..9b96790de 100644 --- a/xlators/bindings/python/src/python.c +++ b/xlators/bindings/python/src/python.c @@ -1,22 +1,12 @@ /* - Copyright (c) 2007-2009 Chris AtLee <chris@atlee.ca> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ - #include <Python.h> #ifndef _CONFIG_H @@ -45,7 +35,7 @@ python_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, - int32_t count, + int32_t count, off_t offset) { python_private_t *priv = (python_private_t *)this->private; @@ -148,7 +138,7 @@ init (xlator_t *this) Py_InitializeEx(0); if (!this->children) { - gf_log ("python", GF_LOG_ERROR, + gf_log ("python", GF_LOG_ERROR, "FATAL: python should have exactly one child"); return -1; } @@ -166,7 +156,7 @@ init (xlator_t *this) } priv->pInterp = Py_NewInterpreter(); - + // Adjust python's path PyObject *syspath = PySys_GetObject("path"); PyObject *path = PyString_FromString(GLUSTER_PYTHON_PATH); @@ -188,7 +178,7 @@ init (xlator_t *this) priv->pVectorType = PyObject_GetAttrString(priv->pGlusterModule, "iovec"); gf_log("python", GF_LOG_DEBUG, "Loading script...%s", priv->scriptname); - + priv->pScriptModule = AnonModule_FromFile(priv->scriptname); if (!priv->pScriptModule || PyErr_Occurred()) { @@ -217,7 +207,7 @@ init (xlator_t *this) return 0; } -void +void fini (xlator_t *this) { python_private_t *priv = (python_private_t*)(this->private); diff --git a/xlators/bindings/python/src/testxlator.py b/xlators/bindings/python/src/testxlator.py index 507455c85..59a991dca 100644 --- a/xlators/bindings/python/src/testxlator.py +++ b/xlators/bindings/python/src/testxlator.py @@ -1,19 +1,12 @@ -# Copyright (c) 2007 Chris AtLee <chris@atlee.ca> -# This file is part of GlusterFS. -# -# GlusterFS is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published -# by the Free Software Foundation; either version 3 of the License, -# or (at your option) any later version. -# -# GlusterFS is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see -# <http://www.gnu.org/licenses/>. +""" + Copyright (c) 2007-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +""" """ This is a test translator written in python. diff --git a/xlators/cluster/afr/src/Makefile.am b/xlators/cluster/afr/src/Makefile.am index 4e4b4c752..35d18a6c0 100644 --- a/xlators/cluster/afr/src/Makefile.am +++ b/xlators/cluster/afr/src/Makefile.am @@ -1,20 +1,31 @@ xlator_LTLIBRARIES = afr.la pump.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster -afr_common_source = afr-dir-read.c afr-dir-write.c afr-inode-read.c afr-inode-write.c afr-open.c afr-transaction.c afr-self-heal-data.c afr-self-heal-common.c afr-self-heal-metadata.c afr-self-heal-entry.c afr-self-heal-algorithm.c afr-lk-common.c +afr_common_source = afr-dir-read.c afr-dir-write.c afr-inode-read.c \ + afr-inode-write.c afr-open.c afr-transaction.c afr-self-heal-data.c \ + afr-self-heal-common.c afr-self-heal-metadata.c afr-self-heal-entry.c \ + afr-self-heal-algorithm.c afr-lk-common.c afr-self-heald.c \ + $(top_builddir)/xlators/lib/src/libxlator.c -afr_la_LDFLAGS = -module -avoidversion +afr_la_LDFLAGS = -module -avoid-version afr_la_SOURCES = $(afr_common_source) afr.c afr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -pump_la_LDFLAGS = -module -avoidversion +pump_la_LDFLAGS = -module -avoid-version pump_la_SOURCES = $(afr_common_source) pump.c pump_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -noinst_HEADERS = afr.h afr-transaction.h afr-inode-write.h afr-inode-read.h afr-dir-read.h afr-dir-write.h afr-self-heal.h afr-self-heal-common.h afr-self-heal-algorithm.h pump.h afr-mem-types.h afr-common.c +noinst_HEADERS = afr.h afr-transaction.h afr-inode-write.h afr-inode-read.h \ + afr-dir-read.h afr-dir-write.h afr-self-heal.h afr-self-heal-common.h \ + afr-self-heal-algorithm.h pump.h afr-mem-types.h afr-common.c \ + afr-self-heald.h $(top_builddir)/xlators/lib/src/libxlator.h \ + $(top_builddir)/glusterfsd/src/glusterfsd.h -AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ - -I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/contrib/md5 -shared -nostartfiles $(GF_CFLAGS) +AM_CPPFLAGS = $(GF_CPPFLAGS) \ + -I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/xlators/lib/src \ + -I$(top_srcdir)/rpc/rpc-lib/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) CLEANFILES = diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index bd4e9d66d..af01f2ef2 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2009 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #include <libgen.h> @@ -44,6 +35,7 @@ #include "compat.h" #include "byte-order.h" #include "statedump.h" +#include "inode.h" #include "fd.h" @@ -54,301 +46,827 @@ #include "afr-transaction.h" #include "afr-self-heal.h" #include "afr-self-heal-common.h" +#include "afr-self-heald.h" #include "pump.h" -#define AFR_ICTX_OPENDIR_DONE_MASK 0x0000000200000000ULL -#define AFR_ICTX_SPLIT_BRAIN_MASK 0x0000000100000000ULL +#define AFR_ICTX_OPENDIR_DONE_MASK 0x0000000100000000ULL #define AFR_ICTX_READ_CHILD_MASK 0x00000000FFFFFFFFULL +#define AFR_STATISTICS_HISTORY_SIZE 50 +int +afr_lookup_done_success_action (call_frame_t *frame, xlator_t *this, + gf_boolean_t fail_conflict); +void +afr_children_copy (int32_t *dst, int32_t *src, unsigned int child_count) +{ + int i = 0; + + for (i = 0; i < child_count; i++) + dst[i] = src[i]; +} + +void +afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req, const char *path) +{ + int i = 0; + afr_private_t *priv = NULL; + int ret = 0; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + ret = dict_set_uint64 (xattr_req, priv->pending_key[i], + 3 * sizeof(int32_t)); + if (ret < 0) + gf_log (this->name, GF_LOG_WARNING, + "%s: Unable to set dict value for %s", + path, priv->pending_key[i]); + /* 3 = data+metadata+entry */ + } + ret = dict_set_int32 (xattr_req, GF_GFIDLESS_LOOKUP, 1); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, "%s: failed to set gfidless " + "lookup", path); + } +} + +int +afr_lookup_xattr_req_prepare (afr_local_t *local, xlator_t *this, + dict_t *xattr_req, loc_t *loc, void **gfid_req) +{ + int ret = -ENOMEM; + + GF_ASSERT (gfid_req); + + *gfid_req = NULL; + local->xattr_req = dict_new (); + if (!local->xattr_req) + goto out; + if (xattr_req) + dict_copy (xattr_req, local->xattr_req); + + afr_xattr_req_prepare (this, local->xattr_req, loc->path); + ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_INODELK_COUNT, 0); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "%s: Unable to set dict value for %s", + loc->path, GLUSTERFS_INODELK_COUNT); + } + ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_ENTRYLK_COUNT, 0); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "%s: Unable to set dict value for %s", + loc->path, GLUSTERFS_ENTRYLK_COUNT); + } + + ret = dict_set_uint32 (local->xattr_req, GLUSTERFS_PARENT_ENTRYLK, 0); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "%s: Unable to set dict value for %s", + loc->path, GLUSTERFS_PARENT_ENTRYLK); + } + + ret = dict_get_ptr (local->xattr_req, "gfid-req", gfid_req); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, + "%s: failed to get the gfid from dict", loc->path); + *gfid_req = NULL; + } else { + if (loc->parent != NULL) + dict_del (local->xattr_req, "gfid-req"); + } + ret = 0; +out: + return ret; +} + +void +afr_lookup_save_gfid (uuid_t dst, void* new, const loc_t *loc) +{ + inode_t *inode = NULL; + + inode = loc->inode; + if (inode && !uuid_is_null (inode->gfid)) + uuid_copy (dst, inode->gfid); + else if (!uuid_is_null (loc->gfid)) + uuid_copy (dst, loc->gfid); + else if (new && !uuid_is_null (new)) + uuid_copy (dst, new); +} + +int +afr_errno_count (int32_t *children, int *child_errno, + unsigned int child_count, int32_t op_errno) +{ + int i = 0; + int errno_count = 0; + int child = 0; + + for (i = 0; i < child_count; i++) { + if (children) { + child = children[i]; + if (child == -1) + break; + } else { + child = i; + } + if (child_errno[child] == op_errno) + errno_count++; + } + return errno_count; +} int32_t afr_set_dict_gfid (dict_t *dict, uuid_t gfid) { - int ret = 0; + int ret = 0; + uuid_t *pgfid = NULL; GF_ASSERT (gfid); - ret = dict_set_static_bin (dict, "gfid-req", gfid, 16); + pgfid = GF_CALLOC (1, sizeof (uuid_t), gf_common_mt_char); + if (!pgfid) { + ret = -1; + goto out; + } + + uuid_copy (*pgfid, gfid); + + ret = dict_set_dynptr (dict, "gfid-req", pgfid, sizeof (uuid_t)); if (ret) - gf_log (THIS->name, GF_LOG_DEBUG, "gfid set failed"); + gf_log (THIS->name, GF_LOG_ERROR, "gfid set failed"); + +out: + if (ret && pgfid) + GF_FREE (pgfid); return ret; } -uint64_t -afr_is_split_brain (xlator_t *this, inode_t *inode) +void +afr_inode_ctx_destroy (afr_inode_ctx_t *ctx) { - int ret = 0; + if (!ctx) + return; + GF_FREE (ctx->fresh_children); + GF_FREE (ctx); +} + +afr_inode_ctx_t* +__afr_inode_ctx_get (inode_t *inode, xlator_t *this) +{ + int ret = 0; + uint64_t ctx_addr = 0; + afr_inode_ctx_t *ctx = NULL; + afr_private_t *priv = NULL; + + priv = this->private; + ret = __inode_ctx_get (inode, this, &ctx_addr); + if (ret < 0) + ctx_addr = 0; + if (ctx_addr != 0) { + ctx = (afr_inode_ctx_t*) (long) ctx_addr; + goto out; + } + ctx = GF_CALLOC (1, sizeof (*ctx), + gf_afr_mt_inode_ctx_t); + if (!ctx) + goto fail; + ctx->fresh_children = GF_CALLOC (priv->child_count, + sizeof (*ctx->fresh_children), + gf_afr_mt_int32_t); + if (!ctx->fresh_children) + goto fail; + ret = __inode_ctx_put (inode, this, (uint64_t)ctx); + if (ret) { + gf_log_callingfn (this->name, GF_LOG_ERROR, "failed to " + "set the inode ctx (%s)", + uuid_utoa (inode->gfid)); + goto fail; + } + +out: + return ctx; - uint64_t ctx = 0; - uint64_t split_brain = 0; +fail: + afr_inode_ctx_destroy (ctx); + return NULL; +} - VALIDATE_OR_GOTO (inode, out); +afr_inode_ctx_t* +afr_inode_ctx_get (inode_t *inode, xlator_t *this) +{ + afr_inode_ctx_t *ctx = NULL; LOCK (&inode->lock); { - ret = __inode_ctx_get (inode, this, &ctx); + ctx = __afr_inode_ctx_get (inode, this); + } + UNLOCK (&inode->lock); + return ctx; +} - if (ret < 0) - goto unlock; +void +afr_inode_get_ctx_params (xlator_t *this, inode_t *inode, + afr_inode_params_t *params) +{ + GF_ASSERT (inode); + GF_ASSERT (params); + + afr_inode_ctx_t *ctx = NULL; + afr_private_t *priv = NULL; + int i = 0; + int32_t read_child = -1; + int32_t *fresh_children = NULL; - split_brain = ctx & AFR_ICTX_SPLIT_BRAIN_MASK; + priv = this->private; + LOCK (&inode->lock); + { + ctx = __afr_inode_ctx_get (inode, this); + if (!ctx) + goto unlock; + switch (params->op) { + case AFR_INODE_GET_READ_CTX: + fresh_children = params->u.read_ctx.children; + read_child = (int32_t)(ctx->masks & + AFR_ICTX_READ_CHILD_MASK); + params->u.read_ctx.read_child = read_child; + if (!fresh_children) + goto unlock; + for (i = 0; i < priv->child_count; i++) + fresh_children[i] = ctx->fresh_children[i]; + break; + case AFR_INODE_GET_OPENDIR_DONE: + params->u.value = _gf_false; + if (ctx->masks & AFR_ICTX_OPENDIR_DONE_MASK) + params->u.value = _gf_true; + break; + default: + GF_ASSERT (0); + break; + } } unlock: UNLOCK (&inode->lock); +} + +gf_boolean_t +afr_is_split_brain (xlator_t *this, inode_t *inode) +{ + afr_inode_ctx_t *ctx = NULL; + gf_boolean_t spb = _gf_false; + ctx = afr_inode_ctx_get (inode, this); + if (!ctx) + goto out; + if ((ctx->mdata_spb == SPB) || (ctx->data_spb == SPB)) + spb = _gf_true; out: - return split_brain; + return spb; } +gf_boolean_t +afr_is_opendir_done (xlator_t *this, inode_t *inode) +{ + afr_inode_params_t params = {0}; -void -afr_set_split_brain (xlator_t *this, inode_t *inode, gf_boolean_t set) + params.op = AFR_INODE_GET_OPENDIR_DONE; + afr_inode_get_ctx_params (this, inode, ¶ms); + return params.u.value; +} + +int32_t +afr_inode_get_read_ctx (xlator_t *this, inode_t *inode, int32_t *fresh_children) { - uint64_t ctx = 0; - int ret = 0; + afr_inode_params_t params = {0}; - VALIDATE_OR_GOTO (inode, out); + params.op = AFR_INODE_GET_READ_CTX; + params.u.read_ctx.children = fresh_children; + afr_inode_get_ctx_params (this, inode, ¶ms); + return params.u.read_ctx.read_child; +} - LOCK (&inode->lock); - { - ret = __inode_ctx_get (inode, this, &ctx); +void +afr_inode_ctx_set_read_child (afr_inode_ctx_t *ctx, int32_t read_child) +{ + uint64_t remaining_mask = 0; + uint64_t mask = 0; - if (ret < 0) { - ctx = 0; - } + remaining_mask = (~AFR_ICTX_READ_CHILD_MASK & ctx->masks); + mask = (AFR_ICTX_READ_CHILD_MASK & read_child); + ctx->masks = remaining_mask | mask; +} - if (set) { - ctx = (~AFR_ICTX_SPLIT_BRAIN_MASK & ctx) - | (0xFFFFFFFFFFFFFFFFULL & AFR_ICTX_SPLIT_BRAIN_MASK); - } else { - ctx = (~AFR_ICTX_SPLIT_BRAIN_MASK & ctx); - } - __inode_ctx_put (inode, this, ctx); +void +afr_inode_ctx_set_read_ctx (afr_inode_ctx_t *ctx, int32_t read_child, + int32_t *fresh_children, int32_t child_count) +{ + int i = 0; + + afr_inode_ctx_set_read_child (ctx, read_child); + for (i = 0; i < child_count; i++) { + if (fresh_children) + ctx->fresh_children[i] = fresh_children[i]; + else + ctx->fresh_children[i] = -1; } - UNLOCK (&inode->lock); -out: - return; } +void +afr_inode_ctx_rm_stale_children (afr_inode_ctx_t *ctx, int32_t *stale_children, + int32_t child_count) +{ + int i = 0; + int32_t read_child = -1; -uint64_t -afr_is_opendir_done (xlator_t *this, inode_t *inode) + GF_ASSERT (stale_children); + for (i = 0; i < child_count; i++) { + if (stale_children[i] == -1) + break; + afr_children_rm_child (ctx->fresh_children, + stale_children[i], child_count); + } + read_child = (int32_t)(ctx->masks & AFR_ICTX_READ_CHILD_MASK); + if (!afr_is_child_present (ctx->fresh_children, child_count, + read_child)) + afr_inode_ctx_set_read_child (ctx, ctx->fresh_children[0]); +} + +void +afr_inode_ctx_set_opendir_done (afr_inode_ctx_t *ctx) { - int ret = 0; + uint64_t remaining_mask = 0; + uint64_t mask = 0; + + remaining_mask = (~AFR_ICTX_OPENDIR_DONE_MASK & ctx->masks); + mask = (0xFFFFFFFFFFFFFFFFULL & AFR_ICTX_OPENDIR_DONE_MASK); + ctx->masks = remaining_mask | mask; +} - uint64_t ctx = 0; - uint64_t opendir_done = 0; +void +afr_inode_set_ctx_params (xlator_t *this, inode_t *inode, + afr_inode_params_t *params) +{ + GF_ASSERT (inode); + GF_ASSERT (params); - VALIDATE_OR_GOTO (inode, out); + afr_inode_ctx_t *ctx = NULL; + afr_private_t *priv = NULL; + int32_t read_child = -1; + int32_t *fresh_children = NULL; + int32_t *stale_children = NULL; + priv = this->private; LOCK (&inode->lock); { - ret = __inode_ctx_get (inode, this, &ctx); - - if (ret < 0) + ctx = __afr_inode_ctx_get (inode, this); + if (!ctx) goto unlock; - - opendir_done = ctx & AFR_ICTX_OPENDIR_DONE_MASK; + switch (params->op) { + case AFR_INODE_SET_READ_CTX: + read_child = params->u.read_ctx.read_child; + fresh_children = params->u.read_ctx.children; + afr_inode_ctx_set_read_ctx (ctx, read_child, + fresh_children, + priv->child_count); + break; + case AFR_INODE_RM_STALE_CHILDREN: + stale_children = params->u.read_ctx.children; + afr_inode_ctx_rm_stale_children (ctx, + stale_children, + priv->child_count); + break; + case AFR_INODE_SET_OPENDIR_DONE: + afr_inode_ctx_set_opendir_done (ctx); + break; + default: + GF_ASSERT (0); + break; + } } unlock: UNLOCK (&inode->lock); - -out: - return opendir_done; } +void +afr_set_split_brain (xlator_t *this, inode_t *inode, afr_spb_state_t mdata_spb, + afr_spb_state_t data_spb) +{ + afr_inode_ctx_t *ctx = NULL; + + ctx = afr_inode_ctx_get (inode, this); + if (mdata_spb != DONT_KNOW) + ctx->mdata_spb = mdata_spb; + if (data_spb != DONT_KNOW) + ctx->data_spb = data_spb; +} void afr_set_opendir_done (xlator_t *this, inode_t *inode) { - uint64_t ctx = 0; - int ret = 0; + afr_inode_params_t params = {0}; - VALIDATE_OR_GOTO (inode, out); + params.op = AFR_INODE_SET_OPENDIR_DONE; + afr_inode_set_ctx_params (this, inode, ¶ms); +} - LOCK (&inode->lock); - { - ret = __inode_ctx_get (inode, this, &ctx); +void +afr_inode_set_read_ctx (xlator_t *this, inode_t *inode, int32_t read_child, + int32_t *fresh_children) +{ + afr_inode_params_t params = {0}; + afr_private_t *priv = NULL; + + priv = this->private; + GF_ASSERT (read_child >= 0); + GF_ASSERT (fresh_children); + GF_ASSERT (afr_is_child_present (fresh_children, priv->child_count, + read_child)); + + params.op = AFR_INODE_SET_READ_CTX; + params.u.read_ctx.read_child = read_child; + params.u.read_ctx.children = fresh_children; + afr_inode_set_ctx_params (this, inode, ¶ms); +} + +void +afr_inode_rm_stale_children (xlator_t *this, inode_t *inode, + int32_t *stale_children) +{ + afr_inode_params_t params = {0}; + + GF_ASSERT (stale_children); + + params.op = AFR_INODE_RM_STALE_CHILDREN; + params.u.read_ctx.children = stale_children; + afr_inode_set_ctx_params (this, inode, ¶ms); +} + +gf_boolean_t +afr_is_source_child (int32_t *sources, int32_t child_count, int32_t child) +{ + gf_boolean_t source_xattrs = _gf_false; + + GF_ASSERT (child < child_count); + + if ((child >= 0) && (child < child_count) && + sources[child]) { + source_xattrs = _gf_true; + } + return source_xattrs; +} + +gf_boolean_t +afr_is_child_present (int32_t *success_children, int32_t child_count, + int32_t child) +{ + gf_boolean_t success_child = _gf_false; + int i = 0; - if (ret < 0) { - ctx = 0; + GF_ASSERT (child < child_count); + + for (i = 0; i < child_count; i++) { + if (success_children[i] == -1) + break; + if (child == success_children[i]) { + success_child = _gf_true; + break; } + } + return success_child; +} - ctx = (~AFR_ICTX_OPENDIR_DONE_MASK & ctx) - | (0xFFFFFFFFFFFFFFFFULL & AFR_ICTX_OPENDIR_DONE_MASK); +gf_boolean_t +afr_is_read_child (int32_t *success_children, int32_t *sources, + int32_t child_count, int32_t child) +{ + gf_boolean_t success_child = _gf_false; + gf_boolean_t source = _gf_false; - __inode_ctx_put (inode, this, ctx); + if (child < 0) { + return _gf_false; } - UNLOCK (&inode->lock); + + GF_ASSERT (success_children); + GF_ASSERT (child_count > 0); + + success_child = afr_is_child_present (success_children, child_count, + child); + if (!success_child) + goto out; + if (NULL == sources) { + source = _gf_true; + goto out; + } + source = afr_is_source_child (sources, child_count, child); out: - return; + return (success_child && source); } +int32_t +afr_hash_child (int32_t *success_children, int32_t child_count, + unsigned int hmode, uuid_t gfid) +{ + uuid_t gfid_copy = {0,}; + pid_t pid; + + if (!hmode) { + return -1; + } + + if (gfid) { + uuid_copy(gfid_copy,gfid); + } + if (hmode > 1) { + /* + * Why getpid? Because it's one of the cheapest calls + * available - faster than gethostname etc. - and returns a + * constant-length value that's sure to be shorter than a UUID. + * It's still very unlikely to be the same across clients, so + * it still provides good mixing. We're not trying for + * perfection here. All we need is a low probability that + * multiple clients won't converge on the same subvolume. + */ + pid = getpid(); + memcpy (gfid_copy, &pid, sizeof(pid)); + } + + return SuperFastHash((char *)gfid_copy, + sizeof(gfid_copy)) % child_count; +} -uint64_t -afr_read_child (xlator_t *this, inode_t *inode) +/* If sources is NULL the xattrs are assumed to be of source for all + * success_children. + */ +int +afr_select_read_child_from_policy (int32_t *success_children, + int32_t child_count, int32_t prev_read_child, + int32_t config_read_child, int32_t *sources, + unsigned int hmode, uuid_t gfid) { - int ret = 0; + int32_t read_child = -1; + int i = 0; - uint64_t ctx = 0; - uint64_t read_child = 0; + GF_ASSERT (success_children); - VALIDATE_OR_GOTO (inode, out); + read_child = config_read_child; + if (afr_is_read_child (success_children, sources, child_count, + read_child)) + goto out; - LOCK (&inode->lock); - { - ret = __inode_ctx_get (inode, this, &ctx); + read_child = prev_read_child; + if (afr_is_read_child (success_children, sources, child_count, + read_child)) + goto out; - if (ret < 0) - goto unlock; + read_child = afr_hash_child (success_children, child_count, + hmode, gfid); + if (afr_is_read_child (success_children, sources, child_count, + read_child)) { + goto out; + } - read_child = ctx & AFR_ICTX_READ_CHILD_MASK; + for (i = 0; i < child_count; i++) { + read_child = success_children[i]; + if (read_child < 0) + break; + if (afr_is_read_child (success_children, sources, child_count, + read_child)) + goto out; } -unlock: - UNLOCK (&inode->lock); + read_child = -1; out: return read_child; } - +/* This function should be used when all the success_children are sources + */ void -afr_set_read_child (xlator_t *this, inode_t *inode, int32_t read_child) +afr_set_read_ctx_from_policy (xlator_t *this, inode_t *inode, + int32_t *fresh_children, int32_t prev_read_child, + int32_t config_read_child, uuid_t gfid) { - uint64_t ctx = 0; - int ret = 0; + int read_child = -1; + afr_private_t *priv = NULL; - VALIDATE_OR_GOTO (inode, out); + priv = this->private; + read_child = afr_select_read_child_from_policy (fresh_children, + priv->child_count, + prev_read_child, + config_read_child, + NULL, + priv->hash_mode, gfid); + if (read_child >= 0) + afr_inode_set_read_ctx (this, inode, read_child, + fresh_children); +} - LOCK (&inode->lock); - { - ret = __inode_ctx_get (inode, this, &ctx); +/* afr_next_call_child () + * This is a common function used by all the read-type fops + * This function should not be called with the inode's read_children array. + * The fop's handler should make a copy of the inode's read_children, + * preferred read_child into the local vars, because while this function is + * in execution there is a chance for inode's read_ctx to change. + */ +int32_t +afr_next_call_child (int32_t *fresh_children, unsigned char *child_up, + size_t child_count, int32_t *last_index, + int32_t read_child) +{ + int next_index = 0; + int32_t next_call_child = -1; + + GF_ASSERT (last_index); + + next_index = *last_index; +retry: + next_index++; + if ((next_index >= child_count) || + (fresh_children[next_index] == -1)) + goto out; + if ((fresh_children[next_index] == read_child) || + (!child_up[fresh_children[next_index]])) + goto retry; + *last_index = next_index; + next_call_child = fresh_children[next_index]; +out: + return next_call_child; +} + + /* This function should not be called with the inode's read_children array. + * The fop's handler should make a copy of the inode's read_children, + * preferred read_child into the local vars, because while this function is + * in execution there is a chance for inode's read_ctx to change. + */ +int32_t +afr_get_call_child (xlator_t *this, unsigned char *child_up, int32_t read_child, + int32_t *fresh_children, + int32_t *call_child, int32_t *last_index) +{ + int ret = 0; + afr_private_t *priv = NULL; + int i = 0; - if (ret < 0) { - ctx = 0; + GF_ASSERT (child_up); + GF_ASSERT (call_child); + GF_ASSERT (last_index); + GF_ASSERT (fresh_children); + + if (read_child < 0) { + ret = -EIO; + goto out; + } + priv = this->private; + *call_child = -1; + *last_index = -1; + + if (child_up[read_child]) { + *call_child = read_child; + } else { + for (i = 0; i < priv->child_count; i++) { + if (fresh_children[i] == -1) + break; + if (child_up[fresh_children[i]]) { + *call_child = fresh_children[i]; + ret = 0; + break; + } } - ctx = (~AFR_ICTX_READ_CHILD_MASK & ctx) - | (AFR_ICTX_READ_CHILD_MASK & read_child); + if (*call_child == -1) { + ret = -ENOTCONN; + goto out; + } - __inode_ctx_put (inode, this, ctx); + *last_index = i; } - UNLOCK (&inode->lock); +out: + gf_log (this->name, GF_LOG_DEBUG, "Returning %d, call_child: %d, " + "last_index: %d", ret, *call_child, *last_index); + return ret; +} +void +afr_reset_xattr (dict_t **xattr, unsigned int child_count) +{ + unsigned int i = 0; + + if (!xattr) + goto out; + for (i = 0; i < child_count; i++) { + if (xattr[i]) { + dict_unref (xattr[i]); + xattr[i] = NULL; + } + } out: return; } - -/** - * afr_local_cleanup - cleanup everything in frame->local - */ +void +afr_xattr_array_destroy (dict_t **xattr, unsigned int child_count) +{ + afr_reset_xattr (xattr, child_count); + GF_FREE (xattr); +} void afr_local_sh_cleanup (afr_local_t *local, xlator_t *this) { afr_self_heal_t *sh = NULL; afr_private_t *priv = NULL; - int i = 0; - sh = &local->self_heal; priv = this->private; - if (sh->buf) - GF_FREE (sh->buf); + if (sh->data_sh_info && strcmp (sh->data_sh_info, "")) + GF_FREE (sh->data_sh_info); - if (sh->xattr) { - for (i = 0; i < priv->child_count; i++) { - if (sh->xattr[i]) { - dict_unref (sh->xattr[i]); - sh->xattr[i] = NULL; - } - } - GF_FREE (sh->xattr); - } + if (sh->metadata_sh_info && strcmp (sh->metadata_sh_info, "")) + GF_FREE (sh->metadata_sh_info); - if (sh->child_errno) - GF_FREE (sh->child_errno); + GF_FREE (sh->buf); - if (sh->pending_matrix) { - for (i = 0; i < priv->child_count; i++) { - GF_FREE (sh->pending_matrix[i]); - } - GF_FREE (sh->pending_matrix); - } + GF_FREE (sh->parentbufs); - if (sh->delta_matrix) { - for (i = 0; i < priv->child_count; i++) { - GF_FREE (sh->delta_matrix[i]); - } - GF_FREE (sh->delta_matrix); - } + if (sh->inode) + inode_unref (sh->inode); - if (sh->sources) - GF_FREE (sh->sources); + afr_xattr_array_destroy (sh->xattr, priv->child_count); - if (sh->success) - GF_FREE (sh->success); + GF_FREE (sh->child_errno); - if (sh->locked_nodes) - GF_FREE (sh->locked_nodes); + afr_matrix_cleanup (sh->pending_matrix, priv->child_count); + afr_matrix_cleanup (sh->delta_matrix, priv->child_count); - if (sh->healing_fd && !sh->healing_fd_opened) { + GF_FREE (sh->sources); + + GF_FREE (sh->success); + + GF_FREE (sh->locked_nodes); + + if (sh->healing_fd) { fd_unref (sh->healing_fd); sh->healing_fd = NULL; } - if (sh->linkname) - GF_FREE ((char *)sh->linkname); + GF_FREE ((char *)sh->linkname); + + GF_FREE (sh->success_children); + + GF_FREE (sh->fresh_children); + + GF_FREE (sh->fresh_parent_dirs); loc_wipe (&sh->parent_loc); + loc_wipe (&sh->lookup_loc); + + GF_FREE (sh->checksum); + + GF_FREE (sh->write_needed); + if (sh->healing_fd) + fd_unref (sh->healing_fd); } void afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this) { - int i = 0; - afr_private_t * priv = NULL; + afr_private_t *priv = NULL; + int i = 0; priv = this->private; - for (i = 0; i < priv->child_count; i++) { - if (local->pending && local->pending[i]) - GF_FREE (local->pending[i]); - } - - GF_FREE (local->pending); - - if (local->internal_lock.locked_nodes) - GF_FREE (local->internal_lock.locked_nodes); + afr_matrix_cleanup (local->pending, priv->child_count); + afr_matrix_cleanup (local->transaction.txn_changelog, + priv->child_count); - if (local->internal_lock.inode_locked_nodes) - GF_FREE (local->internal_lock.inode_locked_nodes); + GF_FREE (local->internal_lock.locked_nodes); - if (local->internal_lock.entry_locked_nodes) - GF_FREE (local->internal_lock.entry_locked_nodes); + for (i = 0; local->internal_lock.inodelk[i].domain; i++) { + GF_FREE (local->internal_lock.inodelk[i].locked_nodes); + } - if (local->internal_lock.lower_locked_nodes) - GF_FREE (local->internal_lock.lower_locked_nodes); + GF_FREE (local->internal_lock.lower_locked_nodes); + afr_entry_lockee_cleanup (&local->internal_lock); - GF_FREE (local->transaction.child_errno); - GF_FREE (local->child_errno); + GF_FREE (local->transaction.pre_op); + GF_FREE (local->transaction.eager_lock); GF_FREE (local->transaction.basename); GF_FREE (local->transaction.new_basename); loc_wipe (&local->transaction.parent_loc); loc_wipe (&local->transaction.new_parent_loc); + + GF_FREE (local->transaction.postop_piggybacked); } void afr_local_cleanup (afr_local_t *local, xlator_t *this) { - int i; afr_private_t * priv = NULL; if (!local) @@ -369,16 +887,21 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this) if (local->xattr_req) dict_unref (local->xattr_req); + if (local->dict) + dict_unref (local->dict); + + GF_FREE(local->replies); + GF_FREE (local->child_up); + GF_FREE (local->child_errno); + + GF_FREE (local->fresh_children); + { /* lookup */ if (local->cont.lookup.xattrs) { - for (i = 0; i < priv->child_count; i++) { - if (local->cont.lookup.xattrs[i]) { - dict_unref (local->cont.lookup.xattrs[i]); - local->cont.lookup.xattrs[i] = NULL; - } - } + afr_reset_xattr (local->cont.lookup.xattrs, + priv->child_count); GF_FREE (local->cont.lookup.xattrs); local->cont.lookup.xattrs = NULL; } @@ -390,16 +913,24 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this) if (local->cont.lookup.inode) { inode_unref (local->cont.lookup.inode); } + + GF_FREE (local->cont.lookup.postparents); + + GF_FREE (local->cont.lookup.bufs); + + GF_FREE (local->cont.lookup.success_children); + + GF_FREE (local->cont.lookup.sources); + afr_matrix_cleanup (local->cont.lookup.pending_matrix, + priv->child_count); } { /* getxattr */ - if (local->cont.getxattr.name) - GF_FREE (local->cont.getxattr.name); + GF_FREE (local->cont.getxattr.name); } { /* lk */ - if (local->cont.lk.locked_nodes) - GF_FREE (local->cont.lk.locked_nodes); + GF_FREE (local->cont.lk.locked_nodes); } { /* create */ @@ -433,18 +964,40 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this) dict_unref (local->cont.setxattr.dict); } + { /* fsetxattr */ + if (local->cont.fsetxattr.dict) + dict_unref (local->cont.fsetxattr.dict); + } + { /* removexattr */ GF_FREE (local->cont.removexattr.name); } - + { /* xattrop */ + if (local->cont.xattrop.xattr) + dict_unref (local->cont.xattrop.xattr); + } + { /* fxattrop */ + if (local->cont.fxattrop.xattr) + dict_unref (local->cont.fxattrop.xattr); + } { /* symlink */ GF_FREE (local->cont.symlink.linkpath); } { /* opendir */ - if (local->cont.opendir.checksum) - GF_FREE (local->cont.opendir.checksum); + GF_FREE (local->cont.opendir.checksum); + } + + { /* readdirp */ + if (local->cont.readdir.dict) + dict_unref (local->cont.readdir.dict); } + + if (local->xdata_req) + dict_unref (local->xdata_req); + + if (local->xdata_rsp) + dict_unref (local->xdata_rsp); } @@ -465,107 +1018,227 @@ afr_frame_return (call_frame_t *frame) return call_count; } - -/** - * up_children_count - return the number of children that are up - */ - int -afr_up_children_count (int child_count, unsigned char *child_up) +afr_set_elem_count_get (unsigned char *elems, int child_count) { int i = 0; int ret = 0; for (i = 0; i < child_count; i++) - if (child_up[i]) + if (elems[i]) ret++; return ret; } +/** + * up_children_count - return the number of children that are up + */ -ino64_t -afr_itransform (ino64_t ino, int child_count, int child_index) +unsigned int +afr_up_children_count (unsigned char *child_up, unsigned int child_count) { - ino64_t scaled_ino = -1; - - if (ino == ((uint64_t) -1)) { - scaled_ino = ((uint64_t) -1); - goto out; - } - - scaled_ino = (ino * child_count) + child_index; + return afr_set_elem_count_get (child_up, child_count); +} -out: - return scaled_ino; +unsigned int +afr_locked_children_count (unsigned char *children, unsigned int child_count) +{ + return afr_set_elem_count_get (children, child_count); } +unsigned int +afr_pre_op_done_children_count (unsigned char *pre_op, + unsigned int child_count) +{ + return afr_set_elem_count_get (pre_op, child_count); +} -int -afr_deitransform_orig (ino64_t ino, int child_count) +gf_boolean_t +afr_is_fresh_lookup (loc_t *loc, xlator_t *this) { - int index = -1; + uint64_t ctx = 0; + int32_t ret = 0; - index = ino % child_count; + GF_ASSERT (loc); + GF_ASSERT (this); + GF_ASSERT (loc->inode); - return index; + ret = inode_ctx_get (loc->inode, this, &ctx); + if (0 == ret) + return _gf_false; + return _gf_true; } - -int -afr_deitransform (ino64_t ino, int child_count) +void +afr_update_loc_gfids (loc_t *loc, struct iatt *buf, struct iatt *postparent) { - return 0; + GF_ASSERT (loc); + GF_ASSERT (buf); + + uuid_copy (loc->gfid, buf->ia_gfid); + if (postparent) + uuid_copy (loc->pargfid, postparent->ia_gfid); } +/* + * Quota size xattrs are not maintained by afr. There is a + * possibility that they differ even when both the directory changelog xattrs + * suggest everything is fine. So if there is at least one 'source' check among + * the sources which has the maximum quota size. Otherwise check among all the + * available ones for maximum quota size. This way if there is a source and + * stale copies it always votes for the 'source'. + * */ -int -afr_self_heal_lookup_unwind (call_frame_t *frame, xlator_t *this) +static void +afr_handle_quota_size (afr_local_t *local, xlator_t *this, + dict_t *rsp_dict) { - afr_local_t *local = NULL; + int32_t *sources = NULL; + dict_t *xattr = NULL; + data_t *max_data = NULL; + int64_t max_quota_size = -1; + data_t *data = NULL; + int64_t *size = NULL; + int64_t quota_size = -1; + afr_private_t *priv = NULL; + int i = 0; + int ret = -1; + gf_boolean_t source_present = _gf_false; + + priv = this->private; + sources = local->cont.lookup.sources; + + if (rsp_dict == NULL) { + gf_log_callingfn (this->name, GF_LOG_ERROR, "%s: Invalid " + "response dictionary", local->loc.path); + return; + } - local = frame->local; + for (i = 0; i < priv->child_count; i++) { + if (sources[i]) { + source_present = _gf_true; + break; + } + } - if (local->govinda_gOvinda) { - afr_set_split_brain (this, local->cont.lookup.inode, _gf_true); + for (i = 0; i < priv->child_count; i++) { + /* + * If there is at least one source lets check + * for maximum quota sizes among sources, otherwise take the + * maximum of the ones present to be on the safer side. + */ + if (source_present && !sources[i]) + continue; + + xattr = local->cont.lookup.xattrs[i]; + if (!xattr) + continue; + + data = dict_get (xattr, QUOTA_SIZE_KEY); + if (!data) + continue; + + size = (int64_t*)data->data; + quota_size = ntoh64(*size); + gf_log (this->name, GF_LOG_DEBUG, "%s: %d, size: %"PRId64, + local->loc.path, i, quota_size); + if (quota_size > max_quota_size) { + if (max_data) + data_unref (max_data); + + max_quota_size = quota_size; + max_data = data_ref (data); + } } - AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, - local->cont.lookup.inode, - &local->cont.lookup.buf, - local->cont.lookup.xattr, - &local->cont.lookup.postparent); + if (max_data) { + ret = dict_set (rsp_dict, QUOTA_SIZE_KEY, max_data); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "%s: Failed to set " + "quota size", local->loc.path); + } - return 0; + data_unref (max_data); + } } - -static void -afr_lookup_collect_xattr (afr_local_t *local, xlator_t *this, - int child_index, dict_t *xattr) +int +afr_lookup_build_response_params (afr_local_t *local, xlator_t *this) { - uint32_t inodelk_count = 0; - uint32_t entrylk_count = 0; + struct iatt *buf = NULL; + struct iatt *postparent = NULL; + dict_t **xattr = NULL; + int32_t *success_children = NULL; + int32_t *sources = NULL; + afr_private_t *priv = NULL; + int32_t read_child = -1; + int ret = 0; + int i = 0; - int ret = 0; + GF_ASSERT (local); - if (afr_sh_has_metadata_pending (xattr, child_index, this)) { - local->self_heal.need_metadata_self_heal = _gf_true; - gf_log(this->name, GF_LOG_DEBUG, - "metadata self-heal is pending for %s.", - local->loc.path); - } + buf = &local->cont.lookup.buf; + postparent = &local->cont.lookup.postparent; + xattr = &local->cont.lookup.xattr; + priv = this->private; - if (afr_sh_has_entry_pending (xattr, child_index, this)) { - local->self_heal.need_entry_self_heal = _gf_true; - gf_log(this->name, GF_LOG_DEBUG, - "entry self-heal is pending for %s.", local->loc.path); + read_child = afr_inode_get_read_ctx (this, local->cont.lookup.inode, + local->fresh_children); + if (read_child < 0) { + ret = -1; + goto out; + } + success_children = local->cont.lookup.success_children; + sources = local->cont.lookup.sources; + memset (sources, 0, sizeof (*sources) * priv->child_count); + afr_children_intersection_get (local->fresh_children, success_children, + sources, priv->child_count); + if (!sources[read_child]) { + read_child = -1; + for (i = 0; i < priv->child_count; i++) { + if (sources[i]) { + read_child = i; + break; + } + } + } + if (read_child < 0) { + ret = -1; + goto out; } - if (afr_sh_has_data_pending (xattr, child_index, this)) { - local->self_heal.need_data_self_heal = _gf_true; - gf_log(this->name, GF_LOG_DEBUG, - "data self-heal is pending for %s.", local->loc.path); + gf_log (this->name, GF_LOG_DEBUG, "Building lookup response from %d", + read_child); + if (!*xattr) + *xattr = dict_ref (local->cont.lookup.xattrs[read_child]); + + *buf = local->cont.lookup.bufs[read_child]; + *postparent = local->cont.lookup.postparents[read_child]; + + if (dict_get (local->xattr_req, QUOTA_SIZE_KEY)) + afr_handle_quota_size (local, this, *xattr); + + if (IA_INVAL == local->cont.lookup.inode->ia_type) { + /* fix for RT #602 */ + local->cont.lookup.inode->ia_type = buf->ia_type; } +out: + return ret; +} + +static void +afr_lookup_update_lk_counts (afr_local_t *local, xlator_t *this, + int child_index, dict_t *xattr) +{ + uint32_t inodelk_count = 0; + uint32_t entrylk_count = 0; + int ret = -1; + uint32_t parent_entrylk = 0; + + GF_ASSERT (local); + GF_ASSERT (this); + GF_ASSERT (xattr); + GF_ASSERT (child_index >= 0); ret = dict_get_uint32 (xattr, GLUSTERFS_INODELK_COUNT, &inodelk_count); @@ -576,504 +1249,1248 @@ afr_lookup_collect_xattr (afr_local_t *local, xlator_t *this, &entrylk_count); if (ret == 0) local->entrylk_count += entrylk_count; + ret = dict_get_uint32 (xattr, GLUSTERFS_PARENT_ENTRYLK, + &parent_entrylk); + if (!ret) + local->cont.lookup.parent_entrylk += parent_entrylk; } - +/* + * It's important to maintain a commutative property on do_*_self_heal and + * found*; once set, they must not be cleared by a subsequent iteration or + * call, so that they represent a logical OR of all iterations and calls + * regardless of child/key order. That allows the caller to call us multiple + * times without having to use a separate variable as a "reduce" accumulator. + */ static void -afr_lookup_self_heal_check (xlator_t *this, afr_local_t *local, - struct iatt *buf, struct iatt *lookup_buf) +afr_lookup_set_self_heal_params_by_xattr (afr_local_t *local, xlator_t *this, + dict_t *xattr) { - if (FILETYPE_DIFFERS (buf, lookup_buf)) { - /* mismatching filetypes with same name - */ + afr_private_t *priv = NULL; + int i = 0; + int ret = -1; + void *pending_raw = NULL; + int32_t *pending = NULL; - gf_log (this->name, GF_LOG_NORMAL, - "filetype differs for %s ", local->loc.path); + GF_ASSERT (local); + GF_ASSERT (this); + GF_ASSERT (xattr); - local->govinda_gOvinda = 1; + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + ret = dict_get_ptr (xattr, priv->pending_key[i], + &pending_raw); + if (ret != 0) { + continue; + } + pending = pending_raw; + + if (pending[AFR_METADATA_TRANSACTION]) { + gf_log(this->name, GF_LOG_DEBUG, + "metadata self-heal is pending for %s.", + local->loc.path); + local->self_heal.do_metadata_self_heal = _gf_true; + } + + if (pending[AFR_ENTRY_TRANSACTION]) { + gf_log(this->name, GF_LOG_DEBUG, + "entry self-heal is pending for %s.", + local->loc.path); + local->self_heal.do_entry_self_heal = _gf_true; + } + + if (pending[AFR_DATA_TRANSACTION]) { + gf_log(this->name, GF_LOG_DEBUG, + "data self-heal is pending for %s.", + local->loc.path); + local->self_heal.do_data_self_heal = _gf_true; + } } +} +void +afr_lookup_check_set_metadata_split_brain (afr_local_t *local, xlator_t *this) +{ + int32_t *sources = NULL; + afr_private_t *priv = NULL; + int32_t subvol_status = 0; + int32_t *success_children = NULL; + dict_t **xattrs = NULL; + struct iatt *bufs = NULL; + int32_t **pending_matrix = NULL; + + priv = this->private; + + sources = GF_CALLOC (priv->child_count, sizeof (*sources), + gf_afr_mt_int32_t); + if (NULL == sources) + goto out; + success_children = local->cont.lookup.success_children; + xattrs = local->cont.lookup.xattrs; + bufs = local->cont.lookup.bufs; + pending_matrix = local->cont.lookup.pending_matrix; + afr_build_sources (this, xattrs, bufs, pending_matrix, + sources, success_children, AFR_METADATA_TRANSACTION, + &subvol_status, _gf_false); + if (subvol_status & SPLIT_BRAIN) + local->cont.lookup.possible_spb = _gf_true; +out: + GF_FREE (sources); +} + +static void +afr_detect_self_heal_by_iatt (afr_local_t *local, xlator_t *this, + struct iatt *buf, struct iatt *lookup_buf) +{ if (PERMISSION_DIFFERS (buf, lookup_buf)) { /* mismatching permissions */ - gf_log (this->name, GF_LOG_NORMAL, + gf_log (this->name, GF_LOG_DEBUG, "permissions differ for %s ", local->loc.path); - local->self_heal.need_metadata_self_heal = _gf_true; + local->self_heal.do_metadata_self_heal = _gf_true; } if (OWNERSHIP_DIFFERS (buf, lookup_buf)) { /* mismatching permissions */ - local->self_heal.need_metadata_self_heal = _gf_true; - gf_log (this->name, GF_LOG_NORMAL, + local->self_heal.do_metadata_self_heal = _gf_true; + gf_log (this->name, GF_LOG_DEBUG, "ownership differs for %s ", local->loc.path); } if (SIZE_DIFFERS (buf, lookup_buf) && IA_ISREG (buf->ia_type)) { - gf_log (this->name, GF_LOG_NORMAL, + gf_log (this->name, GF_LOG_DEBUG, "size differs for %s ", local->loc.path); - local->self_heal.need_data_self_heal = _gf_true; + local->self_heal.do_data_self_heal = _gf_true; } + if (uuid_compare (buf->ia_gfid, lookup_buf->ia_gfid)) { + /* mismatching gfid */ + gf_log (this->name, GF_LOG_DEBUG, + "%s: gfid different on subvolume", local->loc.path); + } } +static void +afr_detect_self_heal_by_split_brain_status (afr_local_t *local, xlator_t *this) +{ + gf_boolean_t split_brain = _gf_false; + afr_self_heal_t *sh = NULL; + + sh = &local->self_heal; + + split_brain = afr_is_split_brain (this, local->cont.lookup.inode); + split_brain = split_brain || local->cont.lookup.possible_spb; + if ((local->success_count > 0) && split_brain && + IA_ISREG (local->cont.lookup.inode->ia_type)) { + sh->force_confirm_spb = _gf_true; + gf_log (this->name, GF_LOG_DEBUG, + "split brain detected during lookup of %s.", + local->loc.path); + } +} static void -afr_lookup_done (call_frame_t *frame, xlator_t *this, struct iatt *lookup_buf) +afr_detect_self_heal_by_lookup_status (afr_local_t *local, xlator_t *this) { - int unwind = 1; - int source = -1; - int up_count = 0; - char sh_type_str[256] = {0,}; + GF_ASSERT (local); + GF_ASSERT (this); + + if ((local->success_count > 0) && (local->enoent_count > 0)) { + local->self_heal.do_metadata_self_heal = _gf_true; + local->self_heal.do_data_self_heal = _gf_true; + local->self_heal.do_entry_self_heal = _gf_true; + local->self_heal.do_gfid_self_heal = _gf_true; + local->self_heal.do_missing_entry_self_heal = _gf_true; + gf_log(this->name, GF_LOG_DEBUG, + "entries are missing in lookup of %s.", + local->loc.path); + } - afr_private_t *priv = NULL; - afr_local_t *local = NULL; + return; +} - priv = this->private; - local = frame->local; +gf_boolean_t +afr_can_self_heal_proceed (afr_self_heal_t *sh, afr_private_t *priv) +{ + GF_ASSERT (sh); + GF_ASSERT (priv); + + if (sh->force_confirm_spb) + return _gf_true; + return (sh->do_gfid_self_heal + || sh->do_missing_entry_self_heal + || (afr_data_self_heal_enabled (priv->data_self_heal) && + sh->do_data_self_heal) + || (priv->metadata_self_heal && sh->do_metadata_self_heal) + || (priv->entry_self_heal && sh->do_entry_self_heal)); +} - local->cont.lookup.postparent.ia_ino = local->cont.lookup.parent_ino; +afr_transaction_type +afr_transaction_type_get (ia_type_t ia_type) +{ + afr_transaction_type type = AFR_METADATA_TRANSACTION; - if (local->cont.lookup.ino) { - local->cont.lookup.buf.ia_ino = local->cont.lookup.ino; - } + GF_ASSERT (ia_type != IA_INVAL); - up_count = afr_up_children_count (priv->child_count, priv->child_up); - if (up_count == 1) { - gf_log (this->name, GF_LOG_DEBUG, - "Only 1 child up - do not attempt to detect self heal"); - goto unwind; + if (IA_ISDIR (ia_type)) { + type = AFR_ENTRY_TRANSACTION; + } else if (IA_ISREG (ia_type)) { + type = AFR_DATA_TRANSACTION; } + return type; +} - if (local->op_ret == 0) { - /* KLUDGE: assuming DHT will not itransform in - revalidate */ - if (local->cont.lookup.inode->ino) { - local->cont.lookup.buf.ia_ino = - local->cont.lookup.inode->ino; - } +int +afr_lookup_select_read_child (afr_local_t *local, xlator_t *this, + int32_t *read_child) +{ + ia_type_t ia_type = IA_INVAL; + int32_t source = -1; + int ret = -1; + dict_t **xattrs = NULL; + int32_t *success_children = NULL; + afr_transaction_type type = AFR_METADATA_TRANSACTION; + uuid_t *gfid = NULL; + + GF_ASSERT (local); + GF_ASSERT (this); + GF_ASSERT (local->success_count > 0); + + success_children = local->cont.lookup.success_children; + /*We can take the success_children[0] only because we already + *handle the conflicting children other wise, we could select the + *read_child based on wrong file type + */ + ia_type = local->cont.lookup.bufs[success_children[0]].ia_type; + type = afr_transaction_type_get (ia_type); + xattrs = local->cont.lookup.xattrs; + gfid = &local->cont.lookup.buf.ia_gfid; + source = afr_lookup_select_read_child_by_txn_type (this, local, xattrs, + type, *gfid); + if (source < 0) { + gf_log (this->name, GF_LOG_DEBUG, "failed to select source " + "for %s", local->loc.path); + goto out; } - if (local->success_count && local->enoent_count) { - local->self_heal.need_metadata_self_heal = _gf_true; - local->self_heal.need_data_self_heal = _gf_true; - local->self_heal.need_entry_self_heal = _gf_true; - gf_log(this->name, GF_LOG_NORMAL, - "entries are missing in lookup of %s.", - local->loc.path); - } + gf_log (this->name, GF_LOG_DEBUG, "Source selected as %d for %s", + source, local->loc.path); + *read_child = source; + ret = 0; +out: + return ret; +} - if (local->success_count) { - /* check for split-brain case in previous lookup */ - if (afr_is_split_brain (this, - local->cont.lookup.inode)) { - local->self_heal.need_data_self_heal = _gf_true; - gf_log(this->name, GF_LOG_NORMAL, - "split brain detected during lookup of " - "%s.", local->loc.path); - } - } +static inline gf_boolean_t +afr_is_transaction_running (afr_local_t *local) +{ + GF_ASSERT (local->fop == GF_FOP_LOOKUP); + return ((local->inodelk_count > 0) || (local->entrylk_count > 0)); +} - if ((local->self_heal.need_metadata_self_heal - || local->self_heal.need_data_self_heal - || local->self_heal.need_entry_self_heal) - && ((!local->cont.lookup.is_revalidate) - || (local->op_ret != -1))) { +void +afr_launch_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode, + gf_boolean_t background, ia_type_t ia_type, char *reason, + void (*gfid_sh_success_cbk) (call_frame_t *sh_frame, + xlator_t *this), + int (*unwind) (call_frame_t *frame, xlator_t *this, + int32_t op_ret, int32_t op_errno, + int32_t sh_failed)) +{ + afr_local_t *local = NULL; + char sh_type_str[256] = {0,}; + char *bg = ""; - if (local->inodelk_count || local->entrylk_count) { + GF_ASSERT (frame); + GF_ASSERT (this); + GF_ASSERT (inode); + GF_ASSERT (ia_type != IA_INVAL); - /* Someone else is doing self-heal on this file. - So just make a best effort to set the read-subvolume - and return */ + local = frame->local; + local->self_heal.background = background; + local->self_heal.type = ia_type; + local->self_heal.unwind = unwind; + local->self_heal.gfid_sh_success_cbk = gfid_sh_success_cbk; + + afr_self_heal_type_str_get (&local->self_heal, + sh_type_str, + sizeof (sh_type_str)); + + if (background) + bg = "background"; + gf_log (this->name, GF_LOG_DEBUG, + "%s %s self-heal triggered. path: %s, reason: %s", bg, + sh_type_str, local->loc.path, reason); + + afr_self_heal (frame, this, inode); +} - if (IA_ISREG (local->cont.lookup.inode->ia_type)) { - source = afr_self_heal_get_source (this, local, local->cont.lookup.xattrs); +unsigned int +afr_gfid_missing_count (const char *xlator_name, int32_t *success_children, + struct iatt *bufs, unsigned int child_count, + const char *path) +{ + unsigned int gfid_miss_count = 0; + int i = 0; + struct iatt *child1 = NULL; - if (source >= 0) { - afr_set_read_child (this, - local->cont.lookup.inode, - source); - } - } - } else { - if (!local->cont.lookup.inode->ia_type) { - /* fix for RT #602 */ - local->cont.lookup.inode->ia_type = - lookup_buf->ia_type; - } + for (i = 0; i < child_count; i++) { + if (success_children[i] == -1) + break; + child1 = &bufs[success_children[i]]; + if (uuid_is_null (child1->ia_gfid)) { + gf_log (xlator_name, GF_LOG_DEBUG, "%s: gfid is null" + " on subvolume %d", path, success_children[i]); + gfid_miss_count++; + } + } - local->self_heal.background = _gf_true; - local->self_heal.type = local->cont.lookup.buf.ia_type; - local->self_heal.unwind = afr_self_heal_lookup_unwind; + return gfid_miss_count; +} - unwind = 0; +static int +afr_lookup_gfid_missing_count (afr_local_t *local, xlator_t *this) +{ + int32_t *success_children = NULL; + afr_private_t *priv = NULL; + struct iatt *bufs = NULL; + int miss_count = 0; - afr_self_heal_type_str_get(&local->self_heal, - sh_type_str, - sizeof(sh_type_str)); + priv = this->private; + bufs = local->cont.lookup.bufs; + success_children = local->cont.lookup.success_children; + + miss_count = afr_gfid_missing_count (this->name, success_children, + bufs, priv->child_count, + local->loc.path); + return miss_count; +} - gf_log (this->name, GF_LOG_NORMAL, "background %s " - "self-heal triggered. path: %s", - sh_type_str, local->loc.path); +gf_boolean_t +afr_conflicting_iattrs (struct iatt *bufs, int32_t *success_children, + unsigned int child_count, const char *path, + const char *xlator_name) +{ + gf_boolean_t conflicting = _gf_false; + int i = 0; + struct iatt *child1 = NULL; + struct iatt *child2 = NULL; + uuid_t *gfid = NULL; - afr_self_heal (frame, this); + for (i = 0; i < child_count; i++) { + if (success_children[i] == -1) + break; + child1 = &bufs[success_children[i]]; + if ((!gfid) && (!uuid_is_null (child1->ia_gfid))) + gfid = &child1->ia_gfid; + + if (i == 0) + continue; + + child2 = &bufs[success_children[i-1]]; + if (FILETYPE_DIFFERS (child1, child2)) { + gf_log (xlator_name, GF_LOG_DEBUG, "%s: filetype " + "differs on subvolumes (%d, %d)", path, + success_children[i-1], success_children[i]); + conflicting = _gf_true; + goto out; + } + if (!gfid || uuid_is_null (child1->ia_gfid)) + continue; + if (uuid_compare (*gfid, child1->ia_gfid)) { + gf_log (xlator_name, GF_LOG_DEBUG, "%s: gfid differs" + " on subvolume %d", path, success_children[i]); + conflicting = _gf_true; + goto out; } } +out: + return conflicting; +} + +/* afr_update_gfid_from_iatts: This function should be called only if the + * iatts are not conflicting. + */ +void +afr_update_gfid_from_iatts (uuid_t uuid, struct iatt *bufs, + int32_t *success_children, unsigned int child_count) +{ + uuid_t *gfid = NULL; + int i = 0; + int child = 0; -unwind: - if (unwind) { - AFR_STACK_UNWIND (lookup, frame, local->op_ret, - local->op_errno, - local->cont.lookup.inode, - &local->cont.lookup.buf, - local->cont.lookup.xattr, - &local->cont.lookup.postparent); + for (i = 0; i < child_count; i++) { + child = success_children[i]; + if (child == -1) + break; + if ((!gfid) && (!uuid_is_null (bufs[child].ia_gfid))) { + gfid = &bufs[child].ia_gfid; + } else if (gfid && (!uuid_is_null (bufs[child].ia_gfid))) { + if (uuid_compare (*gfid, bufs[child].ia_gfid)) { + GF_ASSERT (0); + goto out; + } + } } + if (gfid && (!uuid_is_null (*gfid))) + uuid_copy (uuid, *gfid); +out: + return; } +static gf_boolean_t +afr_lookup_conflicting_entries (afr_local_t *local, xlator_t *this) +{ + afr_private_t *priv = NULL; + gf_boolean_t conflict = _gf_false; -/* - * During a lookup, some errors are more "important" than - * others in that they must be given higher priority while - * returning to the user. - * - * The hierarchy is ESTALE > ENOENT > others - * - */ + priv = this->private; + conflict = afr_conflicting_iattrs (local->cont.lookup.bufs, + local->cont.lookup.success_children, + priv->child_count, local->loc.path, + this->name); + return conflict; +} -static gf_boolean_t -__error_more_important (int32_t old_errno, int32_t new_errno) +gf_boolean_t +afr_open_only_data_self_heal (char *data_self_heal) { - gf_boolean_t ret = _gf_true; + return !strcmp (data_self_heal, "open"); +} - /* Nothing should ever overwrite ESTALE */ - if (old_errno == ESTALE) - ret = _gf_false; +gf_boolean_t +afr_data_self_heal_enabled (char *data_self_heal) +{ + gf_boolean_t enabled = _gf_false; - /* Nothing should overwrite ENOENT, except ESTALE */ - else if ((old_errno == ENOENT) && (new_errno != ESTALE)) - ret = _gf_false; + if (gf_string2boolean (data_self_heal, &enabled) == -1) { + enabled = !strcmp (data_self_heal, "open"); + GF_ASSERT (enabled); + } - return ret; + return enabled; } +static void +afr_lookup_set_self_heal_params (afr_local_t *local, xlator_t *this) +{ + int i = 0; + struct iatt *bufs = NULL; + dict_t **xattr = NULL; + afr_private_t *priv = NULL; + int32_t child1 = -1; + int32_t child2 = -1; + afr_self_heal_t *sh = NULL; + + priv = this->private; + sh = &local->self_heal; + + afr_detect_self_heal_by_lookup_status (local, this); + + if (afr_lookup_gfid_missing_count (local, this)) + local->self_heal.do_gfid_self_heal = _gf_true; + + if (_gf_true == afr_lookup_conflicting_entries (local, this)) + local->self_heal.do_missing_entry_self_heal = _gf_true; + else + afr_update_gfid_from_iatts (local->self_heal.sh_gfid_req, + local->cont.lookup.bufs, + local->cont.lookup.success_children, + priv->child_count); + + bufs = local->cont.lookup.bufs; + for (i = 1; i < local->success_count; i++) { + child1 = local->cont.lookup.success_children[i-1]; + child2 = local->cont.lookup.success_children[i]; + afr_detect_self_heal_by_iatt (local, this, + &bufs[child1], &bufs[child2]); + } + + xattr = local->cont.lookup.xattrs; + for (i = 0; i < local->success_count; i++) { + child1 = local->cont.lookup.success_children[i]; + afr_lookup_set_self_heal_params_by_xattr (local, this, + xattr[child1]); + } + if (afr_open_only_data_self_heal (priv->data_self_heal)) + sh->do_data_self_heal = _gf_false; + if (sh->do_metadata_self_heal) + afr_lookup_check_set_metadata_split_brain (local, this); + afr_detect_self_heal_by_split_brain_status (local, this); +} int -afr_fresh_lookup_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, dict_t *xattr, - struct iatt *postparent) +afr_self_heal_lookup_unwind (call_frame_t *frame, xlator_t *this, + int32_t op_ret, int32_t op_errno, + int32_t sh_failed) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - struct iatt * lookup_buf = NULL; + afr_local_t *local = NULL; + int ret = -1; + dict_t *xattr = NULL; - int call_count = -1; - int child_index = -1; - int first_up_child = -1; + local = frame->local; - child_index = (long) cookie; + if (op_ret == -1) { + local->op_ret = -1; + local->op_errno = afr_most_important_error(local->op_errno, + op_errno, _gf_true); + + goto out; + } else { + local->op_ret = 0; + } + + afr_lookup_done_success_action (frame, this, _gf_true); + xattr = local->cont.lookup.xattr; + if (xattr) { + ret = dict_set_int32 (xattr, "sh-failed", sh_failed); + if (ret) + gf_log (this->name, GF_LOG_ERROR, "%s: Failed to set " + "sh-failed to %d", local->loc.path, sh_failed); + + if (local->self_heal.actual_sh_started == _gf_true && + sh_failed == 0) { + ret = dict_set_int32 (xattr, "actual-sh-done", 1); + if (ret) + gf_log(this->name, GF_LOG_ERROR, "%s: Failed to" + " set actual-sh-done to %d", + local->loc.path, + local->self_heal.actual_sh_started); + } + } +out: + AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, + local->cont.lookup.inode, &local->cont.lookup.buf, + local->cont.lookup.xattr, + &local->cont.lookup.postparent); + + return 0; +} + +//TODO: At the moment only lookup needs this, so not doing any checks, in the +// future we will have to do fop specific operations +void +afr_post_gfid_sh_success (call_frame_t *sh_frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_local_t *sh_local = NULL; + afr_private_t *priv = NULL; + afr_self_heal_t *sh = NULL; + int i = 0; + struct iatt *lookup_bufs = NULL; + struct iatt *lookup_parentbufs = NULL; + + sh_local = sh_frame->local; + sh = &sh_local->self_heal; + local = sh->orig_frame->local; + lookup_bufs = local->cont.lookup.bufs; + lookup_parentbufs = local->cont.lookup.postparents; priv = this->private; - LOCK (&frame->lock); - { - local = frame->local; + memcpy (lookup_bufs, sh->buf, priv->child_count * sizeof (*sh->buf)); + memcpy (lookup_parentbufs, sh->parentbufs, + priv->child_count * sizeof (*sh->parentbufs)); - lookup_buf = &local->cont.lookup.buf; + afr_reset_xattr (local->cont.lookup.xattrs, priv->child_count); + if (local->cont.lookup.xattr) { + dict_unref (local->cont.lookup.xattr); + local->cont.lookup.xattr = NULL; + } - if (op_ret == -1) { - if (op_errno == ENOENT) - local->enoent_count++; + for (i = 0; i < priv->child_count; i++) { + if (sh->xattr[i]) + local->cont.lookup.xattrs[i] = dict_ref (sh->xattr[i]); + } - if (__error_more_important (local->op_errno, op_errno)) - local->op_errno = op_errno; + afr_reset_children (local->cont.lookup.success_children, + priv->child_count); + afr_children_copy (local->cont.lookup.success_children, + sh->fresh_children, priv->child_count); +} - if (local->op_errno == ESTALE) { - local->op_ret = -1; - } +static void +afr_lookup_perform_self_heal (call_frame_t *frame, xlator_t *this, + gf_boolean_t *sh_launched) +{ + unsigned int up_count = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + char *reason = NULL; - goto unlock; - } + GF_ASSERT (sh_launched); + *sh_launched = _gf_false; + priv = this->private; + local = frame->local; - afr_lookup_collect_xattr (local, this, child_index, xattr); + up_count = afr_up_children_count (local->child_up, priv->child_count); + if (up_count == 1) { + gf_log (this->name, GF_LOG_DEBUG, + "Only 1 child up - do not attempt to detect self heal"); + goto out; + } - first_up_child = afr_first_up_child (priv); + afr_lookup_set_self_heal_params (local, this); + if (afr_can_self_heal_proceed (&local->self_heal, priv)) { + if (afr_is_transaction_running (local) && + (!local->allow_sh_for_running_transaction)) + goto out; + + reason = "lookup detected pending operations"; + afr_launch_self_heal (frame, this, local->cont.lookup.inode, + _gf_true, local->cont.lookup.buf.ia_type, + reason, afr_post_gfid_sh_success, + afr_self_heal_lookup_unwind); + *sh_launched = _gf_true; + } +out: + return; +} + +void +afr_get_fresh_children (int32_t *success_children, int32_t *sources, + int32_t *fresh_children, unsigned int child_count) +{ + unsigned int i = 0; + unsigned int j = 0; - if (child_index == first_up_child) { - local->cont.lookup.ino = - afr_itransform (buf->ia_ino, - priv->child_count, - first_up_child); + GF_ASSERT (success_children); + GF_ASSERT (sources); + GF_ASSERT (fresh_children); + + afr_reset_children (fresh_children, child_count); + for (i = 0; i < child_count; i++) { + if (success_children[i] == -1) + break; + if (afr_is_read_child (success_children, sources, child_count, + success_children[i])) { + fresh_children[j] = success_children[i]; + j++; } + } +} - if (local->success_count == 0) { - if (local->op_errno != ESTALE) - local->op_ret = op_ret; +static int +afr_lookup_set_read_ctx (afr_local_t *local, xlator_t *this, int32_t read_child) +{ + afr_private_t *priv = NULL; - local->cont.lookup.inode = inode_ref (inode); - local->cont.lookup.xattr = dict_ref (xattr); - local->cont.lookup.xattrs[child_index] = dict_ref (xattr); - local->cont.lookup.postparent = *postparent; + GF_ASSERT (read_child >= 0); - if (priv->first_lookup && inode->ino == 1) { - gf_log (this->name, GF_LOG_NORMAL, - "added root inode"); - priv->root_inode = inode_ref (inode); - priv->first_lookup = 0; - } + priv = this->private; + afr_get_fresh_children (local->cont.lookup.success_children, + local->cont.lookup.sources, + local->fresh_children, priv->child_count); + afr_inode_set_read_ctx (this, local->cont.lookup.inode, read_child, + local->fresh_children); - *lookup_buf = *buf; + return 0; +} - lookup_buf->ia_ino = afr_itransform (buf->ia_ino, - priv->child_count, - child_index); - if (priv->read_child >= 0) { - afr_set_read_child (this, - local->cont.lookup.inode, - priv->read_child); - } else { - afr_set_read_child (this, - local->cont.lookup.inode, - child_index); - } +int +afr_lookup_done_success_action (call_frame_t *frame, xlator_t *this, + gf_boolean_t fail_conflict) +{ + int32_t read_child = -1; + int32_t ret = -1; + afr_local_t *local = NULL; + gf_boolean_t fresh_lookup = _gf_false; - } else { - afr_lookup_self_heal_check (this, local, buf, lookup_buf); - - if (child_index == local->read_child_index) { - /* - lookup has succeeded on the read child. - So use its inode number - */ - if (local->cont.lookup.xattr) - dict_unref (local->cont.lookup.xattr); - - local->cont.lookup.xattr = dict_ref (xattr); - local->cont.lookup.xattrs[child_index] = dict_ref (xattr); - local->cont.lookup.postparent = *postparent; - - *lookup_buf = *buf; - - if (priv->read_child >= 0) { - afr_set_read_child (this, - local->cont.lookup.inode, - priv->read_child); - } else { - afr_set_read_child (this, - local->cont.lookup.inode, - local->read_child_index); - } - } + local = frame->local; + fresh_lookup = local->cont.lookup.fresh_lookup; - } + if (local->loc.parent == NULL) + fail_conflict = _gf_true; - local->success_count++; + if (afr_lookup_conflicting_entries (local, this)) { + if (fail_conflict == _gf_false) + ret = 0; + goto out; } -unlock: - UNLOCK (&frame->lock); - call_count = afr_frame_return (frame); + ret = afr_lookup_select_read_child (local, this, &read_child); + if (!afr_is_transaction_running (local) || fresh_lookup) { + if (read_child < 0) + goto out; - if (call_count == 0) { - afr_lookup_done (frame, this, lookup_buf); + ret = afr_lookup_set_read_ctx (local, this, read_child); + if (ret) + goto out; } - return 0; -} + ret = afr_lookup_build_response_params (local, this); + if (ret) + goto out; + afr_update_loc_gfids (&local->loc, + &local->cont.lookup.buf, + &local->cont.lookup.postparent); + ret = 0; +out: + if (ret) { + local->op_ret = -1; + local->op_errno = EIO; + } + return ret; +} int -afr_revalidate_lookup_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, dict_t *xattr, - struct iatt *postparent) +afr_lookup_get_latest_subvol (afr_local_t *local, xlator_t *this) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - struct iatt * lookup_buf = NULL; + afr_private_t *priv = NULL; + int32_t *success_children = NULL; + struct iatt *bufs = NULL; + int i = 0; + int child = 0; + int lsubvol = -1; - int call_count = -1; - int child_index = -1; - int first_up_child = -1; + priv = this->private; + success_children = local->cont.lookup.success_children; + bufs = local->cont.lookup.bufs; + for (i = 0; i < priv->child_count; i++) { + child = success_children[i]; + if (child == -1) + break; + if (uuid_is_null (bufs[child].ia_gfid)) + continue; + if (lsubvol < 0) { + lsubvol = child; + } else if (bufs[lsubvol].ia_ctime < bufs[child].ia_ctime) { + lsubvol = child; + } else if ((bufs[lsubvol].ia_ctime == bufs[child].ia_ctime) && + (bufs[lsubvol].ia_ctime_nsec < bufs[child].ia_ctime_nsec)) { + lsubvol = child; + } + } + return lsubvol; +} + +void +afr_lookup_mark_other_entries_stale (afr_local_t *local, xlator_t *this, + int subvol) +{ + afr_private_t *priv = NULL; + int32_t *success_children = NULL; + struct iatt *bufs = NULL; + int i = 0; + int child = 0; - child_index = (long) cookie; priv = this->private; + success_children = local->cont.lookup.success_children; + bufs = local->cont.lookup.bufs; + memcpy (local->fresh_children, success_children, + sizeof (*success_children) * priv->child_count); + for (i = 0; i < priv->child_count; i++) { + child = local->fresh_children[i]; + if (child == -1) + break; + if (child == subvol) + continue; + if (uuid_is_null (bufs[child].ia_gfid) && + (bufs[child].ia_type == bufs[subvol].ia_type)) + continue; + afr_children_rm_child (success_children, child, + priv->child_count); + local->success_count--; + } + afr_reset_children (local->fresh_children, priv->child_count); +} - LOCK (&frame->lock); - { - local = frame->local; +void +afr_succeed_lookup_on_latest_iatt (afr_local_t *local, xlator_t *this) +{ + int lsubvol = 0; - lookup_buf = &local->cont.lookup.buf; + if (!afr_lookup_conflicting_entries (local, this)) + goto out; - if (op_ret == -1) { - if (op_errno == ENOENT) - local->enoent_count++; + lsubvol = afr_lookup_get_latest_subvol (local, this); + if (lsubvol < 0) + goto out; + afr_lookup_mark_other_entries_stale (local, this, lsubvol); +out: + return; +} - if (__error_more_important (local->op_errno, op_errno)) - local->op_errno = op_errno; +gf_boolean_t +afr_is_entry_possibly_under_creation (afr_local_t *local, xlator_t *this) +{ + /* + * We need to perform this test in lookup done and treat on going + * create/DELETE as ENOENT. + * Reason: + Multiple clients A, B and C are attempting 'mkdir -p /mnt/a/b/c' + + 1 Client A is in the middle of mkdir(/a). It has acquired lock. + It has performed mkdir(/a) on one subvol, and second one is still + in progress + 2 Client B performs a lookup, sees directory /a on one, + ENOENT on the other, succeeds lookup. + 3 Client B performs lookup on /a/b on both subvols, both return ENOENT + (one subvol because /a/b does not exist, another because /a + itself does not exist) + 4 Client B proceeds to mkdir /a/b. It obtains entrylk on inode=/a with + basename=b on one subvol, but fails on other subvol as /a is yet to + be created by Client A. + 5 Client A finishes mkdir of /a on other subvol + 6 Client C also attempts to create /a/b, lookup returns ENOENT on + both subvols. + 7 Client C tries to obtain entrylk on on inode=/a with basename=b, + obtains on one subvol (where B had failed), and waits for B to unlock + on other subvol. + 8 Client B finishes mkdir() on one subvol with GFID-1 and completes + transaction and unlocks + 9 Client C gets the lock on the second subvol, At this stage second + subvol already has /a/b created from Client B, but Client C does not + check that in the middle of mkdir transaction + 10 Client C attempts mkdir /a/b on both subvols. It succeeds on + ONLY ONE (where Client B could not get lock because of + missing parent /a dir) with GFID-2, and gets EEXIST from ONE subvol. + This way we have /a/b in GFID mismatch. One subvol got GFID-1 because + Client B performed transaction on only one subvol (because entrylk() + could not be obtained on second subvol because of missing parent dir -- + caused by premature/speculative succeeding of lookup() on /a when locks + are detected). Other subvol gets GFID-2 from Client C because while + it was waiting for entrylk() on both subvols, Client B was in the + middle of creating mkdir() on only one subvol, and Client C does not + "expect" this when it is between lock() and pre-op()/op() phase of the + transaction. + */ + if (local->cont.lookup.parent_entrylk && local->enoent_count) + return _gf_true; + + return _gf_false; +} - if (local->op_errno == ESTALE) { - local->op_ret = -1; - } - goto unlock; - } +static void +afr_lookup_done (call_frame_t *frame, xlator_t *this) +{ + int unwind = 1; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int ret = -1; + gf_boolean_t sh_launched = _gf_false; + gf_boolean_t fail_conflict = _gf_false; + int gfid_miss_count = 0; + int enotconn_count = 0; + int up_children_count = 0; - afr_lookup_collect_xattr (local, this, child_index, xattr); + priv = this->private; + local = frame->local; - first_up_child = afr_first_up_child (priv); + if (afr_is_entry_possibly_under_creation (local, this)) { + local->op_ret = -1; + local->op_errno = ENOENT; + goto unwind; + } - if (child_index == first_up_child) { - local->cont.lookup.ino = - afr_itransform (buf->ia_ino, - priv->child_count, - first_up_child); - } + if (local->op_ret < 0) + goto unwind; - /* in case of revalidate, we need to send stat of the - * child whose stat was sent during the first lookup. - * (so that time stamp does not vary with revalidate. - * in case it is down, stat of the fist success will - * be replied */ + if (local->cont.lookup.parent_entrylk && local->success_count > 1) + afr_succeed_lookup_on_latest_iatt (local, this); + + gfid_miss_count = afr_lookup_gfid_missing_count (local, this); + up_children_count = afr_up_children_count (local->child_up, + priv->child_count); + enotconn_count = priv->child_count - up_children_count; + if ((gfid_miss_count == local->success_count) && + (enotconn_count > 0)) { + local->op_ret = -1; + local->op_errno = EIO; + gf_log (this->name, GF_LOG_ERROR, "Failing lookup for %s, " + "LOOKUP on a file without gfid is not allowed when " + "some of the children are down", local->loc.path); + goto unwind; + } - /* inode number should be preserved across revalidates */ + if ((gfid_miss_count == local->success_count) && + uuid_is_null (local->cont.lookup.gfid_req)) { + local->op_ret = -1; + local->op_errno = ENODATA; + gf_log (this->name, GF_LOG_ERROR, "%s: No gfid present", + local->loc.path); + goto unwind; + } - if (local->success_count == 0) { - if (local->op_errno != ESTALE) - local->op_ret = op_ret; + if (gfid_miss_count && uuid_is_null (local->cont.lookup.gfid_req)) + fail_conflict = _gf_true; + ret = afr_lookup_done_success_action (frame, this, fail_conflict); + if (ret) + goto unwind; + uuid_copy (local->self_heal.sh_gfid_req, local->cont.lookup.gfid_req); - local->cont.lookup.inode = inode_ref (inode); - local->cont.lookup.xattr = dict_ref (xattr); - local->cont.lookup.xattrs[child_index] = dict_ref (xattr); - local->cont.lookup.postparent = *postparent; + afr_lookup_perform_self_heal (frame, this, &sh_launched); + if (sh_launched) { + unwind = 0; + goto unwind; + } - *lookup_buf = *buf; + unwind: + if (unwind) { + AFR_STACK_UNWIND (lookup, frame, local->op_ret, + local->op_errno, local->cont.lookup.inode, + &local->cont.lookup.buf, + local->cont.lookup.xattr, + &local->cont.lookup.postparent); + } +} - lookup_buf->ia_ino = afr_itransform (buf->ia_ino, - priv->child_count, - child_index); +/* + * During a lookup, some errors are more "important" than + * others in that they must be given higher priority while + * returning to the user. + * + * The hierarchy is ESTALE > EIO > ENOENT > others + */ +int32_t +afr_most_important_error(int32_t old_errno, int32_t new_errno, + gf_boolean_t eio) +{ + if (old_errno == ESTALE || new_errno == ESTALE) + return ESTALE; + if (eio && (old_errno == EIO || new_errno == EIO)) + return EIO; + if (old_errno == ENOENT || new_errno == ENOENT) + return ENOENT; + + return new_errno; +} - if (priv->read_child >= 0) { - afr_set_read_child (this, - local->cont.lookup.inode, - priv->read_child); - } else { - afr_set_read_child (this, - local->cont.lookup.inode, - child_index); - } +int32_t +afr_resultant_errno_get (int32_t *children, + int *child_errno, unsigned int child_count) +{ + int i = 0; + int32_t op_errno = 0; + int child = 0; + for (i = 0; i < child_count; i++) { + if (children) { + child = children[i]; + if (child == -1) + break; } else { - afr_lookup_self_heal_check (this, local, buf, lookup_buf); + child = i; + } + op_errno = afr_most_important_error(op_errno, + child_errno[child], + _gf_false); + } + return op_errno; +} - if (child_index == local->read_child_index) { +static void +afr_lookup_handle_error (afr_local_t *local, int32_t op_ret, int32_t op_errno) +{ + GF_ASSERT (local); + if (op_errno == ENOENT) + local->enoent_count++; - /* - lookup has succeeded on the read child. - So use its inode number - */ + local->op_errno = afr_most_important_error(local->op_errno, op_errno, + _gf_false); - if (local->cont.lookup.xattr) - dict_unref (local->cont.lookup.xattr); + if (local->op_errno == ESTALE) { + local->op_ret = -1; + } +} - local->cont.lookup.xattr = dict_ref (xattr); - local->cont.lookup.xattrs[child_index] = dict_ref (xattr); - local->cont.lookup.postparent = *postparent; +static void +afr_set_root_inode_on_first_lookup (afr_local_t *local, xlator_t *this, + inode_t *inode) +{ + afr_private_t *priv = NULL; + GF_ASSERT (inode); - *lookup_buf = *buf; + if (!__is_root_gfid (inode->gfid)) + goto out; + if (!afr_is_fresh_lookup (&local->loc, this)) + goto out; + priv = this->private; + if ((priv->first_lookup)) { + gf_log (this->name, GF_LOG_INFO, "added root inode"); + priv->root_inode = inode_ref (inode); + priv->first_lookup = 0; + } +out: + return; +} - if (priv->read_child >= 0) { - afr_set_read_child (this, - local->cont.lookup.inode, - priv->read_child); - } else { - afr_set_read_child (this, - local->cont.lookup.inode, - local->read_child_index); - } - } +static void +afr_lookup_cache_args (afr_local_t *local, int child_index, dict_t *xattr, + struct iatt *buf, struct iatt *postparent) +{ + GF_ASSERT (child_index >= 0); + local->cont.lookup.xattrs[child_index] = dict_ref (xattr); + local->cont.lookup.postparents[child_index] = *postparent; + local->cont.lookup.bufs[child_index] = *buf; +} + +static void +afr_lookup_handle_first_success (afr_local_t *local, xlator_t *this, + inode_t *inode, struct iatt *buf) +{ + local->cont.lookup.inode = inode_ref (inode); + local->cont.lookup.buf = *buf; + afr_set_root_inode_on_first_lookup (local, this, inode); +} + +static int32_t +afr_discovery_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) +{ + int ret = 0; + char *pathinfo = NULL; + gf_boolean_t is_local = _gf_false; + afr_private_t *priv = NULL; + int32_t child_index = -1; + + if (op_ret != 0) { + goto out; + } + ret = dict_get_str (dict, GF_XATTR_PATHINFO_KEY, &pathinfo); + if (ret != 0) { + goto out; + } + + ret = afr_local_pathinfo (pathinfo, &is_local); + if (ret) { + goto out; + } + + priv = this->private; + /* + * Note that one local subvolume will override another here. The only + * way to avoid that would be to retain extra information about whether + * the previous read_child is local, and it's just not worth it. Even + * the slowest local subvolume is far preferable to a remote one. + */ + if (is_local) { + child_index = (int32_t)(long)cookie; + gf_log (this->name, GF_LOG_INFO, + "selecting local read_child %s", + priv->children[child_index]->name); + priv->read_child = child_index; + } + +out: + STACK_DESTROY(frame->root); + return 0; +} + +static void +afr_attempt_local_discovery (xlator_t *this, int32_t child_index) +{ + call_frame_t *newframe = NULL; + loc_t tmploc = {0,}; + afr_private_t *priv = this->private; + + newframe = create_frame(this,this->ctx->pool); + if (!newframe) { + return; + } + + tmploc.gfid[sizeof(tmploc.gfid)-1] = 1; + STACK_WIND_COOKIE (newframe, afr_discovery_cbk, + (void *)(long)child_index, + priv->children[child_index], + priv->children[child_index]->fops->getxattr, + &tmploc, GF_XATTR_PATHINFO_KEY, NULL); +} + +static void +afr_lookup_handle_success (afr_local_t *local, xlator_t *this, int32_t child_index, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xattr, + struct iatt *postparent) +{ + afr_private_t *priv = this->private; + + if (local->success_count == 0) { + if (local->op_errno != ESTALE) { + local->op_ret = op_ret; + local->op_errno = 0; } + afr_lookup_handle_first_success (local, this, inode, buf); + } + afr_lookup_update_lk_counts (local, this, + child_index, xattr); + + afr_lookup_cache_args (local, child_index, xattr, + buf, postparent); - local->success_count++; + if (local->do_discovery && (priv->read_child == (-1))) { + afr_attempt_local_discovery(this,child_index); } + + local->cont.lookup.success_children[local->success_count] = child_index; + local->success_count++; +} + +int +afr_lookup_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + inode_t *inode, struct iatt *buf, dict_t *xattr, + struct iatt *postparent) +{ + afr_local_t * local = NULL; + int call_count = -1; + int child_index = -1; + + child_index = (long) cookie; + + LOCK (&frame->lock); + { + local = frame->local; + + if (op_ret == -1) { + afr_lookup_handle_error (local, op_ret, op_errno); + goto unlock; + } + afr_lookup_handle_success (local, this, child_index, op_ret, + op_errno, inode, buf, xattr, + postparent); + + } unlock: UNLOCK (&frame->lock); call_count = afr_frame_return (frame); - if (call_count == 0) { - afr_lookup_done (frame, this, lookup_buf); + afr_lookup_done (frame, this); } - return 0; + return 0; } - int -afr_lookup (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *xattr_req) +afr_lookup_cont_init (afr_local_t *local, unsigned int child_count) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int ret = -1; - int i = 0; + int ret = -ENOMEM; + struct iatt *iatts = NULL; + int32_t *success_children = NULL; + int32_t *sources = NULL; + int32_t **pending_matrix = NULL; + + GF_ASSERT (local); + local->cont.lookup.xattrs = GF_CALLOC (child_count, + sizeof (*local->cont.lookup.xattr), + gf_afr_mt_dict_t); + if (NULL == local->cont.lookup.xattrs) + goto out; - fop_lookup_cbk_t callback; + iatts = GF_CALLOC (child_count, sizeof (*iatts), gf_afr_mt_iatt); + if (NULL == iatts) + goto out; + local->cont.lookup.postparents = iatts; - int call_count = 0; + iatts = GF_CALLOC (child_count, sizeof (*iatts), gf_afr_mt_iatt); + if (NULL == iatts) + goto out; + local->cont.lookup.bufs = iatts; + + success_children = afr_children_create (child_count); + if (NULL == success_children) + goto out; + local->cont.lookup.success_children = success_children; + + local->fresh_children = afr_children_create (child_count); + if (NULL == local->fresh_children) + goto out; - uint64_t ctx; + sources = GF_CALLOC (sizeof (*sources), child_count, gf_afr_mt_int32_t); + if (NULL == sources) + goto out; + local->cont.lookup.sources = sources; - int32_t op_errno = 0; + pending_matrix = afr_matrix_create (child_count, child_count); + if (NULL == pending_matrix) + goto out; + local->cont.lookup.pending_matrix = pending_matrix; + ret = 0; +out: + return ret; +} + +int +afr_lookup (call_frame_t *frame, xlator_t *this, + loc_t *loc, dict_t *xattr_req) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + void *gfid_req = NULL; + int ret = -1; + int i = 0; + int call_count = 0; + uint64_t ctx = 0; + int32_t op_errno = 0; + int allow_sh = 0; priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (local, out); local->op_ret = -1; frame->local = local; + local->fop = GF_FOP_LOOKUP; - if (!strcmp (loc->path, "/" GF_REPLICATE_TRASH_DIR)) { - op_errno = ENOENT; + loc_copy (&local->loc, loc); + ret = loc_path (&local->loc, NULL); + if (ret < 0) { + op_errno = EINVAL; goto out; } - loc_copy (&local->loc, loc); + if (local->loc.path && + (strcmp (local->loc.path, "/" GF_REPLICATE_TRASH_DIR) == 0)) { + op_errno = EPERM; + ret = -1; + goto out; + } - ret = inode_ctx_get (loc->inode, this, &ctx); + ret = inode_ctx_get (local->loc.inode, this, &ctx); if (ret == 0) { /* lookup is a revalidate */ - callback = afr_revalidate_lookup_cbk; - - local->cont.lookup.is_revalidate = _gf_true; - local->read_child_index = afr_read_child (this, - loc->inode); + local->read_child_index = afr_inode_get_read_ctx (this, + local->loc.inode, + NULL); } else { - callback = afr_fresh_lookup_cbk; - LOCK (&priv->read_child_lock); { - local->read_child_index = (++priv->read_child_rr) - % (priv->child_count); + if (priv->hash_mode) { + local->read_child_index = -1; + } + else { + local->read_child_index = + (++priv->read_child_rr) % + (priv->child_count); + } } UNLOCK (&priv->read_child_lock); + local->cont.lookup.fresh_lookup = _gf_true; } - if (loc->parent) - local->cont.lookup.parent_ino = loc->parent->ino; - - local->child_up = memdup (priv->child_up, priv->child_count); + local->child_up = memdup (priv->child_up, + sizeof (*local->child_up) * priv->child_count); + if (NULL == local->child_up) { + op_errno = ENOMEM; + goto out; + } - local->cont.lookup.xattrs = GF_CALLOC (priv->child_count, - sizeof (*local->cont.lookup.xattr), - gf_afr_mt_dict_t); + ret = afr_lookup_cont_init (local, priv->child_count); + if (ret < 0) { + op_errno = -ret; + goto out; + } - local->call_count = afr_up_children_count (priv->child_count, - local->child_up); + local->call_count = afr_up_children_count (local->child_up, + priv->child_count); call_count = local->call_count; - if (local->call_count == 0) { ret = -1; op_errno = ENOTCONN; @@ -1083,38 +2500,33 @@ afr_lookup (call_frame_t *frame, xlator_t *this, /* By default assume ENOTCONN. On success it will be set to 0. */ local->op_errno = ENOTCONN; - if (xattr_req == NULL) - local->xattr_req = dict_new (); - else - local->xattr_req = dict_ref (xattr_req); - - for (i = 0; i < priv->child_count; i++) { - ret = dict_set_uint64 (local->xattr_req, priv->pending_key[i], - 3 * sizeof(int32_t)); - if (ret < 0) - gf_log (this->name, GF_LOG_WARNING, - "Unable to set dict value."); - /* 3 = data+metadata+entry */ - } + ret = dict_get_int32 (xattr_req, "allow-sh-for-running-transaction", + &allow_sh); + dict_del (xattr_req, "allow-sh-for-running-transaction"); + local->allow_sh_for_running_transaction = allow_sh; - ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_INODELK_COUNT, 0); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Unable to set dict value."); + ret = afr_lookup_xattr_req_prepare (local, this, xattr_req, &local->loc, + &gfid_req); + if (ret) { + local->op_errno = -ret; + goto out; } - - ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_ENTRYLK_COUNT, 0); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Unable to set dict value."); + afr_lookup_save_gfid (local->cont.lookup.gfid_req, gfid_req, + &local->loc); + local->fop = GF_FOP_LOOKUP; + if (priv->choose_local && !priv->did_discovery) { + if (gfid_req && __is_root_gfid(gfid_req)) { + local->do_discovery = _gf_true; + priv->did_discovery = _gf_true; + } } - for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, callback, (void *) (long) i, + STACK_WIND_COOKIE (frame, afr_lookup_cbk, + (void *) (long) i, priv->children[i], priv->children[i]->fops->lookup, - loc, local->xattr_req); + &local->loc, local->xattr_req); if (!--call_count) break; } @@ -1122,7 +2534,7 @@ afr_lookup (call_frame_t *frame, xlator_t *this, ret = 0; out: - if (ret == -1) + if (ret) AFR_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); @@ -1133,231 +2545,179 @@ out: /* {{{ open */ int -afr_fd_ctx_set (xlator_t *this, fd_t *fd) +__afr_fd_ctx_set (xlator_t *this, fd_t *fd) { - afr_private_t * priv = NULL; - - int ret = 0; - - uint64_t ctx; - afr_fd_ctx_t * fd_ctx = NULL; + afr_private_t * priv = NULL; + int ret = -1; + uint64_t ctx = 0; + afr_fd_ctx_t * fd_ctx = NULL; VALIDATE_OR_GOTO (this->private, out); VALIDATE_OR_GOTO (fd, out); priv = this->private; - LOCK (&fd->lock); - { - ret = __fd_ctx_get (fd, this, &ctx); + ret = __fd_ctx_get (fd, this, &ctx); - if (ret == 0) - goto unlock; - - fd_ctx = GF_CALLOC (1, sizeof (afr_fd_ctx_t), - gf_afr_mt_afr_fd_ctx_t); - if (!fd_ctx) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); + if (ret == 0) + goto out; - ret = -ENOMEM; - goto unlock; - } + fd_ctx = GF_CALLOC (1, sizeof (afr_fd_ctx_t), + gf_afr_mt_afr_fd_ctx_t); + if (!fd_ctx) { + ret = -ENOMEM; + goto out; + } - fd_ctx->pre_op_done = GF_CALLOC (sizeof (*fd_ctx->pre_op_done), - priv->child_count, - gf_afr_mt_char); - if (!fd_ctx->pre_op_done) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - ret = -ENOMEM; - goto unlock; - } + fd_ctx->pre_op_done = GF_CALLOC (sizeof (*fd_ctx->pre_op_done), + priv->child_count, + gf_afr_mt_char); + if (!fd_ctx->pre_op_done) { + ret = -ENOMEM; + goto out; + } - fd_ctx->pre_op_piggyback = GF_CALLOC (sizeof (*fd_ctx->pre_op_piggyback), - priv->child_count, - gf_afr_mt_char); - if (!fd_ctx->pre_op_piggyback) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - ret = -ENOMEM; - goto unlock; - } + fd_ctx->pre_op_piggyback = GF_CALLOC (sizeof (*fd_ctx->pre_op_piggyback), + priv->child_count, + gf_afr_mt_char); + if (!fd_ctx->pre_op_piggyback) { + ret = -ENOMEM; + goto out; + } - fd_ctx->opened_on = GF_CALLOC (sizeof (*fd_ctx->opened_on), - priv->child_count, - gf_afr_mt_char); - if (!fd_ctx->opened_on) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - ret = -ENOMEM; - goto unlock; - } + fd_ctx->opened_on = GF_CALLOC (sizeof (*fd_ctx->opened_on), + priv->child_count, + gf_afr_mt_int32_t); + if (!fd_ctx->opened_on) { + ret = -ENOMEM; + goto out; + } - fd_ctx->up_count = priv->up_count; - fd_ctx->down_count = priv->down_count; + fd_ctx->lock_piggyback = GF_CALLOC (sizeof (*fd_ctx->lock_piggyback), + priv->child_count, + gf_afr_mt_char); + if (!fd_ctx->lock_piggyback) { + ret = -ENOMEM; + goto out; + } - fd_ctx->locked_on = GF_CALLOC (sizeof (*fd_ctx->locked_on), - priv->child_count, - gf_afr_mt_char); - if (!fd_ctx->locked_on) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - ret = -ENOMEM; - goto unlock; - } + fd_ctx->lock_acquired = GF_CALLOC (sizeof (*fd_ctx->lock_acquired), + priv->child_count, + gf_afr_mt_char); + if (!fd_ctx->lock_acquired) { + ret = -ENOMEM; + goto out; + } - ret = __fd_ctx_set (fd, this, (uint64_t)(long) fd_ctx); + fd_ctx->up_count = priv->up_count; + fd_ctx->down_count = priv->down_count; - INIT_LIST_HEAD (&fd_ctx->entries); + fd_ctx->locked_on = GF_CALLOC (sizeof (*fd_ctx->locked_on), + priv->child_count, + gf_afr_mt_char); + if (!fd_ctx->locked_on) { + ret = -ENOMEM; + goto out; } -unlock: - UNLOCK (&fd->lock); + + pthread_mutex_init (&fd_ctx->delay_lock, NULL); + INIT_LIST_HEAD (&fd_ctx->entries); + fd_ctx->call_child = -1; + + INIT_LIST_HEAD (&fd_ctx->eager_locked); + + ret = __fd_ctx_set (fd, this, (uint64_t)(long) fd_ctx); + if (ret) + gf_log (this->name, GF_LOG_DEBUG, + "failed to set fd ctx (%p)", fd); out: return ret; } -/* {{{ flush */ int -afr_flush_unwind (call_frame_t *frame, xlator_t *this) +afr_fd_ctx_set (xlator_t *this, fd_t *fd) { - afr_local_t * local = NULL; - call_frame_t *main_frame = NULL; - - local = frame->local; + int ret = -1; - LOCK (&frame->lock); + LOCK (&fd->lock); { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (flush, main_frame, - local->op_ret, local->op_errno); + ret = __afr_fd_ctx_set (this, fd); } + UNLOCK (&fd->lock); - return 0; + return ret; } +/* {{{ flush */ int -afr_flush_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) +afr_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int call_count = -1; - int child_index = (long) cookie; - int need_unwind = 0; local = frame->local; - priv = this->private; LOCK (&frame->lock); { - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - if (op_ret != -1) { if (local->success_count == 0) { local->op_ret = op_ret; } local->success_count++; - - if (local->success_count == priv->wait_count) { - need_unwind = 1; - } } local->op_errno = op_errno; } UNLOCK (&frame->lock); - if (need_unwind) - afr_flush_unwind (frame, this); - - call_count = afr_frame_return (frame); + call_count = afr_frame_return (frame); - if (call_count == 0) { - local->transaction.resume (frame, this); - } + if (call_count == 0) + AFR_STACK_UNWIND(flush, frame, local->op_ret, + local->op_errno, NULL); return 0; } - -int -afr_flush_wind (call_frame_t *frame, xlator_t *this) +static int +afr_flush_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - int i = 0; - int call_count = -1; + int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; - local = frame->local; priv = this->private; - - call_count = afr_up_children_count (priv->child_count, local->child_up); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; + local = frame->local; + call_count = local->call_count; for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_flush_wind_cbk, + STACK_WIND_COOKIE (frame, afr_flush_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->flush, - local->fd); - + local->fd, NULL); if (!--call_count) break; + } } return 0; } - -int -afr_flush_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - - local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - - return 0; -} - - int -afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) +afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - - call_frame_t * transaction_frame = NULL; - - int ret = -1; - - int op_ret = -1; - int op_errno = 0; - - int call_count = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + call_stub_t *stub = NULL; + int ret = -1; + int op_errno = 0; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -1365,49 +2725,27 @@ afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - call_count = afr_up_children_count (priv->child_count, local->child_up); + ret = afr_local_init(local, priv, &op_errno); + if (ret < 0) + goto out; - transaction_frame = copy_frame (frame); - if (!transaction_frame) { + local->fd = fd_ref(fd); + stub = fop_flush_stub (frame, afr_flush_wrapper, fd, xdata); + if (!stub) { + ret = -1; op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); goto out; } - transaction_frame->local = local; - - local->op = GF_FOP_FLUSH; - - local->transaction.fop = afr_flush_wind; - local->transaction.done = afr_flush_done; - local->transaction.unwind = afr_flush_unwind; + afr_delayed_changelog_wake_resume (this, fd, stub); + ret = 0; - local->fd = fd_ref (fd); - - local->transaction.main_frame = frame; - local->transaction.start = 0; - local->transaction.len = 0; - - afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); - - - op_ret = 0; out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - - AFR_STACK_UNWIND (flush, frame, op_ret, op_errno); - } + if (ret < 0) + AFR_STACK_UNWIND(flush, frame, -1, op_errno, NULL); return 0; } @@ -1423,21 +2761,24 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd) int ret = 0; ret = fd_ctx_get (fd, this, &ctx); - if (ret < 0) goto out; fd_ctx = (afr_fd_ctx_t *)(long) ctx; if (fd_ctx) { - if (fd_ctx->pre_op_done) - GF_FREE (fd_ctx->pre_op_done); + GF_FREE (fd_ctx->pre_op_done); + + GF_FREE (fd_ctx->opened_on); - if (fd_ctx->opened_on) - GF_FREE (fd_ctx->opened_on); + GF_FREE (fd_ctx->locked_on); - if (fd_ctx->locked_on) - GF_FREE (fd_ctx->locked_on); + GF_FREE (fd_ctx->pre_op_piggyback); + GF_FREE (fd_ctx->lock_piggyback); + + GF_FREE (fd_ctx->lock_acquired); + + pthread_mutex_destroy (&fd_ctx->delay_lock); GF_FREE (fd_ctx); } @@ -1475,20 +2816,29 @@ afr_release (xlator_t *this, fd_t *fd) /* {{{ fsync */ int +afr_fsync_unwind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + AFR_STACK_UNWIND (fsync, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; +} + +int afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) + struct iatt *postbuf, dict_t *xdata) { afr_local_t *local = NULL; - int call_count = -1; - int child_index = (long) cookie; int read_child = 0; + call_stub_t *stub = NULL; local = frame->local; - read_child = afr_read_child (this, local->fd->inode); + read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); LOCK (&frame->lock); { @@ -1500,13 +2850,13 @@ afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->op_ret = 0; if (local->success_count == 0) { - local->cont.fsync.prebuf = *prebuf; - local->cont.fsync.postbuf = *postbuf; + local->cont.inode_wfop.prebuf = *prebuf; + local->cont.inode_wfop.postbuf = *postbuf; } if (child_index == read_child) { - local->cont.fsync.prebuf = *prebuf; - local->cont.fsync.postbuf = *postbuf; + local->cont.inode_wfop.prebuf = *prebuf; + local->cont.inode_wfop.postbuf = *postbuf; } local->success_count++; @@ -1519,12 +2869,32 @@ afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, call_count = afr_frame_return (frame); if (call_count == 0) { - local->cont.fsync.prebuf.ia_ino = local->cont.fsync.ino; - local->cont.fsync.postbuf.ia_ino = local->cont.fsync.ino; - - AFR_STACK_UNWIND (fsync, frame, local->op_ret, local->op_errno, - &local->cont.fsync.prebuf, - &local->cont.fsync.postbuf); + /* Make a stub out of the frame, and register it + with the waking up post-op. When the call-stub resumes, + we are guaranteed that there was no post-op pending + (i.e changelogs were unset in the server). This is an + essential "guarantee", that fsync() returns only after + completely finishing EVERYTHING, including the delayed + post-op. This guarantee is expected by FUSE graph switching + for example. + */ + stub = fop_fsync_cbk_stub (frame, afr_fsync_unwind_cbk, + local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, + xdata); + if (!stub) { + AFR_STACK_UNWIND (fsync, frame, -1, ENOMEM, 0, 0, 0); + return 0; + } + + /* If no new unstable writes happened between the + time we cleared the unstable write witness flag in afr_fsync + and now, calling afr_delayed_changelog_wake_up() should + wake up and skip over the fsync phase and go straight to + afr_changelog_post_op_now() + */ + afr_delayed_changelog_wake_resume (this, local->fd, stub); } return 0; @@ -1533,16 +2903,13 @@ afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t datasync) + int32_t datasync, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; - int i = 0; int32_t call_count = 0; - int32_t op_ret = -1; int32_t op_errno = 0; VALIDATE_OR_GOTO (frame, out); @@ -1551,19 +2918,20 @@ afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } call_count = local->call_count; - frame->local = local; local->fd = fd_ref (fd); - local->cont.fsync.ino = fd->inode->ino; + + if (afr_fd_has_witnessed_unstable_write (this, fd)) { + /* don't care. we only wanted to CLEAR the bit */ + } for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { @@ -1571,17 +2939,16 @@ afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, (void *) (long) i, priv->children[i], priv->children[i]->fops->fsync, - fd, datasync); + fd, datasync, xdata); if (!--call_count) break; } } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (fsync, frame, op_ret, op_errno, NULL, NULL); - } + if (ret < 0) + AFR_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL, NULL); return 0; } @@ -1591,10 +2958,10 @@ out: int32_t afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *xdata) { afr_local_t *local = NULL; - int call_count = -1; local = frame->local; @@ -1612,7 +2979,7 @@ afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, if (call_count == 0) AFR_STACK_UNWIND (fsyncdir, frame, local->op_ret, - local->op_errno); + local->op_errno, xdata); return 0; } @@ -1620,16 +2987,13 @@ afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, int32_t afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t datasync) + int32_t datasync, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; - int i = 0; int32_t call_count = 0; - int32_t op_ret = -1; int32_t op_errno = 0; VALIDATE_OR_GOTO (frame, out); @@ -1638,33 +3002,30 @@ afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } call_count = local->call_count; - frame->local = local; for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND (frame, afr_fsyncdir_cbk, priv->children[i], priv->children[i]->fops->fsyncdir, - fd, datasync); + fd, datasync, xdata); if (!--call_count) break; } } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (fsyncdir, frame, op_ret, op_errno); - } + if (ret < 0) + AFR_STACK_UNWIND (fsyncdir, frame, -1, op_errno, NULL); return 0; } @@ -1675,18 +3036,20 @@ out: int32_t afr_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *xattr) + dict_t *xattr, dict_t *xdata) { afr_local_t *local = NULL; - int call_count = -1; local = frame->local; LOCK (&frame->lock); { - if (op_ret == 0) + if (op_ret == 0) { + if (!local->cont.xattrop.xattr) + local->cont.xattrop.xattr = dict_ref (xattr); local->op_ret = 0; + } local->op_errno = op_errno; } @@ -1696,7 +3059,7 @@ afr_xattrop_cbk (call_frame_t *frame, void *cookie, if (call_count == 0) AFR_STACK_UNWIND (xattrop, frame, local->op_ret, local->op_errno, - xattr); + local->cont.xattrop.xattr, xdata); return 0; } @@ -1704,16 +3067,13 @@ afr_xattrop_cbk (call_frame_t *frame, void *cookie, int32_t afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, - gf_xattrop_flags_t optype, dict_t *xattr) + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; - int i = 0; int32_t call_count = 0; - int32_t op_ret = -1; int32_t op_errno = 0; VALIDATE_OR_GOTO (frame, out); @@ -1722,33 +3082,30 @@ afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } call_count = local->call_count; - frame->local = local; for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND (frame, afr_xattrop_cbk, priv->children[i], priv->children[i]->fops->xattrop, - loc, optype, xattr); + loc, optype, xattr, xdata); if (!--call_count) break; } } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (xattrop, frame, op_ret, op_errno, NULL); - } + if (ret < 0) + AFR_STACK_UNWIND (xattrop, frame, -1, op_errno, NULL, NULL); return 0; } @@ -1759,7 +3116,7 @@ out: int32_t afr_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *xattr) + dict_t *xattr, dict_t *xdata) { afr_local_t *local = NULL; @@ -1769,8 +3126,12 @@ afr_fxattrop_cbk (call_frame_t *frame, void *cookie, LOCK (&frame->lock); { - if (op_ret == 0) + if (op_ret == 0) { + if (!local->cont.fxattrop.xattr) + local->cont.fxattrop.xattr = dict_ref (xattr); + local->op_ret = 0; + } local->op_errno = op_errno; } @@ -1780,7 +3141,7 @@ afr_fxattrop_cbk (call_frame_t *frame, void *cookie, if (call_count == 0) AFR_STACK_UNWIND (fxattrop, frame, local->op_ret, local->op_errno, - xattr); + local->cont.fxattrop.xattr, xdata); return 0; } @@ -1788,16 +3149,13 @@ afr_fxattrop_cbk (call_frame_t *frame, void *cookie, int32_t afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, - gf_xattrop_flags_t optype, dict_t *xattr) + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; - int i = 0; int32_t call_count = 0; - int32_t op_ret = -1; int32_t op_errno = 0; VALIDATE_OR_GOTO (frame, out); @@ -1806,33 +3164,30 @@ afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } call_count = local->call_count; - frame->local = local; for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND (frame, afr_fxattrop_cbk, priv->children[i], priv->children[i]->fops->fxattrop, - fd, optype, xattr); + fd, optype, xattr, xdata); if (!--call_count) break; } } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (fxattrop, frame, op_ret, op_errno, NULL); - } + if (ret < 0) + AFR_STACK_UNWIND (fxattrop, frame, -1, op_errno, NULL, NULL); return 0; } @@ -1841,11 +3196,10 @@ out: int32_t afr_inodelk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) + xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t *local = NULL; - int call_count = -1; local = frame->local; @@ -1863,7 +3217,7 @@ afr_inodelk_cbk (call_frame_t *frame, void *cookie, if (call_count == 0) AFR_STACK_UNWIND (inodelk, frame, local->op_ret, - local->op_errno); + local->op_errno, xdata); return 0; } @@ -1871,16 +3225,14 @@ afr_inodelk_cbk (call_frame_t *frame, void *cookie, int32_t afr_inodelk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, int32_t cmd, struct gf_flock *flock) + const char *volume, loc_t *loc, int32_t cmd, + struct gf_flock *flock, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; - int i = 0; int32_t call_count = 0; - int32_t op_ret = -1; int32_t op_errno = 0; VALIDATE_OR_GOTO (frame, out); @@ -1889,45 +3241,42 @@ afr_inodelk (call_frame_t *frame, xlator_t *this, priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } call_count = local->call_count; - frame->local = local; for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND (frame, afr_inodelk_cbk, priv->children[i], priv->children[i]->fops->inodelk, - volume, loc, cmd, flock); + volume, loc, cmd, flock, xdata); if (!--call_count) break; } } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (inodelk, frame, op_ret, op_errno); - } + if (ret < 0) + AFR_STACK_UNWIND (inodelk, frame, -1, op_errno, NULL); return 0; } int32_t afr_finodelk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *xdata) { afr_local_t *local = NULL; - int call_count = -1; local = frame->local; @@ -1945,7 +3294,7 @@ afr_finodelk_cbk (call_frame_t *frame, void *cookie, if (call_count == 0) AFR_STACK_UNWIND (finodelk, frame, local->op_ret, - local->op_errno); + local->op_errno, xdata); return 0; } @@ -1953,16 +3302,14 @@ afr_finodelk_cbk (call_frame_t *frame, void *cookie, int32_t afr_finodelk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *flock) + const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *flock, + dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; - int i = 0; int32_t call_count = 0; - int32_t op_ret = -1; int32_t op_errno = 0; VALIDATE_OR_GOTO (frame, out); @@ -1971,45 +3318,40 @@ afr_finodelk (call_frame_t *frame, xlator_t *this, priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } call_count = local->call_count; - frame->local = local; for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND (frame, afr_finodelk_cbk, priv->children[i], priv->children[i]->fops->finodelk, - volume, fd, cmd, flock); + volume, fd, cmd, flock, xdata); if (!--call_count) break; } } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (finodelk, frame, op_ret, op_errno); - } + if (ret < 0) + AFR_STACK_UNWIND (finodelk, frame, -1, op_errno, NULL); return 0; } int32_t -afr_entrylk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) - +afr_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t *local = NULL; - int call_count = -1; local = frame->local; @@ -2027,7 +3369,7 @@ afr_entrylk_cbk (call_frame_t *frame, void *cookie, if (call_count == 0) AFR_STACK_UNWIND (entrylk, frame, local->op_ret, - local->op_errno); + local->op_errno, xdata); return 0; } @@ -2036,16 +3378,14 @@ afr_entrylk_cbk (call_frame_t *frame, void *cookie, int32_t afr_entrylk (call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, - const char *basename, entrylk_cmd cmd, entrylk_type type) + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; - int i = 0; int32_t call_count = 0; - int32_t op_ret = -1; int32_t op_errno = 0; VALIDATE_OR_GOTO (frame, out); @@ -2054,34 +3394,31 @@ afr_entrylk (call_frame_t *frame, xlator_t *this, priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } call_count = local->call_count; - frame->local = local; for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND (frame, afr_entrylk_cbk, priv->children[i], priv->children[i]->fops->entrylk, - volume, loc, basename, cmd, type); + volume, loc, basename, cmd, type, xdata); if (!--call_count) break; } } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (entrylk, frame, op_ret, op_errno); - } + if (ret < 0) + AFR_STACK_UNWIND (entrylk, frame, -1, op_errno, NULL); return 0; } @@ -2089,11 +3426,10 @@ out: int32_t afr_fentrylk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) + xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t *local = NULL; - int call_count = -1; local = frame->local; @@ -2111,7 +3447,7 @@ afr_fentrylk_cbk (call_frame_t *frame, void *cookie, if (call_count == 0) AFR_STACK_UNWIND (fentrylk, frame, local->op_ret, - local->op_errno); + local->op_errno, xdata); return 0; } @@ -2120,16 +3456,14 @@ afr_fentrylk_cbk (call_frame_t *frame, void *cookie, int32_t afr_fentrylk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, - const char *basename, entrylk_cmd cmd, entrylk_type type) + const char *basename, entrylk_cmd cmd, + entrylk_type type, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; - int i = 0; int32_t call_count = 0; - int32_t op_ret = -1; int32_t op_errno = 0; VALIDATE_OR_GOTO (frame, out); @@ -2138,44 +3472,40 @@ afr_fentrylk (call_frame_t *frame, xlator_t *this, priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } call_count = local->call_count; - frame->local = local; for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND (frame, afr_fentrylk_cbk, priv->children[i], priv->children[i]->fops->fentrylk, - volume, fd, basename, cmd, type); + volume, fd, basename, cmd, type, xdata); if (!--call_count) break; } } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (fentrylk, frame, op_ret, op_errno); - } + if (ret < 0) + AFR_STACK_UNWIND (fentrylk, frame, -1, op_errno, NULL); return 0; } int32_t afr_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - struct statvfs *statvfs) + struct statvfs *statvfs, dict_t *xdata) { afr_local_t *local = NULL; - int call_count = 0; LOCK (&frame->lock); @@ -2204,7 +3534,7 @@ afr_statfs_cbk (call_frame_t *frame, void *cookie, if (call_count == 0) AFR_STACK_UNWIND (statfs, frame, local->op_ret, local->op_errno, - &local->cont.statfs.buf); + &local->cont.statfs.buf, xdata); return 0; } @@ -2212,16 +3542,14 @@ afr_statfs_cbk (call_frame_t *frame, void *cookie, int32_t afr_statfs (call_frame_t *frame, xlator_t *this, - loc_t *loc) + loc_t *loc, dict_t *xdata) { afr_private_t * priv = NULL; int child_count = 0; afr_local_t * local = NULL; int i = 0; - - int ret = -1; + int ret = -1; int call_count = 0; - int32_t op_ret = -1; int32_t op_errno = 0; VALIDATE_OR_GOTO (this, out); @@ -2231,15 +3559,13 @@ afr_statfs (call_frame_t *frame, xlator_t *this, priv = this->private; child_count = priv->child_count; - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } - frame->local = local; call_count = local->call_count; for (i = 0; i < child_count; i++) { @@ -2247,27 +3573,26 @@ afr_statfs (call_frame_t *frame, xlator_t *this, STACK_WIND (frame, afr_statfs_cbk, priv->children[i], priv->children[i]->fops->statfs, - loc); + loc, xdata); if (!--call_count) break; } } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (statfs, frame, op_ret, op_errno, NULL); - } + if (ret < 0) + AFR_STACK_UNWIND (statfs, frame, -1, op_errno, NULL, NULL); return 0; } int32_t afr_lk_unlock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct gf_flock *lock) + int32_t op_ret, int32_t op_errno, struct gf_flock *lock, + dict_t *xdata) { afr_local_t * local = NULL; - int call_count = -1; local = frame->local; @@ -2275,7 +3600,7 @@ afr_lk_unlock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (call_count == 0) AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno, - lock); + lock, xdata); return 0; } @@ -2286,8 +3611,7 @@ afr_lk_unlock (call_frame_t *frame, xlator_t *this) { afr_local_t * local = NULL; afr_private_t * priv = NULL; - - int i; + int i = 0; int call_count = 0; local = frame->local; @@ -2298,7 +3622,7 @@ afr_lk_unlock (call_frame_t *frame, xlator_t *this) if (call_count == 0) { AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno, - &local->cont.lk.ret_flock); + &local->cont.lk.ret_flock, NULL); return 0; } @@ -2312,7 +3636,7 @@ afr_lk_unlock (call_frame_t *frame, xlator_t *this) priv->children[i], priv->children[i]->fops->lk, local->fd, F_SETLK, - &local->cont.lk.user_flock); + &local->cont.lk.user_flock, NULL); if (!--call_count) break; @@ -2325,13 +3649,13 @@ afr_lk_unlock (call_frame_t *frame, xlator_t *this) int32_t afr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct gf_flock *lock) + int32_t op_ret, int32_t op_errno, struct gf_flock *lock, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int ret = 0; - + afr_local_t *local = NULL; + afr_private_t *priv = NULL; int child_index = -1; +/* int ret = 0; */ + local = frame->local; priv = this->private; @@ -2360,28 +3684,30 @@ afr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, priv->children[child_index], priv->children[child_index]->fops->lk, local->fd, local->cont.lk.cmd, - &local->cont.lk.user_flock); + &local->cont.lk.user_flock, xdata); } else if (local->op_ret == -1) { /* all nodes have gone down */ AFR_STACK_UNWIND (lk, frame, -1, ENOTCONN, - &local->cont.lk.ret_flock); + &local->cont.lk.ret_flock, NULL); } else { /* locking has succeeded on all nodes that are up */ - ret = afr_mark_locked_nodes (this, local->fd, - local->cont.lk.locked_nodes); - if (ret) - gf_log (this->name, GF_LOG_DEBUG, - "Could not save locked nodes info in fdctx"); + /* temporarily + ret = afr_mark_locked_nodes (this, local->fd, + local->cont.lk.locked_nodes); + if (ret) + gf_log (this->name, GF_LOG_DEBUG, + "Could not save locked nodes info in fdctx"); - ret = afr_save_locked_fd (this, local->fd); - if (ret) - gf_log (this->name, GF_LOG_DEBUG, - "Could not save locked fd"); + ret = afr_save_locked_fd (this, local->fd); + if (ret) + gf_log (this->name, GF_LOG_DEBUG, + "Could not save locked fd"); - AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno, - &local->cont.lk.ret_flock); + */ + AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno, + &local->cont.lk.ret_flock, NULL); } return 0; @@ -2390,16 +3716,13 @@ afr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int afr_lk (call_frame_t *frame, xlator_t *this, - fd_t *fd, int32_t cmd, - struct gf_flock *flock) + fd_t *fd, int32_t cmd, struct gf_flock *flock, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int i = 0; - - int32_t op_ret = -1; int32_t op_errno = 0; + int ret = -1; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -2407,17 +3730,18 @@ afr_lk (call_frame_t *frame, xlator_t *this, priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); - AFR_LOCAL_INIT (local, priv); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; - frame->local = local; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) + goto out; local->cont.lk.locked_nodes = GF_CALLOC (priv->child_count, - sizeof (*local->cont.lk.locked_nodes), - gf_afr_mt_char); + sizeof (*local->cont.lk.locked_nodes), + gf_afr_mt_char); if (!local->cont.lk.locked_nodes) { - gf_log (this->name, GF_LOG_ERROR, "Out of memory"); op_errno = ENOMEM; goto out; } @@ -2430,13 +3754,30 @@ afr_lk (call_frame_t *frame, xlator_t *this, STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) 0, priv->children[i], priv->children[i]->fops->lk, - fd, cmd, flock); + fd, cmd, flock, xdata); - op_ret = 0; + ret = 0; +out: + if (ret < 0) + AFR_STACK_UNWIND (lk, frame, -1, op_errno, NULL, NULL); + return 0; +} + +int +afr_forget (xlator_t *this, inode_t *inode) +{ + uint64_t ctx_addr = 0; + afr_inode_ctx_t *ctx = NULL; + + inode_ctx_get (inode, this, &ctx_addr); + + if (!ctx_addr) + goto out; + + ctx = (afr_inode_ctx_t *)(long)ctx_addr; + GF_FREE (ctx->fresh_children); + GF_FREE (ctx); out: - if (op_ret == -1) { - AFR_STACK_UNWIND (lk, frame, op_ret, op_errno, NULL); - } return 0; } @@ -2449,47 +3790,29 @@ afr_priv_dump (xlator_t *this) int i = 0; - assert(this); + GF_ASSERT (this); priv = this->private; - assert(priv); + GF_ASSERT (priv); snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, this->name); gf_proc_dump_add_section(key_prefix); - gf_proc_dump_build_key(key, key_prefix, "child_count"); - gf_proc_dump_write(key, "%u", priv->child_count); - gf_proc_dump_build_key(key, key_prefix, "read_child_rr"); - gf_proc_dump_write(key, "%u", priv->read_child_rr); + gf_proc_dump_write("child_count", "%u", priv->child_count); + gf_proc_dump_write("read_child_rr", "%u", priv->read_child_rr); for (i = 0; i < priv->child_count; i++) { - gf_proc_dump_build_key(key, key_prefix, "child_up[%d]", i); + sprintf (key, "child_up[%d]", i); gf_proc_dump_write(key, "%d", priv->child_up[i]); - gf_proc_dump_build_key(key, key_prefix, - "pending_key[%d]", i); + sprintf (key, "pending_key[%d]", i); gf_proc_dump_write(key, "%s", priv->pending_key[i]); } - gf_proc_dump_build_key(key, key_prefix, "data_self_heal"); - gf_proc_dump_write(key, "%d", priv->data_self_heal); - gf_proc_dump_build_key(key, key_prefix, "metadata_self_heal"); - gf_proc_dump_write(key, "%d", priv->metadata_self_heal); - gf_proc_dump_build_key(key, key_prefix, "entry_self_heal"); - gf_proc_dump_write(key, "%d", priv->entry_self_heal); - gf_proc_dump_build_key(key, key_prefix, "data_change_log"); - gf_proc_dump_write(key, "%d", priv->data_change_log); - gf_proc_dump_build_key(key, key_prefix, "metadata_change_log"); - gf_proc_dump_write(key, "%d", priv->metadata_change_log); - gf_proc_dump_build_key(key, key_prefix, "entry_change_log"); - gf_proc_dump_write(key, "%d", priv->entry_change_log); - gf_proc_dump_build_key(key, key_prefix, "read_child"); - gf_proc_dump_write(key, "%d", priv->read_child); - gf_proc_dump_build_key(key, key_prefix, "favorite_child"); - gf_proc_dump_write(key, "%u", priv->favorite_child); - gf_proc_dump_build_key(key, key_prefix, "data_lock_server_count"); - gf_proc_dump_write(key, "%u", priv->data_lock_server_count); - gf_proc_dump_build_key(key, key_prefix, "metadata_lock_server_count"); - gf_proc_dump_write(key, "%u", priv->metadata_lock_server_count); - gf_proc_dump_build_key(key, key_prefix, "entry_lock_server_count"); - gf_proc_dump_write(key, "%u", priv->entry_lock_server_count); - gf_proc_dump_build_key(key, key_prefix, "wait_count"); - gf_proc_dump_write(key, "%u", priv->wait_count); + gf_proc_dump_write("data_self_heal", "%s", priv->data_self_heal); + gf_proc_dump_write("metadata_self_heal", "%d", priv->metadata_self_heal); + gf_proc_dump_write("entry_self_heal", "%d", priv->entry_self_heal); + gf_proc_dump_write("data_change_log", "%d", priv->data_change_log); + gf_proc_dump_write("metadata_change_log", "%d", priv->metadata_change_log); + gf_proc_dump_write("entry-change_log", "%d", priv->entry_change_log); + gf_proc_dump_write("read_child", "%d", priv->read_child); + gf_proc_dump_write("favorite_child", "%d", priv->favorite_child); + gf_proc_dump_write("wait_count", "%u", priv->wait_count); return 0; } @@ -2505,7 +3828,6 @@ static int find_child_index (xlator_t *this, xlator_t *child) { afr_private_t *priv = NULL; - int i = -1; priv = this->private; @@ -2520,91 +3842,750 @@ find_child_index (xlator_t *this, xlator_t *child) int32_t afr_notify (xlator_t *this, int32_t event, - void *data, ...) + void *data, void *data2) { - afr_private_t * priv = NULL; - unsigned char * child_up = NULL; - - int i = -1; - int up_children = 0; + afr_private_t *priv = NULL; + int i = -1; + int up_children = 0; + int down_children = 0; + int propagate = 0; + int had_heard_from_all = 0; + int have_heard_from_all = 0; + int idx = -1; + int ret = -1; + int call_psh = 0; + int up_child = AFR_ALL_CHILDREN; + dict_t *input = NULL; + dict_t *output = NULL; priv = this->private; if (!priv) return 0; - child_up = priv->child_up; + /* + * We need to reset this in case children come up in "staggered" + * fashion, so that we discover a late-arriving local subvolume. Note + * that we could end up issuing N lookups to the first subvolume, and + * O(N^2) overall, but N is small for AFR so it shouldn't be an issue. + */ + priv->did_discovery = _gf_false; + + had_heard_from_all = 1; + for (i = 0; i < priv->child_count; i++) { + if (!priv->last_event[i]) { + had_heard_from_all = 0; + } + } + + /* parent xlators dont need to know about every child_up, child_down + * because of afr ha. If all subvolumes go down, child_down has + * to be triggered. In that state when 1 subvolume comes up child_up + * needs to be triggered. dht optimizes revalidate lookup by sending + * it only to one of its subvolumes. When child up/down happens + * for afr's subvolumes dht should be notified by child_modified. The + * subsequent revalidate lookup happens on all the dht's subvolumes + * which triggers afr self-heals if any. + */ + idx = find_child_index (this, data); + if (idx < 0) { + gf_log (this->name, GF_LOG_ERROR, "Received child_up " + "from invalid subvolume"); + goto out; + } switch (event) { case GF_EVENT_CHILD_UP: - i = find_child_index (this, data); - - gf_log (this->name, GF_LOG_DEBUG, - "child=%d up", i); + LOCK (&priv->lock); + { + /* + * This only really counts if the child was never up + * (value = -1) or had been down (value = 0). See + * comment at GF_EVENT_CHILD_DOWN for a more detailed + * explanation. + */ + if (priv->child_up[idx] != 1) { + priv->up_count++; + } + priv->child_up[idx] = 1; + + call_psh = 1; + up_child = idx; + for (i = 0; i < priv->child_count; i++) + if (priv->child_up[i] == 1) + up_children++; + if (up_children == 1) { + gf_log (this->name, GF_LOG_INFO, + "Subvolume '%s' came back up; " + "going online.", ((xlator_t *)data)->name); + } else { + event = GF_EVENT_CHILD_MODIFIED; + } - afr_attempt_lock_recovery (this, i); + priv->last_event[idx] = event; + } + UNLOCK (&priv->lock); - child_up[i] = 1; + break; + case GF_EVENT_CHILD_DOWN: LOCK (&priv->lock); { - priv->up_count++; + /* + * If a brick is down when we start, we'll get a + * CHILD_DOWN to indicate its initial state. There + * was never a CHILD_UP in this case, so if we + * increment "down_count" the difference between than + * and "up_count" will no longer be the number of + * children that are currently up. This has serious + * implications e.g. for quorum enforcement, so we + * don't increment these values unless the event + * represents an actual state transition between "up" + * (value = 1) and anything else. + */ + if (priv->child_up[idx] == 1) { + priv->down_count++; + } + priv->child_up[idx] = 0; + + for (i = 0; i < priv->child_count; i++) + if (priv->child_up[i] == 0) + down_children++; + if (down_children == priv->child_count) { + gf_log (this->name, GF_LOG_ERROR, + "All subvolumes are down. Going offline " + "until atleast one of them comes back up."); + } else { + event = GF_EVENT_CHILD_MODIFIED; + } + + priv->last_event[idx] = event; } UNLOCK (&priv->lock); - /* - if all the children were down, and one child came up, - send notify to parent - */ + break; - for (i = 0; i < priv->child_count; i++) - if (child_up[i]) - up_children++; + case GF_EVENT_CHILD_CONNECTING: + LOCK (&priv->lock); + { + priv->last_event[idx] = event; + } + UNLOCK (&priv->lock); - if (up_children == 1) { - gf_log (this->name, GF_LOG_NORMAL, - "Subvolume '%s' came back up; " - "going online.", ((xlator_t *)data)->name); + break; - default_notify (this, event, data); - } + case GF_EVENT_TRANSLATOR_OP: + input = data; + output = data2; + ret = afr_xl_op (this, input, output); + goto out; + break; + default: + propagate = 1; break; + } - case GF_EVENT_CHILD_DOWN: - i = find_child_index (this, data); + /* have all subvolumes reported status once by now? */ + have_heard_from_all = 1; + for (i = 0; i < priv->child_count; i++) { + if (!priv->last_event[i]) + have_heard_from_all = 0; + } - child_up[i] = 0; + /* if all subvols have reported status, no need to hide anything + or wait for anything else. Just propagate blindly */ + if (have_heard_from_all) + propagate = 1; + + if (!had_heard_from_all && have_heard_from_all) { + /* This is the first event which completes aggregation + of events from all subvolumes. If at least one subvol + had come up, propagate CHILD_UP, but only this time + */ + event = GF_EVENT_CHILD_DOWN; LOCK (&priv->lock); { - priv->down_count++; + up_children = afr_up_children_count (priv->child_up, + priv->child_count); + for (i = 0; i < priv->child_count; i++) { + if (priv->last_event[i] == GF_EVENT_CHILD_UP) { + event = GF_EVENT_CHILD_UP; + break; + } + + if (priv->last_event[i] == + GF_EVENT_CHILD_CONNECTING) { + event = GF_EVENT_CHILD_CONNECTING; + /* continue to check other events for CHILD_UP */ + } + } } UNLOCK (&priv->lock); + } - /* - if all children are down, and this was the last to go down, - send notify to parent - */ + ret = 0; + if (propagate) + ret = default_notify (this, event, data); + if (call_psh && priv->shd.iamshd) + afr_proactive_self_heal ((void*) (long) up_child); + +out: + return ret; +} + +int +afr_first_up_child (unsigned char *child_up, size_t child_count) +{ + int ret = -1; + int i = 0; + + GF_ASSERT (child_up); + + for (i = 0; i < child_count; i++) { + if (child_up[i]) { + ret = i; + break; + } + } + + return ret; +} + +int +afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno) +{ + int ret = -1; + + local->op_ret = -1; + local->op_errno = EUCLEAN; + + local->child_up = GF_CALLOC (priv->child_count, + sizeof (*local->child_up), + gf_afr_mt_char); + if (!local->child_up) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } + + memcpy (local->child_up, priv->child_up, + sizeof (*local->child_up) * priv->child_count); + local->call_count = afr_up_children_count (local->child_up, + priv->child_count); + if (local->call_count == 0) { + gf_log (THIS->name, GF_LOG_INFO, "no subvolumes up"); + if (op_errno) + *op_errno = ENOTCONN; + goto out; + } + + local->child_errno = GF_CALLOC (priv->child_count, + sizeof (*local->child_errno), + gf_afr_mt_int32_t); + if (!local->child_errno) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } + + local->transaction.postop_piggybacked = GF_CALLOC (priv->child_count, + sizeof (int), + gf_afr_mt_int32_t); + if (!local->transaction.postop_piggybacked) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } + + local->append_write = _gf_false; + + ret = 0; +out: + return ret; +} + +int +afr_internal_lock_init (afr_internal_lock_t *lk, size_t child_count, + transaction_lk_type_t lk_type) +{ + int ret = -ENOMEM; + + lk->locked_nodes = GF_CALLOC (sizeof (*lk->locked_nodes), + child_count, gf_afr_mt_char); + if (NULL == lk->locked_nodes) + goto out; + + lk->lower_locked_nodes = GF_CALLOC (sizeof (*lk->lower_locked_nodes), + child_count, gf_afr_mt_char); + if (NULL == lk->lower_locked_nodes) + goto out; + + lk->lock_op_ret = -1; + lk->lock_op_errno = EUCLEAN; + lk->transaction_lk_type = lk_type; + + ret = 0; +out: + return ret; +} + +void +afr_matrix_cleanup (int32_t **matrix, unsigned int m) +{ + int i = 0; + + if (!matrix) + goto out; + for (i = 0; i < m; i++) { + GF_FREE (matrix[i]); + } + + GF_FREE (matrix); +out: + return; +} + +int32_t** +afr_matrix_create (unsigned int m, unsigned int n) +{ + int32_t **matrix = NULL; + int i = 0; + + matrix = GF_CALLOC (sizeof (*matrix), m, gf_afr_mt_int32_t); + if (!matrix) + goto out; + + for (i = 0; i < m; i++) { + matrix[i] = GF_CALLOC (sizeof (*matrix[i]), n, + gf_afr_mt_int32_t); + if (!matrix[i]) + goto out; + } + return matrix; +out: + afr_matrix_cleanup (matrix, m); + return NULL; +} + +int +afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count) +{ + int ret = -ENOMEM; + + lk->domain = dom; + lk->locked_nodes = GF_CALLOC (sizeof (*lk->locked_nodes), + child_count, gf_afr_mt_char); + if (NULL == lk->locked_nodes) + goto out; + ret = 0; +out: + return ret; +} + +int +afr_transaction_local_init (afr_local_t *local, xlator_t *this) +{ + int child_up_count = 0; + int ret = -ENOMEM; + afr_private_t *priv = NULL; + + priv = this->private; + ret = afr_internal_lock_init (&local->internal_lock, priv->child_count, + AFR_TRANSACTION_LK); + if (ret < 0) + goto out; + if ((local->transaction.type == AFR_DATA_TRANSACTION) || + (local->transaction.type == AFR_METADATA_TRANSACTION)) { + ret = afr_inodelk_init (&local->internal_lock.inodelk[0], + this->name, priv->child_count); + if (ret < 0) + goto out; + } + + ret = -ENOMEM; + child_up_count = afr_up_children_count (local->child_up, + priv->child_count); + if (priv->optimistic_change_log && child_up_count == priv->child_count) + local->optimistic_change_log = 1; + + local->first_up_child = afr_first_up_child (local->child_up, + priv->child_count); + + local->transaction.eager_lock = + GF_CALLOC (sizeof (*local->transaction.eager_lock), + priv->child_count, + gf_afr_mt_int32_t); + + if (!local->transaction.eager_lock) + goto out; + + local->fresh_children = afr_children_create (priv->child_count); + if (!local->fresh_children) + goto out; + + local->transaction.pre_op = GF_CALLOC (sizeof (*local->transaction.pre_op), + priv->child_count, + gf_afr_mt_char); + if (!local->transaction.pre_op) + goto out; + + local->pending = afr_matrix_create (priv->child_count, + AFR_NUM_CHANGE_LOGS); + if (!local->pending) + goto out; + + local->transaction.txn_changelog = afr_matrix_create (priv->child_count, + AFR_NUM_CHANGE_LOGS); + if (!local->transaction.txn_changelog) + goto out; + + INIT_LIST_HEAD (&local->transaction.eager_locked); + + ret = 0; +out: + return ret; +} + +void +afr_reset_children (int32_t *fresh_children, int32_t child_count) +{ + unsigned int i = 0; + for (i = 0; i < child_count; i++) + fresh_children[i] = -1; +} + +int32_t* +afr_children_create (int32_t child_count) +{ + int32_t *children = NULL; + int i = 0; + + GF_ASSERT (child_count > 0); + + children = GF_CALLOC (child_count, sizeof (*children), + gf_afr_mt_int32_t); + if (NULL == children) + goto out; + for (i = 0; i < child_count; i++) + children[i] = -1; +out: + return children; +} + +void +afr_children_add_child (int32_t *children, int32_t child, + int32_t child_count) +{ + gf_boolean_t child_found = _gf_false; + int i = 0; + + for (i = 0; i < child_count; i++) { + if (children[i] == -1) + break; + if (children[i] == child) { + child_found = _gf_true; + break; + } + } + + if (!child_found) { + GF_ASSERT (i < child_count); + children[i] = child; + } +} + +void +afr_children_rm_child (int32_t *children, int32_t child, int32_t child_count) +{ + int i = 0; + + GF_ASSERT ((child >= 0) && (child < child_count)); + for (i = 0; i < child_count; i++) { + if (children[i] == -1) + break; + if (children[i] == child) { + if (i != (child_count - 1)) + memmove (children + i, children + i + 1, + sizeof (*children)*(child_count - i - 1)); + children[child_count - 1] = -1; + break; + } + } +} + +int +afr_get_children_count (int32_t *children, unsigned int child_count) +{ + int count = 0; + int i = 0; + + for (i = 0; i < child_count; i++) { + if (children[i] == -1) + break; + count++; + } + return count; +} + +void +afr_set_low_priority (call_frame_t *frame) +{ + frame->root->pid = LOW_PRIO_PROC_PID; +} + +int +afr_child_fd_ctx_set (xlator_t *this, fd_t *fd, int32_t child, + int flags) +{ + int ret = 0; + uint64_t ctx = 0; + afr_fd_ctx_t *fd_ctx = NULL; + + GF_ASSERT (fd && fd->inode); + ret = afr_fd_ctx_set (this, fd); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "could not set fd ctx for fd=%p", fd); + goto out; + } + + ret = fd_ctx_get (fd, this, &ctx); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "could not get fd ctx for fd=%p", fd); + goto out; + } + + fd_ctx = (afr_fd_ctx_t *)(long) ctx; + fd_ctx->opened_on[child] = AFR_FD_OPENED; + if (!IA_ISDIR (fd->inode->ia_type)) { + fd_ctx->flags = flags; + } + ret = 0; +out: + return ret; +} + +gf_boolean_t +afr_have_quorum (char *logname, afr_private_t *priv) +{ + unsigned int quorum = 0; + + GF_VALIDATE_OR_GOTO(logname,priv,out); + + quorum = priv->quorum_count; + if (quorum != AFR_QUORUM_AUTO) { + return (priv->up_count >= (priv->down_count + quorum)); + } + + quorum = priv->child_count / 2 + 1; + if (priv->up_count >= (priv->down_count + quorum)) { + return _gf_true; + } + + /* + * Special case for even numbers of nodes: if we have exactly half + * and that includes the first ("senior-most") node, then that counts + * as quorum even if it wouldn't otherwise. This supports e.g. N=2 + * while preserving the critical property that there can only be one + * such group. + */ + if ((priv->child_count % 2) == 0) { + quorum = priv->child_count / 2; + if (priv->up_count >= (priv->down_count + quorum)) { + if (priv->child_up[0]) { + return _gf_true; + } + } + } + +out: + return _gf_false; +} + +void +afr_priv_destroy (afr_private_t *priv) +{ + int i = 0; + + if (!priv) + goto out; + inode_unref (priv->root_inode); + GF_FREE (priv->shd.pos); + GF_FREE (priv->shd.pending); + GF_FREE (priv->shd.inprogress); +// for (i = 0; i < priv->child_count; i++) +// if (priv->shd.timer && priv->shd.timer[i]) +// gf_timer_call_cancel (this->ctx, priv->shd.timer[i]); + GF_FREE (priv->shd.timer); + + if (priv->shd.healed) + eh_destroy (priv->shd.healed); + + if (priv->shd.heal_failed) + eh_destroy (priv->shd.heal_failed); + + if (priv->shd.split_brain) + eh_destroy (priv->shd.split_brain); + + for (i = 0; i < priv->child_count; i++) + { + if (priv->shd.statistics[i]) + eh_destroy (priv->shd.statistics[i]); + } + + GF_FREE (priv->shd.statistics); + + GF_FREE (priv->shd.crawl_events); + + GF_FREE (priv->last_event); + if (priv->pending_key) { for (i = 0; i < priv->child_count; i++) - if (child_up[i]) - up_children++; + GF_FREE (priv->pending_key[i]); + } + GF_FREE (priv->pending_key); + GF_FREE (priv->children); + GF_FREE (priv->child_up); + LOCK_DESTROY (&priv->lock); + LOCK_DESTROY (&priv->read_child_lock); + pthread_mutex_destroy (&priv->mutex); + GF_FREE (priv); +out: + return; +} + +int +xlator_subvolume_count (xlator_t *this) +{ + int i = 0; + xlator_list_t *list = NULL; + + for (list = this->children; list; list = list->next) + i++; + return i; +} + +inline gf_boolean_t +afr_is_errno_set (int *child_errno, int child) +{ + return child_errno[child]; +} - if (up_children == 0) { - gf_log (this->name, GF_LOG_ERROR, - "All subvolumes are down. Going offline " - "until atleast one of them comes back up."); +inline gf_boolean_t +afr_is_errno_unset (int *child_errno, int child) +{ + return !afr_is_errno_set (child_errno, child); +} - default_notify (this, event, data); +void +afr_prepare_new_entry_pending_matrix (int32_t **pending, + gf_boolean_t (*is_pending) (int *, int), + int *ctx, struct iatt *buf, + unsigned int child_count) +{ + int midx = 0; + int idx = 0; + int i = 0; + + midx = afr_index_for_transaction_type (AFR_METADATA_TRANSACTION); + if (IA_ISDIR (buf->ia_type)) + idx = afr_index_for_transaction_type (AFR_ENTRY_TRANSACTION); + else if (IA_ISREG (buf->ia_type)) + idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION); + else + idx = -1; + for (i = 0; i < child_count; i++) { + if (is_pending (ctx, i)) { + pending[i][midx] = hton32 (1); + if (idx == -1) + continue; + pending[i][idx] = hton32 (1); } + } +} - break; +gf_boolean_t +afr_is_fd_fixable (fd_t *fd) +{ + if (!fd || !fd->inode) + return _gf_false; + else if (fd_is_anonymous (fd)) + return _gf_false; + else if (uuid_is_null (fd->inode->gfid)) + return _gf_false; + + return _gf_true; +} - default: - default_notify (this, event, data); +void +afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + inode_t *inode = NULL; + afr_inode_ctx_t *ctx = NULL; + + local = frame->local; + + if (local->fd) + inode = local->fd->inode; + else + inode = local->loc.inode; + + if (!inode) + return; + + LOCK (&inode->lock); + { + ctx = __afr_inode_ctx_get (inode, this); + ctx->open_fd_count = local->open_fd_count; } + UNLOCK (&inode->lock); +} - return 0; +int +afr_initialise_statistics (xlator_t *this) +{ + afr_private_t *priv = NULL; + int ret = -1; + int i = 0; + int child_count = 0; + eh_t *stats_per_brick = NULL; + shd_crawl_event_t ***shd_crawl_events = NULL; + priv = this->private; + + priv->shd.statistics = GF_CALLOC (sizeof(eh_t *), priv->child_count, + gf_common_mt_eh_t); + if (!priv->shd.statistics) { + ret = -1; + goto out; + } + child_count = priv->child_count; + for (i=0; i < child_count ; i++) { + stats_per_brick = eh_new (AFR_STATISTICS_HISTORY_SIZE, + _gf_false, + _destroy_crawl_event_data); + if (!stats_per_brick) { + ret = -1; + goto out; + } + priv->shd.statistics[i] = stats_per_brick; + + } + + shd_crawl_events = (shd_crawl_event_t***)(&priv->shd.crawl_events); + *shd_crawl_events = GF_CALLOC (sizeof(shd_crawl_event_t*), + priv->child_count, + gf_afr_mt_shd_crawl_event_t); + + if (!priv->shd.crawl_events) { + ret = -1; + goto out; + } + ret = 0; +out: + return ret; } diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c index 005f62c57..689dd84e6 100644 --- a/xlators/cluster/afr/src/afr-dir-read.c +++ b/xlators/cluster/afr/src/afr-dir-read.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2009 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ @@ -49,9 +40,9 @@ #include "afr-self-heal.h" #include "afr-self-heal-common.h" - int -afr_examine_dir_sh_unwind (call_frame_t *frame, xlator_t *this) +afr_examine_dir_sh_unwind (call_frame_t *frame, xlator_t *this, int32_t op_ret, + int32_t op_errno, int32_t sh_failed) { afr_local_t *local = NULL; @@ -60,7 +51,7 @@ afr_examine_dir_sh_unwind (call_frame_t *frame, xlator_t *this) afr_set_opendir_done (this, local->fd->inode); AFR_STACK_UNWIND (opendir, frame, local->op_ret, - local->op_errno, local->fd); + local->op_errno, local->fd, NULL); return 0; } @@ -70,16 +61,19 @@ gf_boolean_t __checksums_differ (uint32_t *checksum, int child_count, unsigned char *child_up) { - int ret = _gf_false; - int i = 0; - - uint32_t cksum; - - cksum = checksum[0]; + int ret = _gf_false; + int i = 0; + uint32_t cksum = 0; + gf_boolean_t activate_check = _gf_false; for (i = 0; i < child_count; i++) { if (!child_up[i]) continue; + if (_gf_false == activate_check) { + cksum = checksum[i]; + activate_check = _gf_true; + continue; + } if (cksum != checksum[i]) { ret = _gf_true; @@ -96,40 +90,45 @@ __checksums_differ (uint32_t *checksum, int child_count, int32_t afr_examine_dir_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - gf_dirent_t *entries) + gf_dirent_t *entries, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_self_heal_t * sh = NULL; - - gf_dirent_t * entry = NULL; - gf_dirent_t * tmp = NULL; - - int child_index = 0; - - uint32_t entry_cksum; - - int call_count = 0; - off_t last_offset = 0; - char sh_type_str[256] = {0,}; + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + afr_self_heal_t * sh = NULL; + gf_dirent_t * entry = NULL; + gf_dirent_t * tmp = NULL; + char *reason = NULL; + int child_index = 0; + uint32_t entry_cksum = 0; + int call_count = 0; + off_t last_offset = 0; + inode_t *inode = NULL; priv = this->private; local = frame->local; sh = &local->self_heal; + inode = local->fd->inode; child_index = (long) cookie; if (op_ret == -1) { + gf_log (this->name, GF_LOG_INFO, + "%s: failed to do opendir on %s", + local->loc.path, priv->children[child_index]->name); local->op_ret = -1; local->op_ret = op_errno; goto out; } - if (op_ret == 0) + if (op_ret == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "%s: no entries found in %s", + local->loc.path, priv->children[child_index]->name); goto out; + } list_for_each_entry_safe (entry, tmp, &entries->list, list) { - entry_cksum = gf_rsync_weak_checksum (entry->d_name, + entry_cksum = gf_rsync_weak_checksum ((unsigned char *)entry->d_name, strlen (entry->d_name)); local->cont.opendir.checksum[child_index] ^= entry_cksum; } @@ -144,39 +143,30 @@ afr_examine_dir_readdir_cbk (call_frame_t *frame, void *cookie, (void *) (long) child_index, priv->children[child_index], priv->children[child_index]->fops->readdir, - local->fd, 131072, last_offset); + local->fd, 131072, last_offset, NULL); + + return 0; out: - if ((op_ret == 0) || (op_ret == -1)) { - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if (__checksums_differ (local->cont.opendir.checksum, - priv->child_count, - local->child_up)) { - - sh->need_entry_self_heal = _gf_true; - sh->forced_merge = _gf_true; - sh->type = local->fd->inode->ia_type; - sh->background = _gf_false; - sh->unwind = afr_examine_dir_sh_unwind; - - afr_self_heal_type_str_get(&local->self_heal, - sh_type_str, - sizeof(sh_type_str)); - gf_log (this->name, GF_LOG_NORMAL, - "%s self-heal triggered. path: %s, " - "reason: checksums of directory differ," - " forced merge option set", - sh_type_str, local->loc.path); - - afr_self_heal (frame, this); - } else { - afr_set_opendir_done (this, local->fd->inode); - - AFR_STACK_UNWIND (opendir, frame, local->op_ret, - local->op_errno, local->fd); - } + call_count = afr_frame_return (frame); + + if (call_count == 0) { + if (__checksums_differ (local->cont.opendir.checksum, + priv->child_count, + local->child_up)) { + + sh->do_entry_self_heal = _gf_true; + sh->forced_merge = _gf_true; + + reason = "checksums of directory differ"; + afr_launch_self_heal (frame, this, inode, _gf_false, + inode->ia_type, reason, NULL, + afr_examine_dir_sh_unwind); + } else { + afr_set_opendir_done (this, inode); + + AFR_STACK_UNWIND (opendir, frame, local->op_ret, + local->op_errno, local->fd, NULL); } } @@ -187,20 +177,19 @@ out: int afr_examine_dir (call_frame_t *frame, xlator_t *this) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - - int i; - int call_count = 0; + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + int i = 0; + int call_count = 0; local = frame->local; priv = this->private; local->cont.opendir.checksum = GF_CALLOC (priv->child_count, - sizeof (*local->cont.opendir.checksum), - gf_afr_mt_int32_t); + sizeof (*local->cont.opendir.checksum), + gf_afr_mt_int32_t); - call_count = afr_up_children_count (priv->child_count, local->child_up); + call_count = afr_up_children_count (local->child_up, priv->child_count); local->call_count = call_count; @@ -210,7 +199,7 @@ afr_examine_dir (call_frame_t *frame, xlator_t *this) (void *) (long) i, priv->children[i], priv->children[i]->fops->readdir, - local->fd, 131072, 0); + local->fd, 131072, 0, NULL); if (!--call_count) break; @@ -223,142 +212,139 @@ afr_examine_dir (call_frame_t *frame, xlator_t *this) int32_t afr_opendir_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - fd_t *fd) + xlator_t *this, int32_t op_ret, int32_t op_errno, + fd_t *fd, dict_t *xdata) { afr_private_t *priv = NULL; - afr_local_t *local = NULL; + afr_local_t *local = NULL; int32_t up_children_count = 0; - int ret = -1; - - int call_count = -1; + int ret = -1; + int call_count = -1; + int32_t child_index = 0; priv = this->private; local = frame->local; + child_index = (long) cookie; + + up_children_count = afr_up_children_count (local->child_up, + priv->child_count); + + LOCK (&frame->lock); + { + if (op_ret >= 0) { + local->op_ret = op_ret; + ret = afr_child_fd_ctx_set (this, fd, child_index, 0); + if (ret) { + local->op_ret = -1; + local->op_errno = -ret; + goto unlock; + } + } - up_children_count = afr_up_children_count (priv->child_count, - local->child_up); + local->op_errno = op_errno; + } +unlock: + UNLOCK (&frame->lock); - LOCK (&frame->lock); - { - if (op_ret >= 0) - local->op_ret = op_ret; + call_count = afr_frame_return (frame); - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); + if (call_count == 0) { + if (local->op_ret != 0) + goto out; - call_count = afr_frame_return (frame); + if (!afr_is_opendir_done (this, local->fd->inode) && + up_children_count > 1 && priv->entry_self_heal) { - if (call_count == 0) { - if (local->op_ret == 0) { + /* + * This is the first opendir on this inode. We need + * to check if the directory's entries are the same + * on all subvolumes. This is needed in addition + * to regular entry self-heal because the readdir + * call is sent only to the first subvolume, and + * thus files that exist only there will never be healed + * otherwise (assuming changelog shows no anomalies). + */ - ret = afr_fd_ctx_set (this, local->fd); + gf_log (this->name, GF_LOG_TRACE, + "reading contents of directory %s looking for mismatch", + local->loc.path); + + afr_examine_dir (frame, this); - if (ret) { - local->op_ret = -1; - local->op_errno = -1; - gf_log (this->name, GF_LOG_ERROR, " failed to " - "set fd ctx for fd %d", local->fd); - goto out; - } - if (!afr_is_opendir_done (this, local->fd->inode) && - up_children_count > 1) { - - /* - * This is the first opendir on this inode. We need - * to check if the directory's entries are the same - * on all subvolumes. This is needed in addition - * to regular entry self-heal because the readdir - * call is sent only to the first subvolume, and - * thus files that exist only there will never be healed - * otherwise (assuming changelog shows no anamolies). - */ - - gf_log (this->name, GF_LOG_TRACE, - "reading contents of directory %s looking for mismatch", - local->loc.path); - - afr_examine_dir (frame, this); - - } else { - AFR_STACK_UNWIND (opendir, frame, local->op_ret, - local->op_errno, local->fd); - } } else { -out: - AFR_STACK_UNWIND (opendir, frame, local->op_ret, - local->op_errno, local->fd); + /* do the unwind */ + goto out; } } - return 0; + return 0; + +out: + AFR_STACK_UNWIND (opendir, frame, local->op_ret, + local->op_errno, local->fd, NULL); + + return 0; } -int32_t +int32_t afr_opendir (call_frame_t *frame, xlator_t *this, - loc_t *loc, fd_t *fd) + loc_t *loc, fd_t *fd) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - - int child_count = 0; - int i = 0; + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + int child_count = 0; + int i = 0; + int ret = -1; + int call_count = -1; + int32_t op_errno = 0; - int ret = -1; - int call_count = -1; + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); - int32_t op_ret = -1; - int32_t op_errno = 0; + priv = this->private; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + child_count = priv->child_count; - priv = this->private; - - child_count = priv->child_count; + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; - ALLOC_OR_GOTO (local, afr_local_t, out); - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) + goto out; loc_copy (&local->loc, loc); - frame->local = local; - local->fd = fd_ref (fd); + local->fd = fd_ref (fd); - call_count = local->call_count; - - for (i = 0; i < child_count; i++) { - if (local->child_up[i]) { - STACK_WIND (frame, afr_opendir_cbk, - priv->children[i], - priv->children[i]->fops->opendir, - loc, fd); + call_count = local->call_count; - if (!--call_count) - break; - } - } + for (i = 0; i < child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_opendir_cbk, + (void*) (long) i, + priv->children[i], + priv->children[i]->fops->opendir, + loc, fd, NULL); - op_ret = 0; + if (!--call_count) + break; + } + } + + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (opendir, frame, op_ret, op_errno, fd); - } + if (ret < 0) + AFR_STACK_UNWIND (opendir, frame, -1, op_errno, fd, NULL); - return 0; + return 0; } /** * Common algorithm for directory read calls: - * + * * - Try the fop on the first child that is up * - if we have failed due to ENOTCONN: * try the next child @@ -372,373 +358,179 @@ struct entry_name { struct list_head list; }; - -static gf_boolean_t -remembered_name (const char *name, struct list_head *entries) -{ - struct entry_name *e; - gf_boolean_t ret = _gf_false; - - list_for_each_entry (e, entries, list) { - if (!strcmp (name, e->name)) { - ret = _gf_true; - goto out; - } - } - -out: - return ret; -} - - static void -afr_remember_entries (gf_dirent_t *entries, fd_t *fd) -{ - struct entry_name *n = NULL; - gf_dirent_t * entry = NULL; - - int ret = 0; - - uint64_t ctx; - afr_fd_ctx_t *fd_ctx; - - ret = fd_ctx_get (fd, THIS, &ctx); - if (ret < 0) { - gf_log (THIS->name, GF_LOG_DEBUG, - "could not get fd ctx for fd=%p", fd); - return; - } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - list_for_each_entry (entry, &entries->list, list) { - n = GF_CALLOC (1, sizeof (*n), gf_afr_mt_entry_name); - n->name = gf_strdup (entry->d_name); - INIT_LIST_HEAD (&n->list); - - list_add (&n->list, &fd_ctx->entries); - } -} - - -static off_t -afr_filter_entries (gf_dirent_t *entries, fd_t *fd) +afr_forget_entries (fd_t *fd) { - gf_dirent_t *entry, *tmp; - int ret = 0; - - uint64_t ctx; - afr_fd_ctx_t *fd_ctx; - - off_t offset = 0; - - ret = fd_ctx_get (fd, THIS, &ctx); - if (ret < 0) { - gf_log (THIS->name, GF_LOG_DEBUG, - "could not get fd ctx for fd=%p", fd); - return -1; - } + struct entry_name *entry = NULL; + struct entry_name *tmp = NULL; + int ret = 0; + uint64_t ctx = 0; + afr_fd_ctx_t *fd_ctx = NULL; + + ret = fd_ctx_get (fd, THIS, &ctx); + if (ret < 0) { + gf_log (THIS->name, GF_LOG_INFO, + "could not get fd ctx for fd=%p", fd); + return; + } fd_ctx = (afr_fd_ctx_t *)(long) ctx; - list_for_each_entry_safe (entry, tmp, &entries->list, list) { - offset = entry->d_off; - - if (remembered_name (entry->d_name, &fd_ctx->entries)) { - list_del (&entry->list); - GF_FREE (entry); - } - } - - return offset; + list_for_each_entry_safe (entry, tmp, &fd_ctx->entries, list) { + GF_FREE (entry->name); + list_del (&entry->list); + GF_FREE (entry); + } } - static void -afr_forget_entries (fd_t *fd) +afr_readdir_filter_trash_dir (gf_dirent_t *entries, fd_t *fd) { - struct entry_name *entry, *tmp; - int ret = 0; - - uint64_t ctx; - afr_fd_ctx_t *fd_ctx; - - ret = fd_ctx_get (fd, THIS, &ctx); - if (ret < 0) { - gf_log (THIS->name, GF_LOG_DEBUG, - "could not get fd ctx for fd=%p", fd); - return; - } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; + gf_dirent_t * entry = NULL; + gf_dirent_t * tmp = NULL; - list_for_each_entry_safe (entry, tmp, &fd_ctx->entries, list) { - GF_FREE (entry->name); - list_del (&entry->list); - GF_FREE (entry); - } + list_for_each_entry_safe (entry, tmp, &entries->list, list) { + if (__is_root_gfid (fd->inode->gfid) && + !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) { + list_del_init (&entry->list); + GF_FREE (entry); + } + } } - int32_t afr_readdir_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - gf_dirent_t *entries) + xlator_t *this, int32_t op_ret, int32_t op_errno, + gf_dirent_t *entries, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; + afr_local_t *local = NULL; - gf_dirent_t * entry = NULL; - gf_dirent_t * tmp = NULL; - - int child_index = -1; - - priv = this->private; - local = frame->local; - child_index = (long) cookie; - - if (op_ret != -1) { - list_for_each_entry_safe (entry, tmp, &entries->list, list) { - entry->d_ino = afr_itransform (entry->d_ino, - priv->child_count, - child_index); - - if ((local->fd->inode == local->fd->inode->table->root) - && !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) { - list_del_init (&entry->list); - GF_FREE (entry); - } - } - } + if (op_ret == -1) + goto out; - AFR_STACK_UNWIND (readdir, frame, op_ret, op_errno, entries); + local = frame->local; + afr_readdir_filter_trash_dir (entries, local->fd); - return 0; +out: + AFR_STACK_UNWIND (readdir, frame, op_ret, op_errno, entries, NULL); + return 0; } int32_t afr_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, gf_dirent_t *entries) + int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, + dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; - ino_t inum = 0; - - int call_child = 0; - int ret = 0; - - gf_dirent_t * entry = NULL; - gf_dirent_t * tmp = NULL; - - int child_index = -1; + afr_local_t *local = NULL; - uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; - - off_t offset = 0; - - priv = this->private; - children = priv->children; + if (op_ret == -1) + goto out; local = frame->local; - - child_index = (long) cookie; - - if (priv->strict_readdir) { - ret = fd_ctx_get (local->fd, this, &ctx); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "could not get fd ctx for fd=%p", local->fd); - op_ret = -1; - op_errno = -ret; - goto out; - } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - if (child_went_down (op_ret, op_errno)) { - if (all_tried (child_index, priv->child_count)) { - goto out; - } - - call_child = ++child_index; - - gf_log (this->name, GF_LOG_TRACE, - "starting readdir afresh on child %d, offset %"PRId64, - call_child, (uint64_t) 0); - - fd_ctx->failed_over = _gf_true; - - STACK_WIND_COOKIE (frame, afr_readdirp_cbk, - (void *) (long) call_child, - children[call_child], - children[call_child]->fops->readdirp, local->fd, - local->cont.readdir.size, 0); - return 0; - } - } - - if (op_ret != -1) { - list_for_each_entry_safe (entry, tmp, &entries->list, list) { - inum = afr_itransform (entry->d_ino, priv->child_count, - child_index); - entry->d_ino = inum; - inum = afr_itransform (entry->d_stat.ia_ino, - priv->child_count, child_index); - entry->d_stat.ia_ino = inum; - - if ((local->fd->inode == local->fd->inode->table->root) - && !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) { - list_del_init (&entry->list); - GF_FREE (entry); - } - } - } - - if (priv->strict_readdir) { - if (fd_ctx->failed_over) { - if (list_empty (&entries->list)) { - goto out; - } - - offset = afr_filter_entries (entries, local->fd); - - afr_remember_entries (entries, local->fd); - - if (list_empty (&entries->list)) { - /* All the entries we got were duplicate. We - shouldn't send an empty list now, because - that'll make the application stop reading. So - try to get more entries */ - - gf_log (this->name, GF_LOG_TRACE, - "trying to fetch non-duplicate entries from offset %"PRId64", child %s", - offset, children[child_index]->name); - - STACK_WIND_COOKIE (frame, afr_readdirp_cbk, - (void *) (long) child_index, - children[child_index], - children[child_index]->fops->readdirp, - local->fd, local->cont.readdir.size, offset); - return 0; - } - } else { - afr_remember_entries (entries, local->fd); - } - } + afr_readdir_filter_trash_dir (entries, local->fd); out: - AFR_STACK_UNWIND (readdirp, frame, op_ret, op_errno, entries); - + AFR_STACK_UNWIND (readdirp, frame, op_ret, op_errno, entries, NULL); return 0; } - int32_t afr_do_readdir (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset, int whichop) + fd_t *fd, size_t size, off_t offset, int whichop, dict_t *dict) { - afr_private_t * priv = NULL; - xlator_t ** children = NULL; - int call_child = 0; - afr_local_t *local = NULL; - - uint64_t ctx; - afr_fd_ctx_t *fd_ctx; + afr_private_t *priv = NULL; + xlator_t **children = NULL; + int call_child = 0; + afr_local_t *local = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + int ret = -1; + int32_t op_errno = 0; + uint64_t read_child = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); - int ret = -1; + priv = this->private; + children = priv->children; - int32_t op_ret = -1; - int32_t op_errno = 0; + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) + goto out; - priv = this->private; - children = priv->children; + local->fresh_children = afr_children_create (priv->child_count); + if (!local->fresh_children) { + op_errno = ENOMEM; + goto out; + } - ALLOC_OR_GOTO (local, afr_local_t, out); - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } + read_child = afr_inode_get_read_ctx (this, fd->inode, + local->fresh_children); + ret = afr_get_call_child (this, local->child_up, read_child, + local->fresh_children, + &call_child, + &local->cont.readdir.last_index); + if (ret < 0) { + op_errno = -ret; + goto out; + } - frame->local = local; + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) { + op_errno = EBADF; + goto out; + } - call_child = afr_first_up_child (priv); - if (call_child == -1) { - op_errno = ENOTCONN; - gf_log (this->name, GF_LOG_DEBUG, - "no child is up"); - goto out; - } + if ((offset == 0) || (fd_ctx->call_child == -1)) { + fd_ctx->call_child = call_child; + } else if ((priv->readdir_failover == _gf_false) && + (call_child != fd_ctx->call_child)) { + op_errno = EBADF; + goto out; + } local->fd = fd_ref (fd); local->cont.readdir.size = size; - - if (priv->strict_readdir) { - ret = fd_ctx_get (fd, this, &ctx); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "could not get fd ctx for fd=%p", fd); - op_errno = -ret; - goto out; - } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - if (fd_ctx->last_tried != call_child) { - gf_log (this->name, GF_LOG_TRACE, - "first up child has changed from %d to %d, restarting readdir from offset 0", - fd_ctx->last_tried, call_child); - - fd_ctx->failed_over = _gf_true; - offset = 0; - } - - fd_ctx->last_tried = call_child; - } + local->cont.readdir.dict = (dict)? dict_ref (dict) : NULL; if (whichop == GF_FOP_READDIR) STACK_WIND_COOKIE (frame, afr_readdir_cbk, (void *) (long) call_child, children[call_child], children[call_child]->fops->readdir, fd, - size, offset); + size, offset, dict); else STACK_WIND_COOKIE (frame, afr_readdirp_cbk, (void *) (long) call_child, children[call_child], children[call_child]->fops->readdirp, fd, - size, offset); + size, offset, dict); - op_ret = 0; + return 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (readdir, frame, op_ret, op_errno, NULL); - } - return 0; + AFR_STACK_UNWIND (readdir, frame, -1, op_errno, NULL, NULL); + return 0; } int32_t afr_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset) + off_t offset, dict_t *xdata) { - afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIR); + afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIR, xdata); return 0; } int32_t afr_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset) + off_t offset, dict_t *dict) { - afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIRP); + afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIRP, dict); return 0; } @@ -746,8 +538,8 @@ afr_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, int32_t afr_releasedir (xlator_t *this, fd_t *fd) { - afr_forget_entries (fd); + afr_forget_entries (fd); afr_cleanup_fd_ctx (this, fd); - return 0; + return 0; } diff --git a/xlators/cluster/afr/src/afr-dir-read.h b/xlators/cluster/afr/src/afr-dir-read.h index e071107e6..09456d159 100644 --- a/xlators/cluster/afr/src/afr-dir-read.h +++ b/xlators/cluster/afr/src/afr-dir-read.h @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2009 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef __DIR_READ_H__ @@ -23,28 +14,23 @@ int32_t afr_opendir (call_frame_t *frame, xlator_t *this, - loc_t *loc, fd_t *fd); + loc_t *loc, fd_t *fd, dict_t *xdata); int32_t afr_releasedir (xlator_t *this, fd_t *fd); int32_t afr_readdir (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset); + fd_t *fd, size_t size, off_t offset, dict_t *xdata); int32_t afr_readdirp (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset); - -int32_t -afr_getdents (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset, int32_t flag); - + fd_t *fd, size_t size, off_t offset, dict_t *dict); int32_t afr_checksum (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t flags); + loc_t *loc, int32_t flags, dict_t *xdata); #endif /* __DIR_READ_H__ */ diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c index 9f77324a3..1943b719b 100644 --- a/xlators/cluster/afr/src/afr-dir-write.c +++ b/xlators/cluster/afr/src/afr-dir-write.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2009 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ @@ -47,28 +38,222 @@ #include "afr.h" #include "afr-transaction.h" +int +afr_build_parent_loc (loc_t *parent, loc_t *child, int32_t *op_errno) +{ + int ret = -1; + char *child_path = NULL; + + if (!child->parent) { + if (op_errno) + *op_errno = EINVAL; + goto out; + } + + child_path = gf_strdup (child->path); + if (!child_path) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } + parent->path = gf_strdup( dirname (child_path) ); + if (!parent->path) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } + parent->inode = inode_ref (child->parent); + uuid_copy (parent->gfid, child->pargfid); + + ret = 0; +out: + GF_FREE(child_path); + + return ret; +} + +void +__dir_entry_fop_common_cbk (call_frame_t *frame, int child_index, + xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, struct iatt *prenewparent, + struct iatt *postnewparent) +{ + afr_local_t *local = NULL; + + local = frame->local; + + if (afr_fop_failed (op_ret, op_errno)) + afr_transaction_fop_failed (frame, this, child_index); + + if (op_ret > -1) { + local->op_ret = op_ret; + + if ((local->success_count == 0) || + (child_index == local->read_child_index)) { + local->cont.dir_fop.preparent = *preparent; + local->cont.dir_fop.postparent = *postparent; + if (buf) + local->cont.dir_fop.buf = *buf; + if (prenewparent) + local->cont.dir_fop.prenewparent = *prenewparent; + if (postnewparent) + local->cont.dir_fop.postnewparent = *postnewparent; + } + + local->cont.dir_fop.inode = inode; + + local->fresh_children[local->success_count] = child_index; + local->success_count++; + local->child_errno[child_index] = 0; + } else { + local->child_errno[child_index] = op_errno; + } + + local->op_errno = op_errno; +} + +int +afr_mark_new_entry_changelog_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + dict_t *xattr, dict_t *xdata) +{ + int call_count = 0; + + call_count = afr_frame_return (frame); + if (call_count == 0) { + AFR_STACK_DESTROY (frame); + } + return 0; +} + +void +afr_mark_new_entry_changelog (call_frame_t *frame, xlator_t *this) +{ + call_frame_t *new_frame = NULL; + afr_local_t *local = NULL; + afr_local_t *new_local = NULL; + afr_private_t *priv = NULL; + dict_t **xattr = NULL; + int32_t **changelog = NULL; + int i = 0; + GF_UNUSED int op_errno = 0; + + local = frame->local; + priv = this->private; + + new_frame = copy_frame (frame); + if (!new_frame) { + goto out; + } + + AFR_LOCAL_ALLOC_OR_GOTO (new_frame->local, out); + new_local = new_frame->local; + changelog = afr_matrix_create (priv->child_count, AFR_NUM_CHANGE_LOGS); + if (!changelog) + goto out; + + xattr = GF_CALLOC (priv->child_count, sizeof (*xattr), + gf_afr_mt_dict_t); + if (!xattr) + goto out; + for (i = 0; i < priv->child_count; i++) { + if (local->child_errno[i]) + continue; + xattr[i] = dict_new (); + if (!xattr[i]) + goto out; + } + + afr_prepare_new_entry_pending_matrix (changelog, + afr_is_errno_set, + local->child_errno, + &local->cont.dir_fop.buf, + priv->child_count); + + new_local->pending = changelog; + uuid_copy (new_local->loc.gfid, local->cont.dir_fop.buf.ia_gfid); + new_local->loc.inode = inode_ref (local->cont.dir_fop.inode); + new_local->call_count = local->success_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_errno[i]) + continue; + + afr_set_pending_dict (priv, xattr[i], changelog, i, LOCAL_LAST); + STACK_WIND_COOKIE (new_frame, afr_mark_new_entry_changelog_cbk, + (void *) (long) i, priv->children[i], + priv->children[i]->fops->xattrop, + &new_local->loc, GF_XATTROP_ADD_ARRAY, + xattr[i], NULL); + } + new_frame = NULL; +out: + if (new_frame) + AFR_STACK_DESTROY (new_frame); + afr_xattr_array_destroy (xattr, priv->child_count); + return; +} + +gf_boolean_t +afr_is_new_entry_changelog_needed (glusterfs_fop_t fop) +{ + glusterfs_fop_t fops[] = {GF_FOP_CREATE, GF_FOP_MKNOD, GF_FOP_NULL}; + int i = 0; + + for (i = 0; fops[i] != GF_FOP_NULL; i++) { + if (fop == fops[i]) + return _gf_true; + } + return _gf_false; +} void -afr_build_parent_loc (loc_t *parent, loc_t *child) +afr_dir_fop_mark_entry_pending_changelog (call_frame_t *frame, xlator_t *this) { - char *tmp = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + if (local->op_ret < 0) + goto out; - if (!child->parent) { - loc_copy (parent, child); - return; - } + if (local->success_count == priv->child_count) + goto out; - tmp = gf_strdup (child->path); - parent->path = gf_strdup (dirname (tmp)); - GF_FREE (tmp); + if (!afr_is_new_entry_changelog_needed (local->op)) + goto out; - parent->name = strrchr (parent->path, '/'); - if (parent->name) - parent->name++; + afr_mark_new_entry_changelog (frame, this); - parent->inode = inode_ref (child->parent); - parent->parent = inode_parent (parent->inode, 0, NULL); - parent->ino = parent->inode->ino; +out: + return; +} + +void +afr_dir_fop_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + if (local->cont.dir_fop.inode == NULL) + goto done; + afr_set_read_ctx_from_policy (this, local->cont.dir_fop.inode, + local->fresh_children, + local->read_child_index, + priv->read_child, + local->cont.dir_fop.buf.ia_gfid); +done: + local->transaction.unwind (frame, this); + afr_dir_fop_mark_entry_pending_changelog (frame, this); + local->transaction.resume (frame, this); } /* {{{ create */ @@ -76,77 +261,57 @@ afr_build_parent_loc (loc_t *parent, loc_t *child) int afr_create_unwind (call_frame_t *frame, xlator_t *this) { - call_frame_t *main_frame = NULL; - afr_local_t *local = NULL; - struct iatt *unwind_buf = NULL; - - local = frame->local; - - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - if (local->cont.create.read_child_buf.ia_ino) { - unwind_buf = &local->cont.create.read_child_buf; - } else { - unwind_buf = &local->cont.create.buf; - } + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; - unwind_buf->ia_ino = local->cont.create.ino; + local = frame->local; - local->cont.create.preparent.ia_ino = local->cont.create.parent_ino; - local->cont.create.postparent.ia_ino = local->cont.create.parent_ino; + LOCK (&frame->lock); + { + if (local->transaction.main_frame) { + main_frame = local->transaction.main_frame; + } + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); - AFR_STACK_UNWIND (create, main_frame, + if (main_frame) { + AFR_STACK_UNWIND (create, main_frame, local->op_ret, local->op_errno, - local->cont.create.fd, - local->cont.create.inode, - unwind_buf, &local->cont.create.preparent, - &local->cont.create.postparent); + local->cont.create.fd, + local->cont.dir_fop.inode, + &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, + local->xdata_rsp); } - - return 0; + + return 0; } int -afr_create_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - fd_t *fd, inode_t *inode, struct iatt *buf, - struct iatt *preparent, struct iatt *postparent) +afr_create_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + fd_t *fd, inode_t *inode, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - uint64_t ctx; - afr_fd_ctx_t *fd_ctx; - - int ret = 0; - - int call_count = -1; - int child_index = -1; + afr_local_t *local = NULL; + uint64_t ctx = 0; + afr_fd_ctx_t *fd_ctx = NULL; + int ret = 0; + int call_count = -1; + int child_index = -1; - local = frame->local; - priv = this->private; + local = frame->local; - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - local->op_ret = op_ret; + child_index = (long) cookie; + LOCK (&frame->lock); + { + if (op_ret > -1) { ret = afr_fd_ctx_set (this, fd); - if (ret < 0) { gf_log (this->name, GF_LOG_ERROR, "could not set ctx on fd=%p", fd); @@ -157,7 +322,6 @@ afr_create_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } ret = fd_ctx_get (fd, this, &ctx); - if (ret < 0) { gf_log (this->name, GF_LOG_ERROR, "could not get fd ctx for fd=%p", fd); @@ -168,194 +332,176 @@ afr_create_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, fd_ctx = (afr_fd_ctx_t *)(long) ctx; - fd_ctx->opened_on[child_index] = 1; + fd_ctx->opened_on[child_index] = AFR_FD_OPENED; fd_ctx->flags = local->cont.create.flags; if (local->success_count == 0) { - local->cont.create.buf = *buf; - - local->cont.create.ino = - afr_itransform (buf->ia_ino, - priv->child_count, - child_index); - - if (priv->read_child >= 0) { - afr_set_read_child (this, inode, - priv->read_child); - } else { - afr_set_read_child (this, inode, - local->read_child_index); - } + if (xdata) + local->xdata_rsp = dict_ref(xdata); } - - if (child_index == local->first_up_child) { - local->cont.create.ino = - afr_itransform (buf->ia_ino, - priv->child_count, - local->first_up_child); - } - - if (child_index == local->read_child_index) { - local->cont.create.read_child_buf = *buf; - local->cont.create.preparent = *preparent; - local->cont.create.postparent = *postparent; - } - - local->cont.create.inode = inode; - - local->success_count++; - } - - local->op_errno = op_errno; - } + } + __dir_entry_fop_common_cbk (frame, child_index, this, + op_ret, op_errno, inode, buf, + preparent, postparent, NULL, NULL); + } unlock: - UNLOCK (&frame->lock); + UNLOCK (&frame->lock); - call_count = afr_frame_return (frame); + call_count = afr_frame_return (frame); - if (call_count == 0) { - local->transaction.unwind (frame, this); + if (call_count == 0) + afr_dir_fop_done (frame, this); - local->transaction.resume (frame, this); - } - - return 0; + return 0; } int afr_create_wind (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - int call_count = -1; - int i = 0; - - local = frame->local; - priv = this->private; - - call_count = afr_up_children_count (priv->child_count, local->child_up); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_create_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->create, - &local->loc, - local->cont.create.flags, - local->cont.create.mode, - local->cont.create.fd, - local->cont.create.params); - if (!--call_count) - break; - } - } - - return 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i]) { + STACK_WIND_COOKIE (frame, afr_create_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->create, + &local->loc, + local->cont.create.flags, + local->cont.create.mode, + local->umask, + local->cont.create.fd, + local->xdata_req); + if (!--call_count) + break; + } + } + + return 0; } int afr_create_done (call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; + afr_local_t * local = NULL; - local = frame->local; + local = frame->local; - local->transaction.unwind (frame, this); + local->transaction.unwind (frame, this); - AFR_STACK_DESTROY (frame); + AFR_STACK_DESTROY (frame); - return 0; + return 0; } int afr_create (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t flags, mode_t mode, - fd_t *fd, dict_t *params) + loc_t *loc, int32_t flags, mode_t mode, + mode_t umask, fd_t *fd, dict_t *params) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - - int ret = -1; - - int op_ret = -1; - int op_errno = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); - priv = this->private; + priv = this->private; - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } + QUORUM_CHECK(create,out); - ALLOC_OR_GOTO (local, afr_local_t, out); + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + op_errno = ENOMEM; + goto out; + } - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } + AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); + local = transaction_frame->local; - transaction_frame->local = local; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) + goto out; - loc_copy (&local->loc, loc); + loc_copy (&local->loc, loc); LOCK (&priv->read_child_lock); { - local->read_child_index = (++priv->read_child_rr) + local->read_child_index = (++priv->read_child_rr) % (priv->child_count); } UNLOCK (&priv->read_child_lock); - local->cont.create.flags = flags; - local->cont.create.mode = mode; - local->cont.create.fd = fd_ref (fd); + local->op = GF_FOP_CREATE; + local->cont.create.flags = flags; + local->cont.create.mode = mode; + local->cont.create.fd = fd_ref (fd); + local->umask = umask; if (params) - local->cont.create.params = dict_ref (params); - - if (loc->parent) - local->cont.create.parent_ino = loc->parent->ino; - - local->transaction.fop = afr_create_wind; - local->transaction.done = afr_create_done; - local->transaction.unwind = afr_create_unwind; - - afr_build_parent_loc (&local->transaction.parent_loc, loc); - - local->transaction.main_frame = frame; - local->transaction.basename = AFR_BASENAME (loc->path); - - afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + local->xdata_req = dict_ref (params); + + local->transaction.fop = afr_create_wind; + local->transaction.done = afr_create_done; + local->transaction.unwind = afr_create_unwind; + + ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, + &op_errno); + if (ret) + goto out; + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME (loc->path); + int_lock = &local->internal_lock; + + int_lock->lockee_count = 0; + ret = afr_init_entry_lockee (&int_lock->lockee[0], local, + &local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) + goto out; + + int_lock->lockee_count++; + ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (create, frame, op_ret, op_errno, - NULL, NULL, NULL, NULL, NULL); - } + if (ret < 0) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (create, frame, -1, op_errno, + NULL, NULL, NULL, NULL, NULL, NULL); + } - return 0; + return 0; } /* }}} */ @@ -365,42 +511,31 @@ out: int afr_mknod_unwind (call_frame_t *frame, xlator_t *this) { - call_frame_t *main_frame = NULL; - afr_local_t *local = NULL; - - struct iatt *unwind_buf = NULL; - - local = frame->local; - - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - if (local->cont.mknod.read_child_buf.ia_ino) { - unwind_buf = &local->cont.mknod.read_child_buf; - } else { - unwind_buf = &local->cont.mknod.buf; - } + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; - unwind_buf->ia_ino = local->cont.mknod.ino; + local = frame->local; - local->cont.mknod.preparent.ia_ino = local->cont.mknod.parent_ino; - local->cont.mknod.postparent.ia_ino = local->cont.mknod.parent_ino; + LOCK (&frame->lock); + { + if (local->transaction.main_frame) { + main_frame = local->transaction.main_frame; + } + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); - AFR_STACK_UNWIND (mknod, main_frame, + if (main_frame) { + AFR_STACK_UNWIND (mknod, main_frame, local->op_ret, local->op_errno, - local->cont.mknod.inode, - unwind_buf, &local->cont.mknod.preparent, - &local->cont.mknod.postparent); + local->cont.dir_fop.inode, + &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, + NULL); } - return 0; + return 0; } @@ -408,203 +543,169 @@ int afr_mknod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - int call_count = -1; - int child_index = -1; - - local = frame->local; - priv = this->private; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - local->op_ret = op_ret; - - if (local->success_count == 0){ - local->cont.mknod.buf = *buf; - local->cont.mknod.ino = - afr_itransform (buf->ia_ino, - priv->child_count, - child_index); - - if (priv->read_child >= 0) { - afr_set_read_child (this, inode, - priv->read_child); - } else { - afr_set_read_child (this, inode, - local->read_child_index); - } - } + int call_count = -1; + int child_index = -1; - if (child_index == local->first_up_child) { - local->cont.mknod.ino = - afr_itransform (buf->ia_ino, - priv->child_count, - local->first_up_child); - } + child_index = (long) cookie; - if (child_index == local->read_child_index) { - local->cont.mknod.read_child_buf = *buf; - local->cont.mknod.preparent = *preparent; - local->cont.mknod.postparent = *postparent; - } - - local->cont.mknod.inode = inode; - - local->success_count++; - } - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); + LOCK (&frame->lock); + { + __dir_entry_fop_common_cbk (frame, child_index, this, + op_ret, op_errno, inode, buf, + preparent, postparent, NULL, NULL); + } + UNLOCK (&frame->lock); - call_count = afr_frame_return (frame); + call_count = afr_frame_return (frame); - if (call_count == 0) { - local->transaction.unwind (frame, this); + if (call_count == 0) + afr_dir_fop_done (frame, this); - local->transaction.resume (frame, this); - } - - return 0; + return 0; } int32_t afr_mknod_wind (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - int call_count = -1; - int i = 0; - - local = frame->local; - priv = this->private; - - call_count = afr_up_children_count (priv->child_count, local->child_up); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_mknod_wind_cbk, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->mknod, - &local->loc, local->cont.mknod.mode, - local->cont.mknod.dev, - local->cont.mknod.params); - if (!--call_count) - break; - } - } - - return 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i]) { + STACK_WIND_COOKIE (frame, afr_mknod_wind_cbk, (void *) (long) i, + priv->children[i], + priv->children[i]->fops->mknod, + &local->loc, local->cont.mknod.mode, + local->cont.mknod.dev, + local->umask, + local->xdata_req); + if (!--call_count) + break; + } + } + + return 0; } int afr_mknod_done (call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; + afr_local_t * local = NULL; - local = frame->local; + local = frame->local; - local->transaction.unwind (frame, this); - AFR_STACK_DESTROY (frame); + local->transaction.unwind (frame, this); + AFR_STACK_DESTROY (frame); - return 0; + return 0; } int -afr_mknod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dev_t dev, dict_t *params) +afr_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t dev, mode_t umask, dict_t *params) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - - int ret = -1; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; - int op_ret = -1; - int op_errno = 0; + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + priv = this->private; - priv = this->private; + QUORUM_CHECK(mknod,out); - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } - - ALLOC_OR_GOTO (local, afr_local_t, out); + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + op_errno = ENOMEM; + goto out; + } - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } + AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); + local = transaction_frame->local; - transaction_frame->local = local; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) + goto out; - loc_copy (&local->loc, loc); + loc_copy (&local->loc, loc); LOCK (&priv->read_child_lock); { - local->read_child_index = (++priv->read_child_rr) + local->read_child_index = (++priv->read_child_rr) % (priv->child_count); } UNLOCK (&priv->read_child_lock); - local->cont.mknod.mode = mode; - local->cont.mknod.dev = dev; + local->op = GF_FOP_MKNOD; + local->cont.mknod.mode = mode; + local->cont.mknod.dev = dev; + local->umask = umask; if (params) - local->cont.mknod.params = dict_ref (params); - - if (loc->parent) - local->cont.mknod.parent_ino = loc->parent->ino; - - local->transaction.fop = afr_mknod_wind; - local->transaction.done = afr_mknod_done; - local->transaction.unwind = afr_mknod_unwind; - - afr_build_parent_loc (&local->transaction.parent_loc, loc); - - local->transaction.main_frame = frame; - local->transaction.basename = AFR_BASENAME (loc->path); - - afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + local->xdata_req = dict_ref (params); + + local->transaction.fop = afr_mknod_wind; + local->transaction.done = afr_mknod_done; + local->transaction.unwind = afr_mknod_unwind; + + ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, + &op_errno); + if (ret) + goto out; + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME (loc->path); + int_lock = &local->internal_lock; + + int_lock->lockee_count = 0; + ret = afr_init_entry_lockee (&int_lock->lockee[0], local, + &local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) + goto out; + + int_lock->lockee_count++; + ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (mknod, frame, op_ret, op_errno, - NULL, NULL, NULL, NULL); - } - - return 0; + if (ret < 0) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (mknod, frame, -1, op_errno, + NULL, NULL, NULL, NULL, NULL); + } + + return 0; } /* }}} */ @@ -615,42 +716,31 @@ out: int afr_mkdir_unwind (call_frame_t *frame, xlator_t *this) { - call_frame_t *main_frame = NULL; - afr_local_t *local = NULL; - - struct iatt *unwind_buf = NULL; - - local = frame->local; - - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - if (local->cont.mkdir.read_child_buf.ia_ino) { - unwind_buf = &local->cont.mkdir.read_child_buf; - } else { - unwind_buf = &local->cont.mkdir.buf; - } + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; - unwind_buf->ia_ino = local->cont.mkdir.ino; + local = frame->local; - local->cont.mkdir.preparent.ia_ino = local->cont.mkdir.parent_ino; - local->cont.mkdir.postparent.ia_ino = local->cont.mkdir.parent_ino; + LOCK (&frame->lock); + { + if (local->transaction.main_frame) { + main_frame = local->transaction.main_frame; + } + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); - AFR_STACK_UNWIND (mkdir, main_frame, + if (main_frame) { + AFR_STACK_UNWIND (mkdir, main_frame, local->op_ret, local->op_errno, - local->cont.mkdir.inode, - unwind_buf, &local->cont.mkdir.preparent, - &local->cont.mkdir.postparent); + local->cont.dir_fop.inode, + &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, + NULL); } - return 0; + return 0; } @@ -658,205 +748,169 @@ int afr_mkdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - int call_count = -1; - int child_index = -1; - - local = frame->local; - priv = this->private; - - child_index = (long) cookie; + int call_count = -1; + int child_index = -1; - LOCK (&frame->lock); - { - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); + child_index = (long) cookie; - if (op_ret != -1) { - local->op_ret = op_ret; - - if (local->success_count == 0) { - local->cont.mkdir.buf = *buf; + LOCK (&frame->lock); + { + __dir_entry_fop_common_cbk (frame, child_index, this, + op_ret, op_errno, inode, buf, + preparent, postparent, NULL, NULL); + } + UNLOCK (&frame->lock); - local->cont.mkdir.ino = - afr_itransform (buf->ia_ino, - priv->child_count, - child_index); + call_count = afr_frame_return (frame); - if (priv->read_child >= 0) { - afr_set_read_child (this, inode, - priv->read_child); - } else { - afr_set_read_child (this, inode, - local->read_child_index); - } - } + if (call_count == 0) + afr_dir_fop_done (frame, this); - if (child_index == local->first_up_child) { - local->cont.mkdir.ino = - afr_itransform (buf->ia_ino, - priv->child_count, - local->first_up_child); - } - - if (child_index == local->read_child_index) { - local->cont.mkdir.read_child_buf = *buf; - local->cont.mkdir.preparent = *preparent; - local->cont.mkdir.postparent = *postparent; - } - - local->cont.mkdir.inode = inode; + return 0; +} - local->success_count++; - } - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); +int +afr_mkdir_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int i = 0; - call_count = afr_frame_return (frame); + local = frame->local; + priv = this->private; - if (call_count == 0) { - local->transaction.unwind (frame, this); + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); - local->transaction.resume (frame, this); - } - - return 0; -} + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i]) { + STACK_WIND_COOKIE (frame, afr_mkdir_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->mkdir, + &local->loc, local->cont.mkdir.mode, + local->umask, + local->xdata_req); + if (!--call_count) + break; + } + } -int -afr_mkdir_wind (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - int call_count = -1; - int i = 0; - - local = frame->local; - priv = this->private; - - call_count = afr_up_children_count (priv->child_count, local->child_up); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_mkdir_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->mkdir, - &local->loc, local->cont.mkdir.mode, - local->cont.mkdir.params); - if (!--call_count) - break; - } - } - - return 0; + return 0; } int afr_mkdir_done (call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; + afr_local_t * local = NULL; - local = frame->local; + local = frame->local; - local->transaction.unwind (frame, this); + local->transaction.unwind (frame, this); - AFR_STACK_DESTROY (frame); + AFR_STACK_DESTROY (frame); - return 0; + return 0; } - int afr_mkdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dict_t *params) + loc_t *loc, mode_t mode, mode_t umask, dict_t *params) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - - int ret = -1; - - int op_ret = -1; - int op_errno = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); - priv = this->private; + priv = this->private; - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } + QUORUM_CHECK(mkdir,out); - ALLOC_OR_GOTO (local, afr_local_t, out); + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + op_errno = ENOMEM; + goto out; + } - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } + AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); + local = transaction_frame->local; - transaction_frame->local = local; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) + goto out; - loc_copy (&local->loc, loc); + loc_copy (&local->loc, loc); LOCK (&priv->read_child_lock); { - local->read_child_index = (++priv->read_child_rr) + local->read_child_index = (++priv->read_child_rr) % (priv->child_count); } UNLOCK (&priv->read_child_lock); - local->cont.mkdir.mode = mode; + local->cont.mkdir.mode = mode; + local->umask = umask; if (params) - local->cont.mkdir.params = dict_ref (params); - - if (loc->parent) - local->cont.mkdir.parent_ino = loc->parent->ino; - - local->transaction.fop = afr_mkdir_wind; - local->transaction.done = afr_mkdir_done; - local->transaction.unwind = afr_mkdir_unwind; - - afr_build_parent_loc (&local->transaction.parent_loc, loc); - - local->transaction.main_frame = frame; - local->transaction.basename = AFR_BASENAME (loc->path); - - afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + local->xdata_req = dict_ref (params); + + local->op = GF_FOP_MKDIR; + local->transaction.fop = afr_mkdir_wind; + local->transaction.done = afr_mkdir_done; + local->transaction.unwind = afr_mkdir_unwind; + + ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, + &op_errno); + if (ret) + goto out; + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME (loc->path); + int_lock = &local->internal_lock; + + int_lock->lockee_count = 0; + ret = afr_init_entry_lockee (&int_lock->lockee[0], local, + &local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) + goto out; + + int_lock->lockee_count++; + ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); + if (ret < 0) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (mkdir, frame, op_ret, op_errno, - NULL, NULL, NULL, NULL); - } + AFR_STACK_UNWIND (mkdir, frame, -1, op_errno, + NULL, NULL, NULL, NULL, NULL); + } - return 0; + return 0; } /* }}} */ @@ -867,233 +921,196 @@ out: int afr_link_unwind (call_frame_t *frame, xlator_t *this) { - call_frame_t *main_frame = NULL; - afr_local_t *local = NULL; - - struct iatt *unwind_buf = NULL; - - local = frame->local; - - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - if (local->cont.link.read_child_buf.ia_ino) { - unwind_buf = &local->cont.link.read_child_buf; - } else { - unwind_buf = &local->cont.link.buf; - } + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; - unwind_buf->ia_ino = local->cont.link.ino; + local = frame->local; - local->cont.link.preparent.ia_ino = local->cont.link.parent_ino; - local->cont.link.postparent.ia_ino = local->cont.link.parent_ino; + LOCK (&frame->lock); + { + if (local->transaction.main_frame) { + main_frame = local->transaction.main_frame; + } + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); - AFR_STACK_UNWIND (link, main_frame, - local->op_ret, local->op_errno, - local->cont.link.inode, - unwind_buf, &local->cont.link.preparent, - &local->cont.link.postparent); - } + if (main_frame) { + AFR_STACK_UNWIND (link, main_frame, + local->op_ret, local->op_errno, + local->cont.dir_fop.inode, + &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, + NULL); + } - return 0; + return 0; } int -afr_link_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, +afr_link_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - int call_count = -1; - int child_index = -1; + int call_count = -1; + int child_index = -1; - local = frame->local; - priv = this->private; + child_index = (long) cookie; - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); + LOCK (&frame->lock); + { + __dir_entry_fop_common_cbk (frame, child_index, this, + op_ret, op_errno, inode, buf, + preparent, postparent, NULL, NULL); + } + UNLOCK (&frame->lock); - if (op_ret != -1) { - local->op_ret = op_ret; + call_count = afr_frame_return (frame); - if (local->success_count == 0) { - local->cont.link.buf = *buf; + if (call_count == 0) + afr_dir_fop_done (frame, this); - if (priv->read_child >= 0) { - afr_set_read_child (this, inode, - priv->read_child); - } else { - afr_set_read_child (this, inode, - local->read_child_index); - } - } + return 0; +} - if (child_index == local->read_child_index) { - local->cont.link.read_child_buf = *buf; - local->cont.link.preparent = *preparent; - local->cont.link.postparent = *postparent; - } - local->cont.link.inode = inode; +int +afr_link_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int i = 0; - local->success_count++; - } + local = frame->local; + priv = this->private; - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); - call_count = afr_frame_return (frame); + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } - if (call_count == 0) { - local->transaction.unwind (frame, this); + local->call_count = call_count; - local->transaction.resume (frame, this); - } - - return 0; -} + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i]) { + STACK_WIND_COOKIE (frame, afr_link_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->link, + &local->loc, + &local->newloc, local->xdata_req); + if (!--call_count) + break; + } + } -int -afr_link_wind (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - int call_count = -1; - int i = 0; - - local = frame->local; - priv = this->private; - - call_count = afr_up_children_count (priv->child_count, local->child_up); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_link_wind_cbk, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->link, - &local->loc, - &local->newloc); - - if (!--call_count) - break; - } - } - - return 0; + return 0; } int afr_link_done (call_frame_t *frame, xlator_t *this) { - afr_local_t * local = frame->local; + afr_local_t * local = frame->local; - local->transaction.unwind (frame, this); + local->transaction.unwind (frame, this); - AFR_STACK_DESTROY (frame); + AFR_STACK_DESTROY (frame); - return 0; + return 0; } int afr_link (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc) + loc_t *oldloc, loc_t *newloc, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; - int ret = -1; + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); - int op_ret = -1; - int op_errno = 0; + priv = this->private; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + QUORUM_CHECK(link,out); - priv = this->private; - - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } - - ALLOC_OR_GOTO (local, afr_local_t, out); + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + op_errno = ENOMEM; + goto out; + } - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } + AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); + local = transaction_frame->local; - transaction_frame->local = local; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) + goto out; - loc_copy (&local->loc, oldloc); - loc_copy (&local->newloc, newloc); + loc_copy (&local->loc, oldloc); + loc_copy (&local->newloc, newloc); + if (xdata) + local->xdata_req = dict_ref (xdata); LOCK (&priv->read_child_lock); { - local->read_child_index = (++priv->read_child_rr) + local->read_child_index = (++priv->read_child_rr) % (priv->child_count); } UNLOCK (&priv->read_child_lock); - local->cont.link.ino = oldloc->inode->ino; - - if (oldloc->parent) - local->cont.link.parent_ino = newloc->parent->ino; - - local->transaction.fop = afr_link_wind; - local->transaction.done = afr_link_done; - local->transaction.unwind = afr_link_unwind; - - afr_build_parent_loc (&local->transaction.parent_loc, oldloc); - - local->transaction.main_frame = frame; - local->transaction.basename = AFR_BASENAME (oldloc->path); - local->transaction.new_basename = AFR_BASENAME (newloc->path); - - afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); - - op_ret = 0; + local->op = GF_FOP_LINK; + local->transaction.fop = afr_link_wind; + local->transaction.done = afr_link_done; + local->transaction.unwind = afr_link_unwind; + + ret = afr_build_parent_loc (&local->transaction.parent_loc, newloc, + &op_errno); + if (ret) + goto out; + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME (newloc->path); + int_lock = &local->internal_lock; + + int_lock->lockee_count = 0; + ret = afr_init_entry_lockee (&int_lock->lockee[0], local, + &local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) + goto out; + + int_lock->lockee_count++; + ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } + ret = 0; out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (link, frame, op_ret, op_errno, - NULL, NULL, NULL, NULL); - } - - return 0; + if (ret < 0) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (link, frame, -1, op_errno, + NULL, NULL, NULL, NULL, NULL); + } + + return 0; } /* }}} */ @@ -1104,246 +1121,202 @@ out: int afr_symlink_unwind (call_frame_t *frame, xlator_t *this) { - call_frame_t *main_frame = NULL; - afr_local_t *local = NULL; - - struct iatt *unwind_buf = NULL; - - local = frame->local; - - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - if (local->cont.symlink.read_child_buf.ia_ino) { - unwind_buf = &local->cont.symlink.read_child_buf; - } else { - unwind_buf = &local->cont.symlink.buf; - } + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; - unwind_buf->ia_ino = local->cont.symlink.ino; + local = frame->local; - local->cont.symlink.preparent.ia_ino = local->cont.symlink.parent_ino; - local->cont.symlink.postparent.ia_ino = local->cont.symlink.parent_ino; + LOCK (&frame->lock); + { + if (local->transaction.main_frame) { + main_frame = local->transaction.main_frame; + } + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); - AFR_STACK_UNWIND (symlink, main_frame, + if (main_frame) { + AFR_STACK_UNWIND (symlink, main_frame, local->op_ret, local->op_errno, - local->cont.symlink.inode, - unwind_buf, &local->cont.symlink.preparent, - &local->cont.symlink.postparent); + local->cont.dir_fop.inode, + &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, + NULL); } - return 0; + return 0; } int -afr_symlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, +afr_symlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - int call_count = -1; - int child_index = -1; - - local = frame->local; - priv = this->private; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - local->op_ret = op_ret; - - if (local->success_count == 0) { - local->cont.symlink.buf = *buf; - local->cont.symlink.ino = - afr_itransform (buf->ia_ino, priv->child_count, - child_index); - - if (priv->read_child >= 0) { - afr_set_read_child (this, inode, - priv->read_child); - } else { - afr_set_read_child (this, inode, - local->read_child_index); - } - } + int call_count = -1; + int child_index = -1; - if (child_index == local->first_up_child) { - local->cont.symlink.ino = - afr_itransform (buf->ia_ino, - priv->child_count, - local->first_up_child); - } - - if (child_index == local->read_child_index) { - local->cont.symlink.read_child_buf = *buf; - local->cont.symlink.preparent = *preparent; - local->cont.symlink.postparent = *postparent; - } + child_index = (long) cookie; - local->cont.symlink.inode = inode; - - local->success_count++; - } - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); + LOCK (&frame->lock); + { + __dir_entry_fop_common_cbk (frame, child_index, this, + op_ret, op_errno, inode, buf, + preparent, postparent, NULL, NULL); + } + UNLOCK (&frame->lock); - call_count = afr_frame_return (frame); + call_count = afr_frame_return (frame); - if (call_count == 0) { - local->transaction.unwind (frame, this); + if (call_count == 0) + afr_dir_fop_done (frame, this); - local->transaction.resume (frame, this); - } - - return 0; + return 0; } int afr_symlink_wind (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int i = 0; - int call_count = -1; - int i = 0; + local = frame->local; + priv = this->private; - local = frame->local; - priv = this->private; + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); - call_count = afr_up_children_count (priv->child_count, local->child_up); + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } + local->call_count = call_count; - local->call_count = call_count; + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i]) { + STACK_WIND_COOKIE (frame, afr_symlink_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->symlink, + local->cont.symlink.linkpath, + &local->loc, + local->umask, + local->xdata_req); - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_symlink_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->symlink, - local->cont.symlink.linkpath, - &local->loc, - local->cont.symlink.params); + if (!--call_count) + break; - if (!--call_count) - break; + } + } - } - } - - return 0; + return 0; } int afr_symlink_done (call_frame_t *frame, xlator_t *this) { - afr_local_t * local = frame->local; + afr_local_t * local = frame->local; + + local->transaction.unwind (frame, this); - local->transaction.unwind (frame, this); + AFR_STACK_DESTROY (frame); - AFR_STACK_DESTROY (frame); - - return 0; + return 0; } int afr_symlink (call_frame_t *frame, xlator_t *this, - const char *linkpath, loc_t *loc, dict_t *params) + const char *linkpath, loc_t *loc, mode_t umask, dict_t *params) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; - int ret = -1; + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); - int op_ret = -1; - int op_errno = 0; + priv = this->private; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + QUORUM_CHECK(symlink,out); - priv = this->private; - - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + op_errno = ENOMEM; + goto out; + } - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); + local = transaction_frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) + goto out; - transaction_frame->local = local; - - loc_copy (&local->loc, loc); + loc_copy (&local->loc, loc); LOCK (&priv->read_child_lock); { - local->read_child_index = (++priv->read_child_rr) + local->read_child_index = (++priv->read_child_rr) % (priv->child_count); } UNLOCK (&priv->read_child_lock); - local->cont.symlink.linkpath = gf_strdup (linkpath); + local->cont.symlink.linkpath = gf_strdup (linkpath); + local->umask = umask; if (params) - local->cont.symlink.params = dict_ref (params); - - if (loc->parent) - local->cont.symlink.parent_ino = loc->parent->ino; - - local->transaction.fop = afr_symlink_wind; - local->transaction.done = afr_symlink_done; - local->transaction.unwind = afr_symlink_unwind; - - afr_build_parent_loc (&local->transaction.parent_loc, loc); - - local->transaction.main_frame = frame; - local->transaction.basename = AFR_BASENAME (loc->path); - - afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + local->xdata_req = dict_ref (params); + + local->op = GF_FOP_SYMLINK; + local->transaction.fop = afr_symlink_wind; + local->transaction.done = afr_symlink_done; + local->transaction.unwind = afr_symlink_unwind; + + ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, + &op_errno); + if (ret) + goto out; + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME (loc->path); + int_lock = &local->internal_lock; + + int_lock->lockee_count = 0; + ret = afr_init_entry_lockee (&int_lock->lockee[0], local, + &local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) + goto out; + + int_lock->lockee_count++; + ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (symlink, frame, op_ret, op_errno, - NULL, NULL, NULL, NULL); - } - - return 0; + if (ret < 0) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (symlink, frame, -1, op_errno, + NULL, NULL, NULL, NULL, NULL); + } + + return 0; } /* }}} */ @@ -1353,228 +1326,231 @@ out: int afr_rename_unwind (call_frame_t *frame, xlator_t *this) { - call_frame_t *main_frame = NULL; - afr_local_t *local = NULL; - - struct iatt *unwind_buf = NULL; - - local = frame->local; - - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - if (local->cont.rename.read_child_buf.ia_ino) { - unwind_buf = &local->cont.rename.read_child_buf; - } else { - unwind_buf = &local->cont.rename.buf; - } + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; - unwind_buf->ia_ino = local->cont.rename.ino; + local = frame->local; - local->cont.rename.preoldparent.ia_ino = local->cont.rename.oldparent_ino; - local->cont.rename.postoldparent.ia_ino = local->cont.rename.oldparent_ino; - local->cont.rename.prenewparent.ia_ino = local->cont.rename.newparent_ino; - local->cont.rename.postnewparent.ia_ino = local->cont.rename.newparent_ino; + LOCK (&frame->lock); + { + if (local->transaction.main_frame) { + main_frame = local->transaction.main_frame; + } + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); - AFR_STACK_UNWIND (rename, main_frame, - local->op_ret, local->op_errno, - unwind_buf, - &local->cont.rename.preoldparent, - &local->cont.rename.postoldparent, - &local->cont.rename.prenewparent, - &local->cont.rename.postnewparent); - } + if (main_frame) { + AFR_STACK_UNWIND (rename, main_frame, + local->op_ret, local->op_errno, + &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, + &local->cont.dir_fop.prenewparent, + &local->cont.dir_fop.postnewparent, + NULL); + } - return 0; + return 0; } int -afr_rename_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf, +afr_rename_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, struct iatt *preoldparent, struct iatt *postoldparent, - struct iatt *prenewparent, struct iatt *postnewparent) + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) { - afr_local_t * local = NULL; - - int call_count = -1; - int child_index = -1; + afr_local_t * local = NULL; + int call_count = -1; + int child_index = -1; - local = frame->local; + local = frame->local; - child_index = (long) cookie; + child_index = (long) cookie; - LOCK (&frame->lock); - { - if (afr_fop_failed (op_ret, op_errno) && op_errno != ENOTEMPTY) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - - if (buf) { - local->cont.rename.buf = *buf; - } - - local->success_count++; - } + LOCK (&frame->lock); + { + if (afr_fop_failed (op_ret, op_errno) && op_errno != ENOTEMPTY) + afr_transaction_fop_failed (frame, this, child_index); + local->op_errno = op_errno; + local->child_errno[child_index] = op_errno; - if (child_index == local->read_child_index) { - local->cont.rename.read_child_buf = *buf; + if (op_ret > -1) + __dir_entry_fop_common_cbk (frame, child_index, this, + op_ret, op_errno, NULL, buf, + preoldparent, postoldparent, + prenewparent, postnewparent); - local->cont.rename.preoldparent = *preoldparent; - local->cont.rename.postoldparent = *postoldparent; - local->cont.rename.prenewparent = *prenewparent; - local->cont.rename.postnewparent = *postnewparent; - } - } + } + UNLOCK (&frame->lock); - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); + call_count = afr_frame_return (frame); - call_count = afr_frame_return (frame); + if (call_count == 0) + afr_dir_fop_done (frame, this); - if (call_count == 0) { - local->transaction.unwind (frame, this); - local->transaction.resume (frame, this); - } - - return 0; + return 0; } int32_t afr_rename_wind (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - int call_count = -1; - int i = 0; - - local = frame->local; - priv = this->private; - - call_count = afr_up_children_count (priv->child_count, local->child_up); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_rename_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->rename, - &local->loc, - &local->newloc); - if (!--call_count) - break; - } - } - - return 0; -} + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int i = 0; + local = frame->local; + priv = this->private; -int -afr_rename_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = frame->local; + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } - local->transaction.unwind (frame, this); + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i]) { + STACK_WIND_COOKIE (frame, afr_rename_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->rename, + &local->loc, + &local->newloc, NULL); + if (!--call_count) + break; + } + } - AFR_STACK_DESTROY (frame); - - return 0; + return 0; } int -afr_rename (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc) +afr_rename_done (call_frame_t *frame, xlator_t *this) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - - int ret = -1; - - int op_ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } - - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - transaction_frame->local = local; + afr_local_t * local = frame->local; - loc_copy (&local->loc, oldloc); - loc_copy (&local->newloc, newloc); + local->transaction.unwind (frame, this); - local->read_child_index = afr_read_child (this, oldloc->inode); + AFR_STACK_DESTROY (frame); - local->cont.rename.ino = oldloc->inode->ino; - - if (oldloc->parent) - local->cont.rename.oldparent_ino = oldloc->parent->ino; - if (newloc->parent) - local->cont.rename.newparent_ino = newloc->parent->ino; - - local->transaction.fop = afr_rename_wind; - local->transaction.done = afr_rename_done; - local->transaction.unwind = afr_rename_unwind; + return 0; +} - afr_build_parent_loc (&local->transaction.parent_loc, oldloc); - afr_build_parent_loc (&local->transaction.new_parent_loc, newloc); - local->transaction.main_frame = frame; - local->transaction.basename = AFR_BASENAME (oldloc->path); - local->transaction.new_basename = AFR_BASENAME (newloc->path); +int +afr_rename (call_frame_t *frame, xlator_t *this, + loc_t *oldloc, loc_t *newloc, dict_t *xdata) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; + int nlockee = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + QUORUM_CHECK(rename,out); + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + op_errno = ENOMEM; + goto out; + } - afr_transaction (transaction_frame, this, AFR_ENTRY_RENAME_TRANSACTION); + AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); + local = transaction_frame->local; + + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) + goto out; + + loc_copy (&local->loc, oldloc); + loc_copy (&local->newloc, newloc); + + local->read_child_index = afr_inode_get_read_ctx (this, oldloc->inode, NULL); + + local->op = GF_FOP_RENAME; + local->transaction.fop = afr_rename_wind; + local->transaction.done = afr_rename_done; + local->transaction.unwind = afr_rename_unwind; + + ret = afr_build_parent_loc (&local->transaction.parent_loc, oldloc, + &op_errno); + if (ret) + goto out; + ret = afr_build_parent_loc (&local->transaction.new_parent_loc, newloc, + &op_errno); + if (ret) + goto out; + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME (oldloc->path); + local->transaction.new_basename = AFR_BASENAME (newloc->path); + int_lock = &local->internal_lock; + + int_lock->lockee_count = nlockee = 0; + ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local, + &local->transaction.new_parent_loc, + local->transaction.new_basename, + priv->child_count); + if (ret) + goto out; + + nlockee++; + ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local, + &local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) + goto out; + + nlockee++; + if (local->newloc.inode && IA_ISDIR (local->newloc.inode->ia_type)) { + ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local, + &local->newloc, + NULL, + priv->child_count); + if (ret) + goto out; + + nlockee++; + } + qsort (int_lock->lockee, nlockee, sizeof (*int_lock->lockee), + afr_entry_lockee_cmp); + int_lock->lockee_count = nlockee; + + ret = afr_transaction (transaction_frame, this, AFR_ENTRY_RENAME_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); + if (ret < 0) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (rename, frame, op_ret, op_errno, - NULL, NULL, NULL, NULL, NULL); - } + AFR_STACK_UNWIND (rename, frame, -1, op_errno, + NULL, NULL, NULL, NULL, NULL, NULL); + } - return 0; + return 0; } /* }}} */ @@ -1584,201 +1560,190 @@ out: int afr_unlink_unwind (call_frame_t *frame, xlator_t *this) { - call_frame_t *main_frame = NULL; - afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; - local = frame->local; + local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - local->cont.unlink.preparent.ia_ino = local->cont.unlink.parent_ino; - local->cont.unlink.postparent.ia_ino = local->cont.unlink.parent_ino; + LOCK (&frame->lock); + { + if (local->transaction.main_frame) { + main_frame = local->transaction.main_frame; + } + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); - AFR_STACK_UNWIND (unlink, main_frame, + if (main_frame) { + AFR_STACK_UNWIND (unlink, main_frame, local->op_ret, local->op_errno, - &local->cont.unlink.preparent, - &local->cont.unlink.postparent); + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, + NULL); } - return 0; + return 0; } int -afr_unlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *preparent, - struct iatt *postparent) +afr_unlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - int call_count = -1; - int child_index = (long) cookie; + afr_local_t * local = NULL; + int call_count = -1; + int child_index = (long) cookie; - local = frame->local; - priv = this->private; + local = frame->local; - LOCK (&frame->lock); - { + LOCK (&frame->lock); + { if (child_index == local->read_child_index) { local->read_child_returned = _gf_true; } + __dir_entry_fop_common_cbk (frame, child_index, this, + op_ret, op_errno, NULL, NULL, + preparent, postparent, NULL, NULL); + } + UNLOCK (&frame->lock); - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); + call_count = afr_frame_return (frame); + if (call_count == 0) + afr_dir_fop_done (frame, this); - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.unlink.preparent = *preparent; - local->cont.unlink.postparent = *postparent; - } + return 0; +} - if (child_index == local->read_child_index) { - local->cont.unlink.preparent = *preparent; - local->cont.unlink.postparent = *postparent; - } - local->success_count++; - } +int32_t +afr_unlink_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int i = 0; - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); + local = frame->local; + priv = this->private; - call_count = afr_frame_return (frame); + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); - if (call_count == 0) { - local->transaction.unwind (frame, this); + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } - local->transaction.resume (frame, this); - } - - return 0; -} + local->call_count = call_count; + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i]) { + STACK_WIND_COOKIE (frame, afr_unlink_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->unlink, + &local->loc, local->xflag, + local->xdata_req); -int32_t -afr_unlink_wind (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - int call_count = -1; - int i = 0; - - local = frame->local; - priv = this->private; - - call_count = afr_up_children_count (priv->child_count, local->child_up); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_unlink_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->unlink, - &local->loc); - - if (!--call_count) - break; - } - } - - return 0; + if (!--call_count) + break; + } + } + + return 0; } int32_t afr_unlink_done (call_frame_t *frame, xlator_t *this) { - afr_local_t * local = frame->local; + afr_local_t * local = frame->local; - local->transaction.unwind (frame, this); + local->transaction.unwind (frame, this); - AFR_STACK_DESTROY (frame); - - return 0; + AFR_STACK_DESTROY (frame); + + return 0; } int32_t afr_unlink (call_frame_t *frame, xlator_t *this, - loc_t *loc) + loc_t *loc, int xflag, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - - int ret = -1; - - int op_ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } - - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - transaction_frame->local = local; - - loc_copy (&local->loc, loc); + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; - if (loc->parent) - local->cont.unlink.parent_ino = loc->parent->ino; + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); - local->transaction.fop = afr_unlink_wind; - local->transaction.done = afr_unlink_done; - local->transaction.unwind = afr_unlink_unwind; + priv = this->private; - afr_build_parent_loc (&local->transaction.parent_loc, loc); + QUORUM_CHECK(unlink,out); - local->transaction.main_frame = frame; - local->transaction.basename = AFR_BASENAME (loc->path); + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + op_errno = ENOMEM; + goto out; + } - afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); + local = transaction_frame->local; + + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) + goto out; + + loc_copy (&local->loc, loc); + local->xflag = xflag; + if (xdata) + local->xdata_req = dict_ref (xdata); + + local->op = GF_FOP_UNLINK; + local->transaction.fop = afr_unlink_wind; + local->transaction.done = afr_unlink_done; + local->transaction.unwind = afr_unlink_unwind; + + ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, + &op_errno); + if (ret) + goto out; + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME (loc->path); + int_lock = &local->internal_lock; + + int_lock->lockee_count = 0; + ret = afr_init_entry_lockee (&int_lock->lockee[0], local, + &local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) + goto out; + + int_lock->lockee_count++; + ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (unlink, frame, op_ret, op_errno, - NULL, NULL); - } - - return 0; + if (ret < 0) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (unlink, frame, -1, op_errno, + NULL, NULL, NULL); + } + + return 0; } /* }}} */ @@ -1790,204 +1755,208 @@ out: int afr_rmdir_unwind (call_frame_t *frame, xlator_t *this) { - call_frame_t *main_frame = NULL; - afr_local_t *local = NULL; - - local = frame->local; + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); + local = frame->local; - if (main_frame) { - local->cont.rmdir.preparent.ia_ino = local->cont.rmdir.parent_ino; - local->cont.rmdir.postparent.ia_ino = local->cont.rmdir.parent_ino; + LOCK (&frame->lock); + { + if (local->transaction.main_frame) { + main_frame = local->transaction.main_frame; + } + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); - AFR_STACK_UNWIND (rmdir, main_frame, + if (main_frame) { + AFR_STACK_UNWIND (rmdir, main_frame, local->op_ret, local->op_errno, - &local->cont.rmdir.preparent, - &local->cont.rmdir.postparent); + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, + NULL); } - return 0; + return 0; } int -afr_rmdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *preparent, - struct iatt *postparent) +afr_rmdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - int call_count = -1; - int child_index = (long) cookie; + afr_local_t * local = NULL; + int call_count = -1; + int child_index = (long) cookie; int read_child = 0; - local = frame->local; - priv = this->private; + local = frame->local; - LOCK (&frame->lock); - { + LOCK (&frame->lock); + { if (child_index == read_child) { local->read_child_returned = _gf_true; } + if (afr_fop_failed (op_ret, op_errno) && (op_errno != ENOTEMPTY)) + afr_transaction_fop_failed (frame, this, child_index); + local->op_errno = op_errno; + local->child_errno[child_index] = op_errno; + if (op_ret > -1) + __dir_entry_fop_common_cbk (frame, child_index, this, + op_ret, op_errno, NULL, NULL, + preparent, postparent, NULL, + NULL); - if (afr_fop_failed (op_ret, op_errno) && (op_errno != ENOTEMPTY)) - afr_transaction_fop_failed (frame, this, child_index); + } + UNLOCK (&frame->lock); - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.rmdir.preparent = *preparent; - local->cont.rmdir.postparent = *postparent; + call_count = afr_frame_return (frame); + if (call_count == 0) + afr_dir_fop_done (frame, this); - } + return 0; +} - if (child_index == read_child) { - local->cont.rmdir.preparent = *preparent; - local->cont.rmdir.postparent = *postparent; - } - local->success_count++; - } +int +afr_rmdir_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int i = 0; - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); + local = frame->local; + priv = this->private; - call_count = afr_frame_return (frame); + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); - if (call_count == 0) { - local->transaction.unwind (frame, this); - local->transaction.resume (frame, this); - } - - return 0; -} + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + local->call_count = call_count; -int -afr_rmdir_wind (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - int call_count = -1; - int i = 0; - - local = frame->local; - priv = this->private; - - call_count = afr_up_children_count (priv->child_count, local->child_up); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_rmdir_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->rmdir, - &local->loc, local->cont.rmdir.flags); - - if (!--call_count) - break; - } - } - - return 0; + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i]) { + STACK_WIND_COOKIE (frame, afr_rmdir_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->rmdir, + &local->loc, local->cont.rmdir.flags, + NULL); + + if (!--call_count) + break; + } + } + + return 0; } int afr_rmdir_done (call_frame_t *frame, xlator_t *this) { - afr_local_t * local = frame->local; + afr_local_t * local = frame->local; + + local->transaction.unwind (frame, this); - local->transaction.unwind (frame, this); + AFR_STACK_DESTROY (frame); - AFR_STACK_DESTROY (frame); - - return 0; + return 0; } int afr_rmdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, int flags) + loc_t *loc, int flags, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - - int ret = -1; - - int op_ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } - - ALLOC_OR_GOTO (local, afr_local_t, out); + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; + int nlockee = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + QUORUM_CHECK(rmdir,out); + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + op_errno = ENOMEM; + goto out; + } - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } + AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); + local = transaction_frame->local; - transaction_frame->local = local; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) + goto out; local->cont.rmdir.flags = flags; - loc_copy (&local->loc, loc); - - if (loc->parent) - local->cont.rmdir.parent_ino = loc->parent->ino; - - local->transaction.fop = afr_rmdir_wind; - local->transaction.done = afr_rmdir_done; - local->transaction.unwind = afr_rmdir_unwind; - - afr_build_parent_loc (&local->transaction.parent_loc, loc); - - local->transaction.main_frame = frame; - local->transaction.basename = AFR_BASENAME (loc->path); - - afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + loc_copy (&local->loc, loc); + + local->op = GF_FOP_RMDIR; + local->transaction.fop = afr_rmdir_wind; + local->transaction.done = afr_rmdir_done; + local->transaction.unwind = afr_rmdir_unwind; + + ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, + &op_errno); + if (ret) + goto out; + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME (loc->path); + int_lock = &local->internal_lock; + + int_lock->lockee_count = nlockee = 0; + ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local, + &local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) + goto out; + + nlockee++; + ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local, + &local->loc, + NULL, + priv->child_count); + if (ret) + goto out; + + nlockee++; + qsort (int_lock->lockee, nlockee, sizeof (*int_lock->lockee), + afr_entry_lockee_cmp); + int_lock->lockee_count = nlockee; + + ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (rmdir, frame, op_ret, op_errno, - NULL, NULL); - } - - return 0; + if (ret < 0) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (rmdir, frame, -1, op_errno, NULL, NULL, NULL); + } + + return 0; } /* }}} */ - diff --git a/xlators/cluster/afr/src/afr-dir-write.h b/xlators/cluster/afr/src/afr-dir-write.h index 4f180857e..02f0a3682 100644 --- a/xlators/cluster/afr/src/afr-dir-write.h +++ b/xlators/cluster/afr/src/afr-dir-write.h @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2009 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef __DIR_WRITE_H__ @@ -23,38 +14,34 @@ int32_t afr_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, mode_t mode, - fd_t *fd, dict_t *params); + mode_t umask, fd_t *fd, dict_t *xdata); int32_t afr_mknod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dev_t dev, dict_t *params); + loc_t *loc, mode_t mode, dev_t dev, mode_t umask, dict_t *xdata); int32_t afr_mkdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dict_t *params); + loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata); int32_t afr_unlink (call_frame_t *frame, xlator_t *this, - loc_t *loc); + loc_t *loc, int xflag, dict_t *xdata); int32_t afr_rmdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, int flags); + loc_t *loc, int flags, dict_t *xdata); int32_t afr_link (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc); + loc_t *oldloc, loc_t *newloc, dict_t *xdata); int32_t afr_rename (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc); + loc_t *oldloc, loc_t *newloc, dict_t *xdata); int afr_symlink (call_frame_t *frame, xlator_t *this, - const char *linkpath, loc_t *oldloc, dict_t *params); - -int32_t -afr_setdents (call_frame_t *frame, xlator_t *this, - fd_t *fd, int32_t flags, dir_entry_t *entries, int32_t count); + const char *linkpath, loc_t *oldloc, mode_t umask, dict_t *params); #endif /* __DIR_WRITE_H__ */ diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c index 68ac78731..e06e3b2f2 100644 --- a/xlators/cluster/afr/src/afr-inode-read.c +++ b/xlators/cluster/afr/src/afr-inode-read.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2009 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ @@ -44,9 +35,6 @@ #include "compat-errno.h" #include "compat.h" -#include "afr.h" - - /** * Common algorithm for inode read calls: * @@ -61,114 +49,115 @@ int32_t afr_access_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) + xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; - - int unwind = 1; - int last_tried = -1; - int this_try = -1; - int read_child = -1; + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + xlator_t ** children = NULL; + int unwind = 1; + int32_t *last_index = NULL; + int32_t next_call_child = -1; + int32_t read_child = -1; + int32_t *fresh_children = NULL; - priv = this->private; - children = priv->children; + priv = this->private; + children = priv->children; - local = frame->local; + local = frame->local; read_child = (long) cookie; - if (op_ret == -1) { - retry: - last_tried = local->cont.access.last_tried; - - if (all_tried (last_tried, priv->child_count)) { - goto out; - } - this_try = ++local->cont.access.last_tried; - - if (this_try == read_child) { - goto retry; - } + if (op_ret == -1) { + last_index = &local->cont.access.last_index; + fresh_children = local->fresh_children; + next_call_child = afr_next_call_child (fresh_children, + local->child_up, + priv->child_count, + last_index, read_child); + if (next_call_child < 0) + goto out; - unwind = 0; + unwind = 0; - STACK_WIND_COOKIE (frame, afr_access_cbk, - (void *) (long) read_child, - children[this_try], - children[this_try]->fops->access, - &local->loc, local->cont.access.mask); - } + STACK_WIND_COOKIE (frame, afr_access_cbk, + (void *) (long) read_child, + children[next_call_child], + children[next_call_child]->fops->access, + &local->loc, local->cont.access.mask, + NULL); + } out: - if (unwind) { - AFR_STACK_UNWIND (access, frame, op_ret, op_errno); - } + if (unwind) { + AFR_STACK_UNWIND (access, frame, op_ret, op_errno, xdata); + } - return 0; + return 0; } int32_t -afr_access (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t mask) +afr_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask, + dict_t *xdata) { - afr_private_t * priv = NULL; - xlator_t ** children = NULL; - int call_child = 0; - afr_local_t *local = NULL; - + afr_private_t *priv = NULL; + xlator_t **children = NULL; + int call_child = 0; + afr_local_t *local = NULL; + int32_t op_errno = 0; int32_t read_child = -1; + int ret = -1; + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); - int32_t op_ret = -1; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + priv = this->private; + VALIDATE_OR_GOTO (priv->children, out); - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); + children = priv->children; - children = priv->children; + AFR_SBRAIN_CHECK_LOC (loc, out); - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; - read_child = afr_read_child (this, loc->inode); + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) + goto out; - if ((read_child >= 0) && (priv->child_up[read_child])) { - call_child = read_child; + local->fresh_children = afr_children_create (priv->child_count); + if (!local->fresh_children) { + op_errno = ENOMEM; + goto out; + } - local->cont.access.last_tried = -1; - } else { - call_child = afr_first_up_child (priv); - if (call_child == -1) { - op_errno = ENOTCONN; - gf_log (this->name, GF_LOG_DEBUG, - "no child is up"); - goto out; - } - - local->cont.access.last_tried = call_child; + read_child = afr_inode_get_read_ctx (this, loc->inode, + local->fresh_children); + ret = afr_get_call_child (this, local->child_up, read_child, + local->fresh_children, + &call_child, + &local->cont.access.last_index); + if (ret < 0) { + op_errno = -ret; + goto out; } - loc_copy (&local->loc, loc); - local->cont.access.mask = mask; + loc_copy (&local->loc, loc); + local->cont.access.mask = mask; - STACK_WIND_COOKIE (frame, afr_access_cbk, - (void *) (long) call_child, - children[call_child], children[call_child]->fops->access, - loc, mask); + STACK_WIND_COOKIE (frame, afr_access_cbk, + (void *) (long) call_child, + children[call_child], + children[call_child]->fops->access, + loc, mask, xdata); - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (access, frame, op_ret, op_errno); - } - return 0; + if (ret < 0) + AFR_STACK_UNWIND (access, frame, -1, op_errno, NULL); + return 0; } @@ -178,121 +167,111 @@ out: int32_t afr_stat_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iatt *buf) + xlator_t *this, int32_t op_ret, int32_t op_errno, + struct iatt *buf, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; - - int unwind = 1; - int last_tried = -1; - int this_try = -1; - int read_child = -1; - - priv = this->private; - children = priv->children; - - read_child = (long) cookie; - - local = frame->local; + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + xlator_t ** children = NULL; + int unwind = 1; + int32_t *last_index = NULL; + int32_t next_call_child = -1; + int32_t read_child = -1; + int32_t *fresh_children = NULL; + + priv = this->private; + children = priv->children; - if (op_ret == -1) { - retry: - last_tried = local->cont.stat.last_tried; + read_child = (long) cookie; - if (all_tried (last_tried, priv->child_count)) { - goto out; - } - this_try = ++local->cont.stat.last_tried; + local = frame->local; - if (this_try == read_child) { - goto retry; - } + if (op_ret == -1) { + last_index = &local->cont.stat.last_index; + fresh_children = local->fresh_children; + next_call_child = afr_next_call_child (fresh_children, + local->child_up, + priv->child_count, + last_index, read_child); + if (next_call_child < 0) + goto out; - unwind = 0; + unwind = 0; - STACK_WIND_COOKIE (frame, afr_stat_cbk, - (void *) (long) read_child, - children[this_try], - children[this_try]->fops->stat, - &local->loc); - } + STACK_WIND_COOKIE (frame, afr_stat_cbk, + (void *) (long) read_child, + children[next_call_child], + children[next_call_child]->fops->stat, + &local->loc, NULL); + } out: - if (unwind) { - if (buf) - buf->ia_ino = local->cont.stat.ino; - - AFR_STACK_UNWIND (stat, frame, op_ret, op_errno, buf); - } + if (unwind) { + AFR_STACK_UNWIND (stat, frame, op_ret, op_errno, buf, xdata); + } - return 0; + return 0; } int32_t -afr_stat (call_frame_t *frame, xlator_t *this, - loc_t *loc) +afr_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; - + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + xlator_t **children = NULL; + int call_child = 0; + int32_t op_errno = 0; int32_t read_child = -1; - int call_child = 0; - - int32_t op_ret = -1; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); - - children = priv->children; - - ALLOC_OR_GOTO (local, afr_local_t, out); + int ret = -1; - frame->local = local; + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); - read_child = afr_read_child (this, loc->inode); + priv = this->private; + VALIDATE_OR_GOTO (priv->children, out); - if ((read_child >= 0) && (priv->child_up[read_child])) { - call_child = read_child; + children = priv->children; - local->cont.stat.last_tried = -1; + AFR_SBRAIN_CHECK_LOC (loc, out); - } else { - call_child = afr_first_up_child (priv); - if (call_child == -1) { - op_errno = ENOTCONN; - gf_log (this->name, GF_LOG_DEBUG, - "no child is up"); - goto out; - } + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; - local->cont.stat.last_tried = call_child; - } + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) + goto out; - loc_copy (&local->loc, loc); + local->fresh_children = afr_children_create (priv->child_count); + if (!local->fresh_children) { + op_errno = ENOMEM; + goto out; + } - local->cont.stat.ino = loc->inode->ino; + read_child = afr_inode_get_read_ctx (this, loc->inode, + local->fresh_children); + ret = afr_get_call_child (this, local->child_up, read_child, + local->fresh_children, + &call_child, + &local->cont.stat.last_index); + if (ret < 0) { + op_errno = -ret; + goto out; + } + loc_copy (&local->loc, loc); - STACK_WIND_COOKIE (frame, afr_stat_cbk, (void *) (long) call_child, - children[call_child], - children[call_child]->fops->stat, - loc); + STACK_WIND_COOKIE (frame, afr_stat_cbk, (void *) (long) call_child, + children[call_child], + children[call_child]->fops->stat, + loc, xdata); - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (stat, frame, op_ret, op_errno, NULL); - } + if (ret < 0) + AFR_STACK_UNWIND (stat, frame, -1, op_errno, NULL, NULL); - return 0; + return 0; } @@ -301,124 +280,122 @@ out: /* {{{ fstat */ int32_t -afr_fstat_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iatt *buf) +afr_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; - - int unwind = 1; - int last_tried = -1; - int this_try = -1; - int read_child = -1; - - priv = this->private; - children = priv->children; - - local = frame->local; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + xlator_t **children = NULL; + int unwind = 1; + int32_t *last_index = NULL; + int32_t next_call_child = -1; + int32_t read_child = -1; + int32_t *fresh_children = NULL; - read_child = (long) cookie; + priv = this->private; + children = priv->children; - if (op_ret == -1) { - retry: - last_tried = local->cont.fstat.last_tried; + local = frame->local; - if (all_tried (last_tried, priv->child_count)) { - goto out; - } - this_try = ++local->cont.fstat.last_tried; + read_child = (long) cookie; - if (this_try == read_child) { - goto retry; - } + if (op_ret == -1) { + last_index = &local->cont.fstat.last_index; + fresh_children = local->fresh_children; + next_call_child = afr_next_call_child (fresh_children, + local->child_up, + priv->child_count, + last_index, read_child); + if (next_call_child < 0) + goto out; - unwind = 0; + unwind = 0; - STACK_WIND_COOKIE (frame, afr_fstat_cbk, - (void *) (long) read_child, - children[this_try], - children[this_try]->fops->fstat, - local->fd); - } + STACK_WIND_COOKIE (frame, afr_fstat_cbk, + (void *) (long) read_child, + children[next_call_child], + children[next_call_child]->fops->fstat, + local->fd, NULL); + } out: - if (unwind) { - if (buf) - buf->ia_ino = local->cont.fstat.ino; - - AFR_STACK_UNWIND (fstat, frame, op_ret, op_errno, buf); - } + if (unwind) { + AFR_STACK_UNWIND (fstat, frame, op_ret, op_errno, buf, xdata); + } - return 0; + return 0; } int32_t afr_fstat (call_frame_t *frame, xlator_t *this, - fd_t *fd) + fd_t *fd, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + xlator_t **children = NULL; + int call_child = 0; + int32_t op_errno = 0; + int32_t read_child = 0; + int ret = -1; - int call_child = 0; - int32_t read_child = -1; - - int32_t op_ret = -1; - int32_t op_errno = 0; + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (fd, out); - VALIDATE_OR_GOTO (this->private, out); + priv = this->private; + VALIDATE_OR_GOTO (priv->children, out); - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); + children = priv->children; - children = priv->children; + VALIDATE_OR_GOTO (fd->inode, out); - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_SBRAIN_CHECK_FD (fd, out); - frame->local = local; + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; - VALIDATE_OR_GOTO (fd->inode, out); + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) + goto out; - read_child = afr_read_child (this, fd->inode); + local->fresh_children = afr_children_create (priv->child_count); + if (!local->fresh_children) { + op_errno = ENOMEM; + goto out; + } - if ((read_child >= 0) && (priv->child_up[read_child])) { - call_child = read_child; + read_child = afr_inode_get_read_ctx (this, fd->inode, + local->fresh_children); - local->cont.fstat.last_tried = -1; - } else { - call_child = afr_first_up_child (priv); - if (call_child == -1) { - op_errno = ENOTCONN; - gf_log (this->name, GF_LOG_DEBUG, - "no child is up"); - goto out; - } - local->cont.fstat.last_tried = call_child; + ret = afr_get_call_child (this, local->child_up, read_child, + local->fresh_children, + &call_child, + &local->cont.fstat.last_index); + if (ret < 0) { + op_errno = -ret; + goto out; } - local->cont.fstat.ino = fd->inode->ino; - local->fd = fd_ref (fd); + local->fd = fd_ref (fd); + + afr_open_fd_fix (fd, this); - STACK_WIND_COOKIE (frame, afr_fstat_cbk, (void *) (long) call_child, - children[call_child], - children[call_child]->fops->fstat, - fd); + STACK_WIND_COOKIE (frame, afr_fstat_cbk, (void *) (long) call_child, + children[call_child], + children[call_child]->fops->fstat, + fd, xdata); - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (fstat, frame, op_ret, op_errno, NULL); - } + if (ret < 0) + AFR_STACK_UNWIND (fstat, frame, -1, op_errno, NULL, NULL); - return 0; + return 0; } /* }}} */ @@ -427,122 +404,115 @@ out: int32_t afr_readlink_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - const char *buf, struct iatt *sbuf) + xlator_t *this, int32_t op_ret, int32_t op_errno, + const char *buf, struct iatt *sbuf, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; - - int unwind = 1; - int last_tried = -1; - int this_try = -1; - int read_child = -1; + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + xlator_t ** children = NULL; + int unwind = 1; + int32_t *last_index = NULL; + int32_t next_call_child = -1; + int32_t read_child = -1; + int32_t *fresh_children = NULL; - priv = this->private; - children = priv->children; + priv = this->private; + children = priv->children; - local = frame->local; + local = frame->local; read_child = (long) cookie; - if (op_ret == -1) { - retry: - last_tried = local->cont.readlink.last_tried; - - if (all_tried (last_tried, priv->child_count)) { - goto out; - } - this_try = ++local->cont.readlink.last_tried; - - if (this_try == read_child) { - goto retry; - } + if (op_ret == -1) { + last_index = &local->cont.readlink.last_index; + fresh_children = local->fresh_children; + next_call_child = afr_next_call_child (fresh_children, + local->child_up, + priv->child_count, + last_index, read_child); + if (next_call_child < 0) + goto out; - unwind = 0; - STACK_WIND_COOKIE (frame, afr_readlink_cbk, - (void *) (long) read_child, - children[this_try], - children[this_try]->fops->readlink, - &local->loc, - local->cont.readlink.size); - } + unwind = 0; + STACK_WIND_COOKIE (frame, afr_readlink_cbk, + (void *) (long) read_child, + children[next_call_child], + children[next_call_child]->fops->readlink, + &local->loc, + local->cont.readlink.size, NULL); + } out: - if (unwind) { - if (sbuf) - sbuf->ia_ino = local->cont.readlink.ino; - - AFR_STACK_UNWIND (readlink, frame, op_ret, op_errno, buf, sbuf); - } + if (unwind) { + AFR_STACK_UNWIND (readlink, frame, op_ret, op_errno, buf, sbuf, + xdata); + } - return 0; + return 0; } int32_t afr_readlink (call_frame_t *frame, xlator_t *this, - loc_t *loc, size_t size) + loc_t *loc, size_t size, dict_t *xdata) { - afr_private_t * priv = NULL; - xlator_t ** children = NULL; - int call_child = 0; - afr_local_t *local = NULL; - + afr_private_t *priv = NULL; + xlator_t **children = NULL; + int call_child = 0; + afr_local_t *local = NULL; + int32_t op_errno = 0; int32_t read_child = -1; + int ret = -1; - int32_t op_ret = -1; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); - - children = priv->children; + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); - ALLOC_OR_GOTO (local, afr_local_t, out); + priv = this->private; + VALIDATE_OR_GOTO (priv->children, out); - frame->local = local; + children = priv->children; - read_child = afr_read_child (this, loc->inode); + AFR_SBRAIN_CHECK_LOC (loc, out); - if ((read_child >= 0) && (priv->child_up[read_child])) { - call_child = read_child; + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; - local->cont.readlink.last_tried = -1; - - } else { - call_child = afr_first_up_child (priv); - - if (call_child == -1) { - op_errno = ENOTCONN; - gf_log (this->name, GF_LOG_DEBUG, - "no child is up"); - goto out; - } + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) + goto out; - local->cont.readlink.last_tried = call_child; + local->fresh_children = afr_children_create (priv->child_count); + if (!local->fresh_children) { + op_errno = ENOMEM; + goto out; + } + read_child = afr_inode_get_read_ctx (this, loc->inode, + local->fresh_children); + ret = afr_get_call_child (this, local->child_up, read_child, + local->fresh_children, + &call_child, + &local->cont.readlink.last_index); + if (ret < 0) { + op_errno = -ret; + goto out; } - loc_copy (&local->loc, loc); + loc_copy (&local->loc, loc); - local->cont.readlink.size = size; - local->cont.readlink.ino = loc->inode->ino; + local->cont.readlink.size = size; - STACK_WIND_COOKIE (frame, afr_readlink_cbk, - (void *) (long) call_child, - children[call_child], children[call_child]->fops->readlink, - loc, size); + STACK_WIND_COOKIE (frame, afr_readlink_cbk, + (void *) (long) call_child, + children[call_child], + children[call_child]->fops->readlink, + loc, size, xdata); - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (readlink, frame, op_ret, op_errno, NULL, NULL); - } - return 0; + if (ret < 0) + AFR_STACK_UNWIND (readlink, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } @@ -556,7 +526,7 @@ struct _xattr_key { }; -void +int __gather_xattr_keys (dict_t *dict, char *key, data_t *value, void *data) { @@ -568,23 +538,23 @@ __gather_xattr_keys (dict_t *dict, char *key, data_t *value, xkey = GF_CALLOC (1, sizeof (*xkey), gf_afr_mt_xattr_key); if (!xkey) - return; + return -1; xkey->key = key; INIT_LIST_HEAD (&xkey->list); list_add_tail (&xkey->list, list); } + return 0; } void __filter_xattrs (dict_t *dict) { - struct list_head keys; - - struct _xattr_key *key; - struct _xattr_key *tmp; + struct list_head keys = {0,}; + struct _xattr_key *key = NULL; + struct _xattr_key *tmp = NULL; INIT_LIST_HEAD (&keys); @@ -604,129 +574,1222 @@ __filter_xattrs (dict_t *dict) int32_t afr_getxattr_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *dict) + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + xlator_t ** children = NULL; + int unwind = 1; + int32_t *last_index = NULL; + int32_t next_call_child = -1; + int32_t read_child = -1; + int32_t *fresh_children = NULL; - int unwind = 1; - int last_tried = -1; - int this_try = -1; - int read_child = -1; + priv = this->private; + children = priv->children; - priv = this->private; - children = priv->children; - - local = frame->local; + local = frame->local; read_child = (long) cookie; - if (op_ret == -1) { - retry: - last_tried = local->cont.getxattr.last_tried; - - if (all_tried (last_tried, priv->child_count)) { - goto out; - } - this_try = ++local->cont.getxattr.last_tried; - - if (this_try == read_child) { - goto retry; - } + if (op_ret == -1) { + last_index = &local->cont.getxattr.last_index; + fresh_children = local->fresh_children; + next_call_child = afr_next_call_child (fresh_children, + local->child_up, + priv->child_count, + last_index, read_child); + if (next_call_child < 0) + goto out; - unwind = 0; - STACK_WIND_COOKIE (frame, afr_getxattr_cbk, - (void *) (long) read_child, - children[this_try], - children[this_try]->fops->getxattr, - &local->loc, - local->cont.getxattr.name); - } + unwind = 0; + STACK_WIND_COOKIE (frame, afr_getxattr_cbk, + (void *) (long) read_child, + children[next_call_child], + children[next_call_child]->fops->getxattr, + &local->loc, + local->cont.getxattr.name, + NULL); + } out: - if (unwind) { + if (unwind) { if (op_ret >= 0 && dict) __filter_xattrs (dict); - AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict); - } + AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, xdata); + } - return 0; + return 0; } int32_t -afr_getxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name) +afr_getxattr_unwind (call_frame_t *frame, int op_ret, int op_errno, + dict_t *dict, dict_t *xdata) + +{ + AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, xdata); + return 0; +} + +int32_t +afr_fgetxattr_clrlk_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) { - afr_private_t * priv = NULL; - xlator_t ** children = NULL; - int call_child = 0; - afr_local_t * local = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + xlator_t **children = NULL; + dict_t *xattr = NULL; + char *tmp_report = NULL; + char lk_summary[1024] = {0,}; + int serz_len = 0; + int32_t callcnt = 0; + long int cky = 0; + int ret = 0; + + priv = this->private; + children = priv->children; + + local = frame->local; + cky = (long) cookie; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + if (op_ret == -1) + local->child_errno[cky] = op_errno; + + if (!local->dict) + local->dict = dict_new (); + if (local->dict) { + ret = dict_get_str (dict, local->cont.getxattr.name, + &tmp_report); + if (ret) + goto unlock; + ret = dict_set_dynstr (local->dict, + children[cky]->name, + gf_strdup (tmp_report)); + if (ret) + goto unlock; + } + } +unlock: + UNLOCK (&frame->lock); + + if (!callcnt) { + xattr = dict_new (); + if (!xattr) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + ret = dict_serialize_value_with_delim (local->dict, + lk_summary, + &serz_len, '\n'); + if (ret) { + op_ret = -1; + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "Error serializing dictionary"); + goto unwind; + } + if (serz_len == -1) + snprintf (lk_summary, sizeof (lk_summary), + "No locks cleared."); + ret = dict_set_dynstr (xattr, local->cont.getxattr.name, + gf_strdup (lk_summary)); + if (ret) { + op_ret = -1; + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "Error setting dictionary"); + goto unwind; + } + + unwind: + // Updating child_errno with more recent 'events' + local->child_errno[cky] = op_errno; + op_errno = afr_resultant_errno_get (NULL, local->child_errno, + priv->child_count); + AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, xattr, + xdata); + + if (xattr) + dict_unref (xattr); + } + + return ret; +} + +int32_t +afr_getxattr_clrlk_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + xlator_t **children = NULL; + dict_t *xattr = NULL; + char *tmp_report = NULL; + char lk_summary[1024] = {0,}; + int serz_len = 0; + int32_t callcnt = 0; + long int cky = 0; + int ret = 0; + + priv = this->private; + children = priv->children; + + local = frame->local; + cky = (long) cookie; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + if (op_ret == -1) + local->child_errno[cky] = op_errno; + + if (!local->dict) + local->dict = dict_new (); + if (local->dict) { + ret = dict_get_str (dict, local->cont.getxattr.name, + &tmp_report); + if (ret) + goto unlock; + ret = dict_set_dynstr (local->dict, + children[cky]->name, + gf_strdup (tmp_report)); + if (ret) + goto unlock; + } + } +unlock: + UNLOCK (&frame->lock); + + if (!callcnt) { + xattr = dict_new (); + if (!xattr) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + ret = dict_serialize_value_with_delim (local->dict, + lk_summary, + &serz_len, '\n'); + if (ret) { + op_ret = -1; + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "Error serializing dictionary"); + goto unwind; + } + if (serz_len == -1) + snprintf (lk_summary, sizeof (lk_summary), + "No locks cleared."); + ret = dict_set_dynstr (xattr, local->cont.getxattr.name, + gf_strdup (lk_summary)); + if (ret) { + op_ret = -1; + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "Error setting dictionary"); + goto unwind; + } + + unwind: + // Updating child_errno with more recent 'events' + local->child_errno[cky] = op_errno; + op_errno = afr_resultant_errno_get (NULL, local->child_errno, + priv->child_count); + AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, xattr, xdata); + + if (xattr) + dict_unref (xattr); + } + + return ret; +} + +/** + * node-uuid cbk uses next child querying mechanism + */ +int32_t +afr_getxattr_node_uuid_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + xlator_t **children = NULL; + int unwind = 1; + int curr_call_child = 0; + + priv = this->private; + children = priv->children; + + local = frame->local; + + if (op_ret == -1) { /** query the _next_ child */ + + /** + * _current_ becomes _next_ + * If done with all childs and yet no success; give up ! + */ + curr_call_child = (int) ((long)cookie); + if (++curr_call_child == priv->child_count) + goto unwind; + + gf_log (this->name, GF_LOG_WARNING, + "op_ret (-1): Re-querying afr-child (%d/%d)", + curr_call_child, priv->child_count); + + unwind = 0; + STACK_WIND_COOKIE (frame, afr_getxattr_node_uuid_cbk, + (void *) (long) curr_call_child, + children[curr_call_child], + children[curr_call_child]->fops->getxattr, + &local->loc, + local->cont.getxattr.name, + NULL); + } + + unwind: + if (unwind) + AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, + NULL); + + return 0; +} + +int32_t +afr_getxattr_lockinfo_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) +{ + int call_cnt = 0, len = 0; + char *lockinfo_buf = NULL; + dict_t *lockinfo = NULL, *newdict = NULL; + afr_local_t *local = NULL; + + LOCK (&frame->lock); + { + local = frame->local; + + call_cnt = --local->call_count; + + if ((op_ret < 0) || (!dict && !xdata)) { + goto unlock; + } + + if (xdata) { + if (!local->xdata_rsp) { + local->xdata_rsp = dict_new (); + if (!local->xdata_rsp) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unlock; + } + } + } + + if (!dict) { + goto unlock; + } + + op_ret = dict_get_ptr_and_len (dict, GF_XATTR_LOCKINFO_KEY, + (void **)&lockinfo_buf, &len); + + if (!lockinfo_buf) { + goto unlock; + } + + if (!local->dict) { + local->dict = dict_new (); + if (!local->dict) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unlock; + } + } + } +unlock: + UNLOCK (&frame->lock); + + if (lockinfo_buf != NULL) { + lockinfo = dict_new (); + if (lockinfo == NULL) { + local->op_ret = -1; + local->op_errno = ENOMEM; + } else { + op_ret = dict_unserialize (lockinfo_buf, len, + &lockinfo); + + if (lockinfo && local->dict) { + dict_copy (lockinfo, local->dict); + } + } + } + + if (xdata && local->xdata_rsp) { + dict_copy (xdata, local->xdata_rsp); + } + + if (!call_cnt) { + newdict = dict_new (); + if (!newdict) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unwind; + } + + len = dict_serialized_length (local->dict); + if (len == 0) { + goto unwind; + } + + lockinfo_buf = GF_CALLOC (1, len, gf_common_mt_char); + if (!lockinfo_buf) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unwind; + } + + op_ret = dict_serialize (local->dict, lockinfo_buf); + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = -op_ret; + } + + op_ret = dict_set_dynptr (newdict, GF_XATTR_LOCKINFO_KEY, + (void *)lockinfo_buf, len); + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = -op_ret; + goto unwind; + } + + unwind: + AFR_STACK_UNWIND (getxattr, frame, op_ret, + op_errno, newdict, + local->xdata_rsp); + } + + dict_unref (lockinfo); + + return 0; +} + +int32_t +afr_fgetxattr_lockinfo_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) +{ + int call_cnt = 0, len = 0; + char *lockinfo_buf = NULL; + dict_t *lockinfo = NULL, *newdict = NULL; + afr_local_t *local = NULL; + + LOCK (&frame->lock); + { + local = frame->local; + + call_cnt = --local->call_count; + + if ((op_ret < 0) || (!dict && !xdata)) { + goto unlock; + } + + if (xdata) { + if (!local->xdata_rsp) { + local->xdata_rsp = dict_new (); + if (!local->xdata_rsp) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unlock; + } + } + } + + if (!dict) { + goto unlock; + } + + op_ret = dict_get_ptr_and_len (dict, GF_XATTR_LOCKINFO_KEY, + (void **)&lockinfo_buf, &len); + + if (!lockinfo_buf) { + goto unlock; + } + + if (!local->dict) { + local->dict = dict_new (); + if (!local->dict) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unlock; + } + } + } +unlock: + UNLOCK (&frame->lock); + + if (lockinfo_buf != NULL) { + lockinfo = dict_new (); + if (lockinfo == NULL) { + local->op_ret = -1; + local->op_errno = ENOMEM; + } else { + op_ret = dict_unserialize (lockinfo_buf, len, + &lockinfo); + + if (lockinfo && local->dict) { + dict_copy (lockinfo, local->dict); + } + } + } + + if (xdata && local->xdata_rsp) { + dict_copy (xdata, local->xdata_rsp); + } + + if (!call_cnt) { + newdict = dict_new (); + if (!newdict) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unwind; + } + + len = dict_serialized_length (local->dict); + if (len <= 0) { + goto unwind; + } - int read_child = -1; + lockinfo_buf = GF_CALLOC (1, len, gf_common_mt_char); + if (!lockinfo_buf) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unwind; + } - int32_t op_ret = -1; - int32_t op_errno = 0; + op_ret = dict_serialize (local->dict, lockinfo_buf); + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = -op_ret; + } + op_ret = dict_set_dynptr (newdict, GF_XATTR_LOCKINFO_KEY, + (void *)lockinfo_buf, len); + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = -op_ret; + goto unwind; + } - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + unwind: + AFR_STACK_UNWIND (fgetxattr, frame, op_ret, + op_errno, newdict, + local->xdata_rsp); + } - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); + dict_unref (lockinfo); - children = priv->children; + return 0; +} - ALLOC_OR_GOTO (local, afr_local_t, out); - frame->local = local; +int32_t +afr_fgetxattr_pathinfo_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) +{ + afr_local_t *local = NULL; + int32_t callcnt = 0; + int ret = 0; + char *xattr = NULL; + char *xattr_serz = NULL; + char xattr_cky[1024] = {0,}; + dict_t *nxattr = NULL; + long cky = 0; + int32_t padding = 0; + int32_t tlen = 0; + + if (!frame || !frame->local || !this) { + gf_log ("", GF_LOG_ERROR, "possible NULL deref"); + goto out; + } - if (name) { - if (!strncmp (name, AFR_XATTR_PREFIX, - strlen (AFR_XATTR_PREFIX))) { + local = frame->local; + cky = (long) cookie; - op_errno = ENODATA; + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (!dict || (op_ret < 0)) goto out; + + if (!local->dict) + local->dict = dict_new (); + + if (local->dict) { + ret = dict_get_str (dict, + local->cont.getxattr.name, + &xattr); + if (ret) + goto out; + + xattr = gf_strdup (xattr); + + (void)snprintf (xattr_cky, 1024, "%s-%ld", + local->cont.getxattr.name, cky); + ret = dict_set_dynstr (local->dict, + xattr_cky, xattr); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Cannot set xattr cookie key"); + goto out; + } + + local->cont.getxattr.xattr_len + += strlen (xattr) + 1; + } + } +out: + UNLOCK (&frame->lock); + + if (!callcnt) { + if (!local->cont.getxattr.xattr_len) + goto unwind; + + nxattr = dict_new (); + if (!nxattr) + goto unwind; + + /* extra bytes for decorations (brackets and <>'s) */ + padding += strlen (this->name) + + strlen (AFR_PATHINFO_HEADER) + 4; + local->cont.getxattr.xattr_len += (padding + 2); + + xattr_serz = GF_CALLOC (local->cont.getxattr.xattr_len, + sizeof (char), gf_common_mt_char); + + if (!xattr_serz) + goto unwind; + + /* the xlator info */ + (void) sprintf (xattr_serz, "(<"AFR_PATHINFO_HEADER"%s> ", + this->name); + + /* actual series of pathinfo */ + ret = dict_serialize_value_with_delim (local->dict, + xattr_serz + + strlen (xattr_serz), + &tlen, ' '); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Error serializing" + " dictionary"); + goto unwind; + } + + /* closing part */ + *(xattr_serz + padding + tlen) = ')'; + *(xattr_serz + padding + tlen + 1) = '\0'; + + ret = dict_set_dynstr (nxattr, local->cont.getxattr.name, + xattr_serz); + if (ret) + gf_log (this->name, GF_LOG_ERROR, "Cannot set pathinfo" + " key in dict"); + + unwind: + AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, nxattr, + xdata); + + if (nxattr) + dict_unref (nxattr); + } + + return ret; +} + +int32_t +afr_getxattr_pathinfo_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) +{ + afr_local_t *local = NULL; + int32_t callcnt = 0; + int ret = 0; + char *xattr = NULL; + char *xattr_serz = NULL; + char xattr_cky[1024] = {0,}; + dict_t *nxattr = NULL; + long cky = 0; + int32_t padding = 0; + int32_t tlen = 0; + + if (!frame || !frame->local || !this) { + gf_log ("", GF_LOG_ERROR, "possible NULL deref"); + goto out; + } + + local = frame->local; + cky = (long) cookie; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (!dict || (op_ret < 0)) + goto out; + + if (!local->dict) + local->dict = dict_new (); + + if (local->dict) { + ret = dict_get_str (dict, + local->cont.getxattr.name, + &xattr); + if (ret) + goto out; + + xattr = gf_strdup (xattr); + + (void)snprintf (xattr_cky, 1024, "%s-%ld", + local->cont.getxattr.name, cky); + ret = dict_set_dynstr (local->dict, + xattr_cky, xattr); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Cannot set xattr cookie key"); + goto out; + } + + local->cont.getxattr.xattr_len += strlen (xattr) + 1; + } + } + out: + UNLOCK (&frame->lock); + + if (!callcnt) { + if (!local->cont.getxattr.xattr_len) + goto unwind; + + nxattr = dict_new (); + if (!nxattr) + goto unwind; + + /* extra bytes for decorations (brackets and <>'s) */ + padding += strlen (this->name) + strlen (AFR_PATHINFO_HEADER) + 4; + local->cont.getxattr.xattr_len += (padding + 2); + + xattr_serz = GF_CALLOC (local->cont.getxattr.xattr_len, + sizeof (char), gf_common_mt_char); + + if (!xattr_serz) + goto unwind; + + /* the xlator info */ + (void) sprintf (xattr_serz, "(<"AFR_PATHINFO_HEADER"%s> ", + this->name); + + /* actual series of pathinfo */ + ret = dict_serialize_value_with_delim (local->dict, + xattr_serz + strlen (xattr_serz), + &tlen, ' '); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Error serializing" + " dictionary"); + goto unwind; + } + + /* closing part */ + *(xattr_serz + padding + tlen) = ')'; + *(xattr_serz + padding + tlen + 1) = '\0'; + + ret = dict_set_dynstr (nxattr, local->cont.getxattr.name, + xattr_serz); + if (ret) + gf_log (this->name, GF_LOG_ERROR, "Cannot set pathinfo" + " key in dict"); + + unwind: + AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, nxattr, + xdata); + + if (nxattr) + dict_unref (nxattr); + } + + return ret; +} + +static int +afr_aggregate_stime_xattr (dict_t *this, char *key, data_t *value, void *data) +{ + int ret = 0; + + if (fnmatch (GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0) + ret = gf_get_min_stime (THIS, data, key, value); + + return ret; +} + +int32_t +afr_common_getxattr_stime_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) +{ + afr_local_t *local = NULL; + int32_t callcnt = 0; + + if (!frame || !frame->local || !this) { + gf_log ("", GF_LOG_ERROR, "possible NULL deref"); + goto out; + } + + local = frame->local; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (!dict || (op_ret < 0)) { + local->op_errno = op_errno; + goto cleanup; } + if (!local->dict) + local->dict = dict_copy_with_ref (dict, NULL); + else + dict_foreach (dict, afr_aggregate_stime_xattr, + local->dict); + local->op_ret = 0; + } + +cleanup: + UNLOCK (&frame->lock); + + if (!callcnt) { + AFR_STACK_UNWIND (getxattr, frame, local->op_ret, + local->op_errno, local->dict, xdata); } - read_child = afr_read_child (this, loc->inode); +out: + return 0; +} - if ((read_child >= 0) && (priv->child_up[read_child])) { - call_child = read_child; - local->cont.getxattr.last_tried = -1; +static gf_boolean_t +afr_is_special_xattr (const char *name, fop_getxattr_cbk_t *cbk, + gf_boolean_t is_fgetxattr) +{ + gf_boolean_t is_spl = _gf_true; + + GF_ASSERT (cbk); + if (!cbk) { + is_spl = _gf_false; + goto out; + } + + if (!strcmp (name, GF_XATTR_PATHINFO_KEY)) { + if (is_fgetxattr) { + *cbk = afr_fgetxattr_pathinfo_cbk; + } else { + *cbk = afr_getxattr_pathinfo_cbk; + } + } else if (!strncmp (name, GF_XATTR_CLRLK_CMD, + strlen (GF_XATTR_CLRLK_CMD))) { + if (is_fgetxattr) { + *cbk = afr_fgetxattr_clrlk_cbk; + } else { + *cbk = afr_getxattr_clrlk_cbk; + } + } else if (!strncmp (name, GF_XATTR_LOCKINFO_KEY, + strlen (GF_XATTR_LOCKINFO_KEY))) { + if (is_fgetxattr) { + *cbk = afr_fgetxattr_lockinfo_cbk; + } else { + *cbk = afr_getxattr_lockinfo_cbk; + } + } else if (fnmatch (GF_XATTR_STIME_PATTERN, name, FNM_NOESCAPE) == 0) { + *cbk = afr_common_getxattr_stime_cbk; } else { - call_child = afr_first_up_child (priv); + is_spl = _gf_false; + } + +out: + return is_spl; +} + +static void +afr_getxattr_frm_all_children (xlator_t *this, call_frame_t *frame, + const char *name, loc_t *loc, + fop_getxattr_cbk_t cbk) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + xlator_t **children = NULL; + int i = 0; + + priv = this->private; + children = priv->children; + + local = frame->local; + local->call_count = priv->child_count; + + for (i = 0; i < priv->child_count; i++) { + STACK_WIND_COOKIE (frame, cbk, + (void *) (long) i, + children[i], children[i]->fops->getxattr, + loc, name, NULL); + } + return; +} + +int32_t +afr_getxattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *name, dict_t *xdata) +{ + afr_private_t *priv = NULL; + xlator_t **children = NULL; + int call_child = 0; + afr_local_t *local = NULL; + xlator_list_t *trav = NULL; + xlator_t **sub_volumes = NULL; + int i = 0; + int32_t op_errno = 0; + int32_t read_child = -1; + int ret = -1; + fop_getxattr_cbk_t cbk = NULL; + int afr_xtime_gauge[MCNT_MAX] = {0,}; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + VALIDATE_OR_GOTO (priv->children, out); + + children = priv->children; + + AFR_SBRAIN_CHECK_LOC (loc, out); + + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; + + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) + goto out; + + loc_copy (&local->loc, loc); + if (!name) + goto no_name; + + local->cont.getxattr.name = gf_strdup (name); - if (call_child == -1) { - op_errno = ENOTCONN; - gf_log (this->name, GF_LOG_DEBUG, - "no child is up"); + if (!strncmp (name, AFR_XATTR_PREFIX, + strlen (AFR_XATTR_PREFIX))) { + gf_log (this->name, GF_LOG_INFO, + "%s: no data present for key %s", + loc->path, name); + op_errno = ENODATA; + goto out; + } + if ((strcmp (GF_XATTR_MARKER_KEY, name) == 0) + && (GF_CLIENT_PID_GSYNCD == frame->root->pid)) { + + local->marker.call_count = priv->child_count; + + sub_volumes = alloca ( priv->child_count * sizeof (xlator_t *)); + for (i = 0, trav = this->children; trav ; + trav = trav->next, i++) { + + *(sub_volumes + i) = trav->xlator; + } + + if (cluster_getmarkerattr (frame, this, loc, name, + local, afr_getxattr_unwind, + sub_volumes, + priv->child_count, + MARKER_UUID_TYPE, + marker_uuid_default_gauge, + priv->vol_uuid)) { + + gf_log (this->name, GF_LOG_INFO, + "%s: failed to get marker attr (%s)", + loc->path, name); + op_errno = EINVAL; goto out; } - local->cont.getxattr.last_tried = call_child; + return 0; + } + + /* + * if we are doing getxattr with pathinfo as the key then we + * collect information from all childs + */ + if (afr_is_special_xattr (name, &cbk, 0)) { + afr_getxattr_frm_all_children (this, frame, name, + loc, cbk); + return 0; + } + + if (XATTR_IS_NODE_UUID (name)) { + i = 0; + STACK_WIND_COOKIE (frame, afr_getxattr_node_uuid_cbk, + (void *) (long) i, + children[i], + children[i]->fops->getxattr, + loc, name, xdata); + return 0; + } + + if (*priv->vol_uuid) { + if ((match_uuid_local (name, priv->vol_uuid) == 0) + && (GF_CLIENT_PID_GSYNCD == frame->root->pid)) { + local->marker.call_count = priv->child_count; + + sub_volumes = alloca ( priv->child_count + * sizeof (xlator_t *)); + for (i = 0, trav = this->children; trav ; + trav = trav->next, i++) { + + *(sub_volumes + i) = trav->xlator; + + } + + /* don't err out on getting ENOTCONN (brick down) + * from a subset of the bricks + */ + memcpy (afr_xtime_gauge, marker_xtime_default_gauge, + sizeof (afr_xtime_gauge)); + afr_xtime_gauge[MCNT_NOTFOUND] = 0; + afr_xtime_gauge[MCNT_ENOTCONN] = 0; + if (cluster_getmarkerattr (frame, this, loc, + name, local, + afr_getxattr_unwind, + sub_volumes, + priv->child_count, + MARKER_XTIME_TYPE, + afr_xtime_gauge, + priv->vol_uuid)) { + gf_log (this->name, GF_LOG_INFO, + "%s: failed to get marker attr (%s)", + loc->path, name); + op_errno = EINVAL; + goto out; + } + + return 0; + } + } + +no_name: + local->fresh_children = afr_children_create (priv->child_count); + if (!local->fresh_children) { + op_errno = ENOMEM; + goto out; } - loc_copy (&local->loc, loc); - if (name) - local->cont.getxattr.name = gf_strdup (name); + read_child = afr_inode_get_read_ctx (this, loc->inode, + local->fresh_children); + ret = afr_get_call_child (this, local->child_up, read_child, + local->fresh_children, + &call_child, + &local->cont.getxattr.last_index); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + STACK_WIND_COOKIE (frame, afr_getxattr_cbk, + (void *) (long) call_child, + children[call_child], + children[call_child]->fops->getxattr, + loc, name, xdata); + + ret = 0; +out: + if (ret < 0) + AFR_STACK_UNWIND (getxattr, frame, -1, op_errno, NULL, NULL); + return 0; +} + +/* {{{ fgetxattr */ - STACK_WIND_COOKIE (frame, afr_getxattr_cbk, - (void *) (long) call_child, - children[call_child], children[call_child]->fops->getxattr, - loc, name); - op_ret = 0; +int32_t +afr_fgetxattr_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + xlator_t ** children = NULL; + int unwind = 1; + int32_t *last_index = NULL; + int32_t next_call_child = -1; + int32_t read_child = -1; + int32_t *fresh_children = NULL; + + priv = this->private; + children = priv->children; + + local = frame->local; + + read_child = (long) cookie; + + if (op_ret == -1) { + last_index = &local->cont.getxattr.last_index; + fresh_children = local->fresh_children; + next_call_child = afr_next_call_child (fresh_children, + local->child_up, + priv->child_count, + last_index, read_child); + if (next_call_child < 0) + goto out; + + unwind = 0; + STACK_WIND_COOKIE (frame, afr_fgetxattr_cbk, + (void *) (long) read_child, + children[next_call_child], + children[next_call_child]->fops->fgetxattr, + local->fd, + local->cont.getxattr.name, + NULL); + } + out: - if (op_ret == -1) { - AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, NULL); - } - return 0; + if (unwind) { + if (op_ret >= 0 && dict) + __filter_xattrs (dict); + + AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, dict, + xdata); + } + + return 0; +} + +int32_t +afr_fgetxattr_unwind (call_frame_t *frame, + int op_ret, int op_errno, dict_t *dict, dict_t *xdata) + +{ + AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, dict, xdata); + return 0; +} + +static void +afr_fgetxattr_frm_all_children (xlator_t *this, call_frame_t *frame, + const char *name, fd_t *fd, + fop_fgetxattr_cbk_t cbk) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + xlator_t **children = NULL; + int i = 0; + + priv = this->private; + children = priv->children; + + local = frame->local; + local->call_count = priv->child_count; + + for (i = 0; i < priv->child_count; i++) { + STACK_WIND_COOKIE (frame, cbk, + (void *) (long) i, + children[i], children[i]->fops->fgetxattr, + fd, name, NULL); + } + + return; +} + +int32_t +afr_fgetxattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *name, dict_t *xdata) +{ + afr_private_t *priv = NULL; + xlator_t **children = NULL; + int call_child = 0; + afr_local_t *local = NULL; + int32_t op_ret = -1; + int32_t op_errno = 0; + int32_t read_child = -1; + fop_fgetxattr_cbk_t cbk = NULL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + VALIDATE_OR_GOTO (priv->children, out); + + children = priv->children; + + AFR_SBRAIN_CHECK_FD (fd, out); + + AFR_LOCAL_ALLOC_OR_GOTO (local, out); + frame->local = local; + + op_ret = afr_local_init (local, priv, &op_errno); + if (op_ret < 0) { + op_errno = -op_ret; + goto out; + } + + local->fd = fd_ref (fd); + if (name) + local->cont.getxattr.name = gf_strdup (name); + + /* pathinfo gets handled only in getxattr(), but we need to handle + * lockinfo. + * If we are doing fgetxattr with lockinfo as the key then we + * collect information from all children. + */ + if (afr_is_special_xattr (name, &cbk, 1)) { + afr_fgetxattr_frm_all_children (this, frame, name, + fd, cbk); + return 0; + } + + + local->fresh_children = afr_children_create (priv->child_count); + if (!local->fresh_children) { + op_errno = ENOMEM; + goto out; + } + + read_child = afr_inode_get_read_ctx (this, fd->inode, + local->fresh_children); + op_ret = afr_get_call_child (this, local->child_up, read_child, + local->fresh_children, + &call_child, + &local->cont.getxattr.last_index); + if (op_ret < 0) { + op_errno = -op_ret; + op_ret = -1; + goto out; + } + + STACK_WIND_COOKIE (frame, afr_fgetxattr_cbk, + (void *) (long) call_child, + children[call_child], + children[call_child]->fops->fgetxattr, + fd, name, xdata); + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, NULL, + NULL); + } + return 0; } @@ -748,140 +1811,130 @@ out: int32_t afr_readv_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iovec *vector, int32_t count, struct iatt *buf, - struct iobref *iobref) + xlator_t *this, int32_t op_ret, int32_t op_errno, + struct iovec *vector, int32_t count, struct iatt *buf, + struct iobref *iobref, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; - - int unwind = 1; - int last_tried = -1; - int this_try = -1; - int read_child = -1; + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + xlator_t ** children = NULL; + int unwind = 1; + int32_t *last_index = NULL; + int32_t next_call_child = -1; + int32_t *fresh_children = NULL; + int32_t read_child = -1; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); + priv = this->private; + VALIDATE_OR_GOTO (priv->children, out); - children = priv->children; + children = priv->children; - local = frame->local; + local = frame->local; read_child = (long) cookie; - if (op_ret == -1) { - retry: - last_tried = local->cont.readv.last_tried; - - if (all_tried (last_tried, priv->child_count)) { - goto out; - } - this_try = ++local->cont.readv.last_tried; - - if (this_try == read_child) { - /* - skip the read child since if we are here - we must have already tried that child - */ - goto retry; - } - - unwind = 0; - - STACK_WIND_COOKIE (frame, afr_readv_cbk, - (void *) (long) read_child, - children[this_try], - children[this_try]->fops->readv, - local->fd, local->cont.readv.size, - local->cont.readv.offset); - } + if (op_ret == -1) { + last_index = &local->cont.readv.last_index; + fresh_children = local->fresh_children; + next_call_child = afr_next_call_child (fresh_children, + local->child_up, + priv->child_count, + last_index, read_child); + if (next_call_child < 0) + goto out; -out: - if (unwind) { - if (buf && local) - buf->ia_ino = local->cont.readv.ino; + unwind = 0; - AFR_STACK_UNWIND (readv, frame, op_ret, op_errno, - vector, count, buf, iobref); - } + STACK_WIND_COOKIE (frame, afr_readv_cbk, + (void *) (long) read_child, + children[next_call_child], + children[next_call_child]->fops->readv, + local->fd, local->cont.readv.size, + local->cont.readv.offset, + local->cont.readv.flags, + NULL); + } - return 0; +out: + if (unwind) { + AFR_STACK_UNWIND (readv, frame, op_ret, op_errno, + vector, count, buf, iobref, xdata); + } + + return 0; } int32_t afr_readv (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset) + fd_t *fd, size_t size, off_t offset, uint32_t flags, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; - + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + xlator_t ** children = NULL; + int call_child = 0; + int32_t op_errno = 0; int32_t read_child = -1; - int call_child = 0; + int ret = -1; - int32_t op_ret = -1; - int32_t op_errno = 0; + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + VALIDATE_OR_GOTO (fd, out); - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (fd, out); + priv = this->private; + children = priv->children; - priv = this->private; - children = priv->children; + AFR_SBRAIN_CHECK_FD (fd, out); - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; - frame->local = local; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) + goto out; - read_child = afr_read_child (this, fd->inode); + local->fresh_children = afr_children_create (priv->child_count); + if (!local->fresh_children) { + op_errno = ENOMEM; + goto out; + } - if ((read_child >= 0) && (priv->child_up[read_child])) { - call_child = read_child; + read_child = afr_inode_get_read_ctx (this, fd->inode, local->fresh_children); + ret = afr_get_call_child (this, local->child_up, read_child, + local->fresh_children, + &call_child, + &local->cont.readv.last_index); + if (ret < 0) { + op_errno = -ret; + goto out; + } - /* - if read fails from the read child, we try - all children starting with the first one - */ - local->cont.readv.last_tried = -1; + local->fd = fd_ref (fd); - } else { - call_child = afr_first_up_child (priv); - if (call_child == -1) { - op_errno = ENOTCONN; - gf_log (this->name, GF_LOG_DEBUG, - "no child is up"); - goto out; - } - - local->cont.readv.last_tried = call_child; - } - - local->fd = fd_ref (fd); - - local->cont.readv.ino = fd->inode->ino; - local->cont.readv.size = size; - local->cont.readv.offset = offset; - - STACK_WIND_COOKIE (frame, afr_readv_cbk, - (void *) (long) call_child, - children[call_child], - children[call_child]->fops->readv, - fd, size, offset); - - op_ret = 0; + local->cont.readv.size = size; + local->cont.readv.offset = offset; + local->cont.readv.flags = flags; + + afr_open_fd_fix (fd, this); + + STACK_WIND_COOKIE (frame, afr_readv_cbk, + (void *) (long) call_child, + children[call_child], + children[call_child]->fops->readv, + fd, size, offset, flags, xdata); + + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (readv, frame, op_ret, op_errno, NULL, 0, NULL, - NULL); - } - return 0; + if (ret < 0) { + AFR_STACK_UNWIND (readv, frame, -1, op_errno, NULL, 0, NULL, + NULL, NULL); + } + return 0; } /* }}} */ diff --git a/xlators/cluster/afr/src/afr-inode-read.h b/xlators/cluster/afr/src/afr-inode-read.h index 9b29d5055..e4091a793 100644 --- a/xlators/cluster/afr/src/afr-inode-read.h +++ b/xlators/cluster/afr/src/afr-inode-read.h @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2009 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef __INODE_READ_H__ @@ -22,26 +13,30 @@ int32_t afr_access (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t mask); + loc_t *loc, int32_t mask, dict_t *xdata); int32_t afr_stat (call_frame_t *frame, xlator_t *this, - loc_t *loc); + loc_t *loc, dict_t *xdata); int32_t afr_fstat (call_frame_t *frame, xlator_t *this, - fd_t *fd); + fd_t *fd, dict_t *xdata); int32_t afr_readlink (call_frame_t *frame, xlator_t *this, - loc_t *loc, size_t size); + loc_t *loc, size_t size, dict_t *xdata); int32_t afr_readv (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset); + fd_t *fd, size_t size, off_t offset, uint32_t flags, dict_t *xdata); int32_t afr_getxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name); + loc_t *loc, const char *name, dict_t *xdata); + +int32_t +afr_fgetxattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *name, dict_t *xdata); #endif /* __INODE_READ_H__ */ diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index 2eaf99c55..c1ec69a55 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2009 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ @@ -46,52 +37,155 @@ #include "afr.h" #include "afr-transaction.h" +#include "afr-self-heal-common.h" + +void +__inode_write_fop_cbk (call_frame_t *frame, int child_index, int read_child, + xlator_t *this, int32_t *op_ret, int32_t *op_errno, + struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) +{ + afr_local_t *local = NULL; + + local = frame->local; + + if (afr_fop_failed (*op_ret, *op_errno)) { + local->child_errno[child_index] = *op_errno; + + switch (local->op) { + case GF_FOP_TRUNCATE: + case GF_FOP_FTRUNCATE: + if (*op_errno != EFBIG) + afr_transaction_fop_failed (frame, this, + child_index); + break; + default: + afr_transaction_fop_failed (frame, this, child_index); + break; + } + local->op_errno = *op_errno; + goto out; + } + + if ((local->success_count == 0) || (read_child == child_index)) { + local->op_ret = *op_ret; + if (prebuf) + local->cont.inode_wfop.prebuf = *prebuf; + if (postbuf) + local->cont.inode_wfop.postbuf = *postbuf; + } + + local->success_count++; +out: + return; +} /* {{{ writev */ -int +void +afr_writev_copy_outvars (call_frame_t *src_frame, call_frame_t *dst_frame) +{ + afr_local_t *src_local = NULL; + afr_local_t *dst_local = NULL; + + src_local = src_frame->local; + dst_local = dst_frame->local; + + dst_local->op_ret = src_local->op_ret; + dst_local->op_errno = src_local->op_errno; + dst_local->cont.inode_wfop.prebuf = src_local->cont.inode_wfop.prebuf; + dst_local->cont.inode_wfop.postbuf = src_local->cont.inode_wfop.postbuf; +} + +void afr_writev_unwind (call_frame_t *frame, xlator_t *this) { afr_local_t * local = NULL; - call_frame_t *main_frame = NULL; + local = frame->local; + + AFR_STACK_UNWIND (writev, frame, + local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, + NULL); +} + +call_frame_t* +afr_transaction_detach_fop_frame (call_frame_t *frame) +{ + afr_local_t * local = NULL; + call_frame_t *fop_frame = NULL; local = frame->local; LOCK (&frame->lock); { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; + fop_frame = local->transaction.main_frame; local->transaction.main_frame = NULL; } UNLOCK (&frame->lock); - if (main_frame) { - local->cont.writev.prebuf.ia_ino = local->cont.writev.ino; - local->cont.writev.postbuf.ia_ino = local->cont.writev.ino; + return fop_frame; +} - AFR_STACK_UNWIND (writev, main_frame, - local->op_ret, local->op_errno, - &local->cont.writev.prebuf, - &local->cont.writev.postbuf); +int +afr_transaction_writev_unwind (call_frame_t *frame, xlator_t *this) +{ + call_frame_t *fop_frame = NULL; + + fop_frame = afr_transaction_detach_fop_frame (frame); + + if (fop_frame) { + afr_writev_copy_outvars (frame, fop_frame); + afr_writev_unwind (fop_frame, this); } return 0; } +static void +afr_writev_handle_short_writes (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + + local = frame->local; + priv = this->private; + /* + * We already have the best case result of the writev calls staged + * as the return value. Any writev that returns some value less + * than the best case is now out of sync, so mark the fop as + * failed. Note that fops that have returned with errors have + * already been marked as failed. + */ + for (i = 0; i < priv->child_count; i++) { + if ((!local->replies[i].valid) || + (local->replies[i].op_ret == -1)) + continue; + + if (local->replies[i].op_ret < local->op_ret) + afr_transaction_fop_failed(frame, this, i); + } +} int afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) + struct iatt *postbuf, dict_t *xdata) { afr_local_t * local = NULL; - + afr_private_t *priv = NULL; + call_frame_t *fop_frame = NULL; int child_index = (long) cookie; int call_count = -1; int read_child = 0; + int ret = 0; + uint32_t open_fd_count = 0; + uint32_t write_is_append = 0; local = frame->local; + priv = this->private; - read_child = afr_read_child (this, local->fd->inode); + read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); LOCK (&frame->lock); { @@ -99,50 +193,100 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->read_child_returned = _gf_true; } - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.writev.prebuf = *prebuf; - local->cont.writev.postbuf = *postbuf; - } - - if (child_index == read_child) { - local->cont.writev.prebuf = *prebuf; - local->cont.writev.postbuf = *postbuf; + __inode_write_fop_cbk (frame, child_index, read_child, this, + &op_ret, &op_errno, prebuf, postbuf, + xdata); + + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + + + /* stage the best case return value for unwind */ + if ((local->success_count == 0) || (op_ret > local->op_ret)) { + local->op_ret = op_ret; + local->op_errno = op_errno; + } + + if (op_ret != -1) { + if (xdata) { + ret = dict_get_uint32 (xdata, + GLUSTERFS_OPEN_FD_COUNT, + &open_fd_count); + if ((ret == 0) && + (open_fd_count > local->open_fd_count)) { + local->open_fd_count = open_fd_count; + local->update_open_fd_count = _gf_true; + } + + write_is_append = 0; + ret = dict_get_uint32 (xdata, + GLUSTERFS_WRITE_IS_APPEND, + &write_is_append); + if (ret || !write_is_append) + local->append_write = _gf_false; } - } - local->op_errno = op_errno; + } } UNLOCK (&frame->lock); call_count = afr_frame_return (frame); if (call_count == 0) { - local->transaction.unwind (frame, this); - local->transaction.resume (frame, this); + if (local->update_open_fd_count) + afr_handle_open_fd_count (frame, this); + + if (!local->stable_write && !local->append_write) + /* An appended write removes the necessity to + fsync() the file. This is because self-heal + has the logic to check for larger file when + the xattrs are not reliably pointing at + a stale file. + */ + afr_fd_report_unstable_write (this, local->fd); + + afr_writev_handle_short_writes (frame, this); + if (afr_any_fops_failed (local, priv)) { + //Don't unwind until post-op is complete + local->transaction.resume (frame, this); + } else { + /* + * Generally inode-write fops do transaction.unwind then + * transaction.resume, but writev needs to make sure that + * delayed post-op frame is placed in fdctx before unwind + * happens. This prevents the race of flush doing the + * changelog wakeup first in fuse thread and then this + * writev placing its delayed post-op frame in fdctx. + * This helps flush make sure all the delayed post-ops are + * completed. + */ + + fop_frame = afr_transaction_detach_fop_frame (frame); + afr_writev_copy_outvars (frame, fop_frame); + local->transaction.resume (frame, this); + afr_writev_unwind (fop_frame, this); + } } return 0; } - int afr_writev_wind (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int i = 0; int call_count = -1; + dict_t *xdata = NULL; + GF_UNUSED int ret = 0; local = frame->local; priv = this->private; - call_count = afr_up_children_count (priv->child_count, local->child_up); + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); if (call_count == 0) { local->transaction.resume (frame, this); @@ -150,9 +294,31 @@ afr_writev_wind (call_frame_t *frame, xlator_t *this) } local->call_count = call_count; + local->replies = GF_CALLOC(priv->child_count, sizeof(*local->replies), + gf_afr_mt_reply_t); + if (!local->replies) { + local->op_ret = -1; + local->op_errno = ENOMEM; + local->transaction.unwind(frame, this); + local->transaction.resume(frame, this); + return 0; + } + + xdata = dict_new (); + if (xdata) { + ret = dict_set_uint32 (xdata, GLUSTERFS_OPEN_FD_COUNT, + sizeof (uint32_t)); + ret = dict_set_uint32 (xdata, GLUSTERFS_WRITE_IS_APPEND, + 0); + /* Set append_write to be true speculatively. If on any + server it turns not be true, we unset it in the + callback. + */ + local->append_write = _gf_true; + } for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { + if (local->transaction.pre_op[i]) { STACK_WIND_COOKIE (frame, afr_writev_wind_cbk, (void *) (long) i, priv->children[i], @@ -161,13 +327,18 @@ afr_writev_wind (call_frame_t *frame, xlator_t *this) local->cont.writev.vector, local->cont.writev.count, local->cont.writev.offset, - local->cont.writev.iobref); + local->cont.writev.flags, + local->cont.writev.iobref, + xdata); if (!--call_count) break; } } + if (xdata) + dict_unref (xdata); + return 0; } @@ -193,24 +364,21 @@ afr_writev_done (call_frame_t *frame, xlator_t *this) int afr_do_writev (call_frame_t *frame, xlator_t *this) { - call_frame_t * transaction_frame = NULL; - afr_local_t * local = NULL; - - int op_ret = -1; - int op_errno = 0; + call_frame_t *transaction_frame = NULL; + afr_local_t *local = NULL; + int op_ret = -1; + int op_errno = 0; local = frame->local; transaction_frame = copy_frame (frame); if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); op_errno = ENOMEM; goto out; } transaction_frame->local = local; - frame->local = NULL; + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); local->op = GF_FOP_WRITE; @@ -218,10 +386,17 @@ afr_do_writev (call_frame_t *frame, xlator_t *this) local->transaction.fop = afr_writev_wind; local->transaction.done = afr_writev_done; - local->transaction.unwind = afr_writev_unwind; + local->transaction.unwind = afr_transaction_writev_unwind; local->transaction.main_frame = frame; if (local->fd->flags & O_APPEND) { + /* + * Backend vfs ignores the 'offset' for append mode fd so + * locking just the region provided for the writev does not + * give consistency gurantee. The actual write may happen at a + * completely different range than the one provided by the + * offset, len in the fop. So lock the entire file. + */ local->transaction.start = 0; local->transaction.len = 0; } else { @@ -230,79 +405,179 @@ afr_do_writev (call_frame_t *frame, xlator_t *this) local->cont.writev.count); } - afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + op_ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + if (op_ret < 0) { + op_errno = -op_ret; + goto out; + } op_ret = 0; out: - if (op_ret == -1) { + if (op_ret < 0) { if (transaction_frame) AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (writev, frame, op_ret, op_errno, NULL, NULL); + AFR_STACK_UNWIND (writev, frame, op_ret, op_errno, NULL, NULL, NULL); } return 0; } +static void +afr_trigger_open_fd_self_heal (fd_t *fd, xlator_t *this) +{ + call_frame_t *frame = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + char *reason = NULL; + int32_t op_errno = 0; + int ret = 0; + + if (!fd || !fd->inode || uuid_is_null (fd->inode->gfid)) { + gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid args: " + "fd: %p, inode: %p", fd, + fd ? fd->inode : NULL); + goto out; + } + + frame = create_frame (this, this->ctx->pool); + if (!frame) + goto out; + + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; + ret = afr_local_init (local, this->private, &op_errno); + if (ret < 0) + goto out; + + local->loc.inode = inode_ref (fd->inode); + ret = loc_path (&local->loc, NULL); + if (ret < 0) + goto out; + + sh = &local->self_heal; + sh->do_metadata_self_heal = _gf_true; + if (fd->inode->ia_type == IA_IFREG) + sh->do_data_self_heal = _gf_true; + else if (fd->inode->ia_type == IA_IFDIR) + sh->do_entry_self_heal = _gf_true; + + reason = "subvolume came online"; + afr_launch_self_heal (frame, this, fd->inode, _gf_true, + fd->inode->ia_type, reason, NULL, NULL); + return; +out: + AFR_STACK_DESTROY (frame); +} + +void +afr_open_fd_fix (fd_t *fd, xlator_t *this) +{ + int ret = 0; + int i = 0; + afr_fd_ctx_t *fd_ctx = NULL; + gf_boolean_t need_self_heal = _gf_false; + int *need_open = NULL; + size_t need_open_count = 0; + afr_private_t *priv = NULL; + + priv = this->private; + + if (!afr_is_fd_fixable (fd)) + goto out; + + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) + goto out; + + LOCK (&fd->lock); + { + if (fd_ctx->up_count < priv->up_count) { + need_self_heal = _gf_true; + fd_ctx->up_count = priv->up_count; + fd_ctx->down_count = priv->down_count; + } + + need_open = alloca (priv->child_count * sizeof (*need_open)); + for (i = 0; i < priv->child_count; i++) { + need_open[i] = 0; + if (fd_ctx->opened_on[i] != AFR_FD_NOT_OPENED) + continue; + + if (!priv->child_up[i]) + continue; + + fd_ctx->opened_on[i] = AFR_FD_OPENING; + + need_open[i] = 1; + need_open_count++; + } + } + UNLOCK (&fd->lock); + if (ret) + goto out; + + if (need_self_heal) + afr_trigger_open_fd_self_heal (fd, this); + + if (!need_open_count) + goto out; + + afr_fix_open (this, fd, need_open_count, need_open); +out: + return; +} int afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, int32_t count, off_t offset, - struct iobref *iobref) + uint32_t flags, struct iobref *iobref, dict_t *xdata) { afr_private_t * priv = NULL; afr_local_t * local = NULL; - int ret = -1; - - int op_ret = -1; int op_errno = 0; - uint64_t ctx; - afr_fd_ctx_t *fd_ctx = NULL; - VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); VALIDATE_OR_GOTO (this->private, out); priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + if (afr_is_split_brain (this, fd->inode)) { + op_errno = EIO; goto out; } - frame->local = local; + QUORUM_CHECK(writev,out); + + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; + + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) + goto out; local->cont.writev.vector = iov_dup (vector, count); local->cont.writev.count = count; local->cont.writev.offset = offset; - local->cont.writev.ino = fd->inode->ino; + local->cont.writev.flags = flags; local->cont.writev.iobref = iobref_ref (iobref); local->fd = fd_ref (fd); - ret = fd_ctx_get (fd, this, &ctx); - if (ret < 0) { - goto out; - } + /* detect here, but set it in writev_wind_cbk *after* the unstable + write is performed + */ + local->stable_write = !!((fd->flags|flags)&(O_SYNC|O_DSYNC)); - fd_ctx = (afr_fd_ctx_t *)(long) ctx; + afr_open_fd_fix (fd, this); - if (fd_ctx->up_count < priv->up_count) { - local->openfd_flush_cbk = afr_do_writev; - afr_openfd_flush (frame, this, fd); - } else { - afr_do_writev (frame, this); - } + afr_do_writev (frame, this); - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (writev, frame, op_ret, op_errno, NULL, NULL); - } + if (ret < 0) + AFR_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL); return 0; } @@ -329,13 +604,11 @@ afr_truncate_unwind (call_frame_t *frame, xlator_t *this) UNLOCK (&frame->lock); if (main_frame) { - local->cont.truncate.prebuf.ia_ino = local->cont.truncate.ino; - local->cont.truncate.postbuf.ia_ino = local->cont.truncate.ino; - AFR_STACK_UNWIND (truncate, main_frame, local->op_ret, local->op_errno, - &local->cont.truncate.prebuf, - &local->cont.truncate.postbuf); + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, + NULL); } return 0; @@ -345,20 +618,16 @@ afr_truncate_unwind (call_frame_t *frame, xlator_t *this) int afr_truncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) + struct iatt *postbuf, dict_t *xdata) { afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int child_index = (long) cookie; int read_child = 0; int call_count = -1; - int need_unwind = 0; local = frame->local; - priv = this->private; - read_child = afr_read_child (this, local->loc.inode); + read_child = afr_inode_get_read_ctx (this, local->loc.inode, NULL); LOCK (&frame->lock); { @@ -366,38 +635,22 @@ afr_truncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->read_child_returned = _gf_true; } - if (afr_fop_failed (op_ret, op_errno) && op_errno != EFBIG) - afr_transaction_fop_failed (frame, this, child_index); - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.truncate.prebuf = *prebuf; - local->cont.truncate.postbuf = *postbuf; - } - - if (child_index == read_child) { - local->cont.truncate.prebuf = *prebuf; - local->cont.truncate.postbuf = *postbuf; - } - - local->success_count++; - - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } + if (prebuf->ia_size != postbuf->ia_size) + local->stable_write = _gf_false; } - local->op_errno = op_errno; + __inode_write_fop_cbk (frame, child_index, read_child, this, + &op_ret, &op_errno, prebuf, postbuf, + xdata); } UNLOCK (&frame->lock); - if (need_unwind) - local->transaction.unwind (frame, this); - call_count = afr_frame_return (frame); if (call_count == 0) { + if (local->stable_write && afr_txn_nothing_failed (frame, this)) + local->transaction.unwind (frame, this); + local->transaction.resume (frame, this); } @@ -410,14 +663,14 @@ afr_truncate_wind (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; int i = 0; local = frame->local; priv = this->private; - call_count = afr_up_children_count (priv->child_count, local->child_up); + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); if (call_count == 0) { local->transaction.resume (frame, this); @@ -425,15 +678,17 @@ afr_truncate_wind (call_frame_t *frame, xlator_t *this) } local->call_count = call_count; + local->stable_write = _gf_true; for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { + if (local->transaction.pre_op[i]) { STACK_WIND_COOKIE (frame, afr_truncate_wind_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->truncate, &local->loc, - local->cont.truncate.offset); + local->cont.truncate.offset, + NULL); if (!--call_count) break; @@ -461,15 +716,12 @@ afr_truncate_done (call_frame_t *frame, xlator_t *this) int afr_truncate (call_frame_t *frame, xlator_t *this, - loc_t *loc, off_t offset) + loc_t *loc, off_t offset, dict_t *xdata) { afr_private_t * priv = NULL; afr_local_t * local = NULL; call_frame_t *transaction_frame = NULL; - int ret = -1; - - int op_ret = -1; int op_errno = 0; VALIDATE_OR_GOTO (frame, out); @@ -478,27 +730,22 @@ afr_truncate (call_frame_t *frame, xlator_t *this, priv = this->private; + QUORUM_CHECK(truncate,out); + transaction_frame = copy_frame (frame); if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); + op_errno = ENOMEM; goto out; } - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); + local = transaction_frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } - - transaction_frame->local = local; - - local->op_ret = -1; local->cont.truncate.offset = offset; - local->cont.truncate.ino = loc->inode->ino; local->transaction.fop = afr_truncate_wind; local->transaction.done = afr_truncate_done; @@ -507,17 +754,21 @@ afr_truncate (call_frame_t *frame, xlator_t *this, loc_copy (&local->loc, loc); local->transaction.main_frame = frame; - local->transaction.start = 0; - local->transaction.len = offset; + local->transaction.start = offset; + local->transaction.len = 0; - afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { + if (ret < 0) { if (transaction_frame) AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (truncate, frame, op_ret, op_errno, NULL, NULL); + AFR_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL, NULL); } return 0; @@ -546,13 +797,11 @@ afr_ftruncate_unwind (call_frame_t *frame, xlator_t *this) UNLOCK (&frame->lock); if (main_frame) { - local->cont.ftruncate.prebuf.ia_ino = local->cont.ftruncate.ino; - local->cont.ftruncate.postbuf.ia_ino = local->cont.ftruncate.ino; - AFR_STACK_UNWIND (ftruncate, main_frame, local->op_ret, local->op_errno, - &local->cont.ftruncate.prebuf, - &local->cont.ftruncate.postbuf); + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, + NULL); } return 0; } @@ -561,20 +810,16 @@ afr_ftruncate_unwind (call_frame_t *frame, xlator_t *this) int afr_ftruncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) + struct iatt *postbuf, dict_t *xdata) { afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int child_index = (long) cookie; int call_count = -1; - int need_unwind = 0; int read_child = 0; local = frame->local; - priv = this->private; - read_child = afr_read_child (this, local->fd->inode); + read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); LOCK (&frame->lock); { @@ -582,38 +827,22 @@ afr_ftruncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->read_child_returned = _gf_true; } - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.ftruncate.prebuf = *prebuf; - local->cont.ftruncate.postbuf = *postbuf; - } - - if (child_index == read_child) { - local->cont.ftruncate.prebuf = *prebuf; - local->cont.ftruncate.postbuf = *postbuf; - } - - local->success_count++; - - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } + if (prebuf->ia_size != postbuf->ia_size) + local->stable_write = _gf_false; } - local->op_errno = op_errno; + __inode_write_fop_cbk (frame, child_index, read_child, this, + &op_ret, &op_errno, prebuf, postbuf, + xdata); } UNLOCK (&frame->lock); - if (need_unwind) - local->transaction.unwind (frame, this); - call_count = afr_frame_return (frame); if (call_count == 0) { + if (local->stable_write && afr_txn_nothing_failed (frame, this)) + local->transaction.unwind (frame, this); + local->transaction.resume (frame, this); } @@ -626,14 +855,14 @@ afr_ftruncate_wind (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; int i = 0; local = frame->local; priv = this->private; - call_count = afr_up_children_count (priv->child_count, local->child_up); + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); if (call_count == 0) { local->transaction.resume (frame, this); @@ -641,14 +870,17 @@ afr_ftruncate_wind (call_frame_t *frame, xlator_t *this) } local->call_count = call_count; + local->stable_write = _gf_true; for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { + if (local->transaction.pre_op[i]) { STACK_WIND_COOKIE (frame, afr_ftruncate_wind_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->ftruncate, - local->fd, local->cont.ftruncate.offset); + local->fd, + local->cont.ftruncate.offset, + NULL); if (!--call_count) break; @@ -679,7 +911,6 @@ afr_do_ftruncate (call_frame_t *frame, xlator_t *this) { call_frame_t * transaction_frame = NULL; afr_local_t * local = NULL; - int op_ret = -1; int op_errno = 0; @@ -687,8 +918,6 @@ afr_do_ftruncate (call_frame_t *frame, xlator_t *this) transaction_frame = copy_frame (frame); if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); goto out; } @@ -703,17 +932,22 @@ afr_do_ftruncate (call_frame_t *frame, xlator_t *this) local->transaction.main_frame = frame; - local->transaction.start = 0; - local->transaction.len = local->cont.ftruncate.offset; + local->transaction.start = local->cont.ftruncate.offset; + local->transaction.len = 0; - afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + op_ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + if (op_ret < 0) { + op_errno = -op_ret; + goto out; + } op_ret = 0; out: - if (op_ret == -1) { + if (op_ret < 0) { if (transaction_frame) AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (ftruncate, frame, op_ret, op_errno, NULL, NULL); + AFR_STACK_UNWIND (ftruncate, frame, op_ret, op_errno, NULL, + NULL, NULL); } return 0; @@ -722,61 +956,47 @@ out: int afr_ftruncate (call_frame_t *frame, xlator_t *this, - fd_t *fd, off_t offset) + fd_t *fd, off_t offset, dict_t *xdata) { afr_private_t * priv = NULL; afr_local_t * local = NULL; call_frame_t *transaction_frame = NULL; - int ret = -1; - - int op_ret = -1; int op_errno = 0; - uint64_t ctx; - afr_fd_ctx_t *fd_ctx = NULL; - VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); VALIDATE_OR_GOTO (this->private, out); priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + if (afr_is_split_brain (this, fd->inode)) { + op_errno = EIO; goto out; } + QUORUM_CHECK(ftruncate,out); + + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; - frame->local = local; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) + goto out; local->cont.ftruncate.offset = offset; - local->cont.ftruncate.ino = fd->inode->ino; local->fd = fd_ref (fd); - ret = fd_ctx_get (fd, this, &ctx); - if (ret < 0) { - goto out; - } + afr_open_fd_fix (fd, this); - fd_ctx = (afr_fd_ctx_t *)(long) ctx; + afr_do_ftruncate (frame, this); - if (fd_ctx->up_count < priv->up_count) { - local->openfd_flush_cbk = afr_do_ftruncate; - afr_openfd_flush (frame, this, fd); - } else { - afr_do_ftruncate (frame, this); - } - - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { + if (ret < 0) { if (transaction_frame) AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (ftruncate, frame, op_ret, op_errno, NULL, NULL); + AFR_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL, NULL); } return 0; @@ -803,13 +1023,11 @@ afr_setattr_unwind (call_frame_t *frame, xlator_t *this) UNLOCK (&frame->lock); if (main_frame) { - local->cont.setattr.preop_buf.ia_ino = local->cont.setattr.ino; - local->cont.setattr.postop_buf.ia_ino = local->cont.setattr.ino; - AFR_STACK_UNWIND (setattr, main_frame, local->op_ret, local->op_errno, - &local->cont.setattr.preop_buf, - &local->cont.setattr.postop_buf); + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, + NULL); } return 0; @@ -819,11 +1037,10 @@ afr_setattr_unwind (call_frame_t *frame, xlator_t *this) int afr_setattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop) + struct iatt *preop, struct iatt *postop, dict_t *xdata) { afr_local_t * local = NULL; afr_private_t * priv = NULL; - int child_index = (long) cookie; int read_child = 0; int call_count = -1; @@ -832,7 +1049,7 @@ afr_setattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local = frame->local; priv = this->private; - read_child = afr_read_child (this, local->loc.inode); + read_child = afr_inode_get_read_ctx (this, local->loc.inode, NULL); LOCK (&frame->lock); { @@ -840,29 +1057,14 @@ afr_setattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->read_child_returned = _gf_true; } - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.setattr.preop_buf = *preop; - local->cont.setattr.postop_buf = *postop; - } - - if (child_index == read_child) { - local->cont.setattr.preop_buf = *preop; - local->cont.setattr.postop_buf = *postop; - } - - local->success_count++; + __inode_write_fop_cbk (frame, child_index, read_child, this, + &op_ret, &op_errno, preop, postop, + xdata); - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } + if ((local->success_count >= priv->wait_count) + && local->read_child_returned) { + need_unwind = 1; } - local->op_errno = op_errno; } UNLOCK (&frame->lock); @@ -884,14 +1086,14 @@ afr_setattr_wind (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; int i = 0; local = frame->local; priv = this->private; - call_count = afr_up_children_count (priv->child_count, local->child_up); + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); if (call_count == 0) { local->transaction.resume (frame, this); @@ -901,14 +1103,15 @@ afr_setattr_wind (call_frame_t *frame, xlator_t *this) local->call_count = call_count; for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { + if (local->transaction.pre_op[i]) { STACK_WIND_COOKIE (frame, afr_setattr_wind_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->setattr, &local->loc, &local->cont.setattr.in_buf, - local->cont.setattr.valid); + local->cont.setattr.valid, + NULL); if (!--call_count) break; @@ -936,15 +1139,12 @@ afr_setattr_done (call_frame_t *frame, xlator_t *this) int afr_setattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, struct iatt *buf, int32_t valid) + loc_t *loc, struct iatt *buf, int32_t valid, dict_t *xdata) { afr_private_t * priv = NULL; afr_local_t * local = NULL; call_frame_t *transaction_frame = NULL; - int ret = -1; - - int op_ret = -1; int op_errno = 0; VALIDATE_OR_GOTO (frame, out); @@ -953,26 +1153,20 @@ afr_setattr (call_frame_t *frame, xlator_t *this, priv = this->private; + QUORUM_CHECK(setattr,out); + transaction_frame = copy_frame (frame); if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); + op_errno = ENOMEM; goto out; } - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); + local = transaction_frame->local; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } - - transaction_frame->local = local; - - local->op_ret = -1; - - local->cont.setattr.ino = loc->inode->ino; local->cont.setattr.in_buf = *buf; local->cont.setattr.valid = valid; @@ -987,14 +1181,18 @@ afr_setattr (call_frame_t *frame, xlator_t *this, local->transaction.start = LLONG_MAX - 1; local->transaction.len = 0; - afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { + if (ret < 0) { if (transaction_frame) AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (setattr, frame, op_ret, op_errno, NULL, NULL); + AFR_STACK_UNWIND (setattr, frame, -1, op_errno, NULL, NULL, NULL); } return 0; @@ -1019,15 +1217,11 @@ afr_fsetattr_unwind (call_frame_t *frame, xlator_t *this) UNLOCK (&frame->lock); if (main_frame) { - local->cont.fsetattr.preop_buf.ia_ino = - local->cont.fsetattr.ino; - local->cont.fsetattr.postop_buf.ia_ino = - local->cont.fsetattr.ino; - AFR_STACK_UNWIND (fsetattr, main_frame, local->op_ret, local->op_errno, - &local->cont.fsetattr.preop_buf, - &local->cont.fsetattr.postop_buf); + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, + NULL); } return 0; @@ -1036,12 +1230,11 @@ afr_fsetattr_unwind (call_frame_t *frame, xlator_t *this) int afr_fsetattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop) + int32_t op_ret, int32_t op_errno, + struct iatt *preop, struct iatt *postop, dict_t *xdata) { afr_local_t * local = NULL; afr_private_t * priv = NULL; - int child_index = (long) cookie; int read_child = 0; int call_count = -1; @@ -1050,7 +1243,7 @@ afr_fsetattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local = frame->local; priv = this->private; - read_child = afr_read_child (this, local->fd->inode); + read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); LOCK (&frame->lock); { @@ -1058,29 +1251,14 @@ afr_fsetattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->read_child_returned = _gf_true; } - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.fsetattr.preop_buf = *preop; - local->cont.fsetattr.postop_buf = *postop; - } - - if (child_index == read_child) { - local->cont.fsetattr.preop_buf = *preop; - local->cont.fsetattr.postop_buf = *postop; - } + __inode_write_fop_cbk (frame, child_index, read_child, this, + &op_ret, &op_errno, preop, postop, + xdata); - local->success_count++; - - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } + if ((local->success_count >= priv->wait_count) + && local->read_child_returned) { + need_unwind = 1; } - local->op_errno = op_errno; } UNLOCK (&frame->lock); @@ -1102,14 +1280,14 @@ afr_fsetattr_wind (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; int i = 0; local = frame->local; priv = this->private; - call_count = afr_up_children_count (priv->child_count, local->child_up); + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); if (call_count == 0) { local->transaction.resume (frame, this); @@ -1119,14 +1297,15 @@ afr_fsetattr_wind (call_frame_t *frame, xlator_t *this) local->call_count = call_count; for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { + if (local->transaction.pre_op[i]) { STACK_WIND_COOKIE (frame, afr_fsetattr_wind_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->fsetattr, local->fd, &local->cont.fsetattr.in_buf, - local->cont.fsetattr.valid); + local->cont.fsetattr.valid, + NULL); if (!--call_count) break; @@ -1151,18 +1330,14 @@ afr_fsetattr_done (call_frame_t *frame, xlator_t *this) return 0; } - int afr_fsetattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, struct iatt *buf, int32_t valid) + fd_t *fd, struct iatt *buf, int32_t valid, dict_t *xdata) { afr_private_t * priv = NULL; afr_local_t * local = NULL; call_frame_t *transaction_frame = NULL; - int ret = -1; - - int op_ret = -1; int op_errno = 0; VALIDATE_OR_GOTO (frame, out); @@ -1171,26 +1346,25 @@ afr_fsetattr (call_frame_t *frame, xlator_t *this, priv = this->private; - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); + if (afr_is_split_brain (this, fd->inode)) { + op_errno = EIO; goto out; } - ALLOC_OR_GOTO (local, afr_local_t, out); + QUORUM_CHECK(fsetattr,out); - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + op_errno = ENOMEM; goto out; } - transaction_frame->local = local; - - local->op_ret = -1; + AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); + local = transaction_frame->local; - local->cont.fsetattr.ino = fd->inode->ino; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) + goto out; local->cont.fsetattr.in_buf = *buf; local->cont.fsetattr.valid = valid; @@ -1201,18 +1375,24 @@ afr_fsetattr (call_frame_t *frame, xlator_t *this, local->fd = fd_ref (fd); + afr_open_fd_fix (fd, this); + local->transaction.main_frame = frame; local->transaction.start = LLONG_MAX - 1; local->transaction.len = 0; - afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { + if (ret < 0) { if (transaction_frame) AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (fsetattr, frame, op_ret, op_errno, NULL, NULL); + AFR_STACK_UNWIND (fsetattr, frame, -1, op_errno, NULL, NULL, NULL); } return 0; @@ -1240,7 +1420,8 @@ afr_setxattr_unwind (call_frame_t *frame, xlator_t *this) if (main_frame) { AFR_STACK_UNWIND (setxattr, main_frame, - local->op_ret, local->op_errno) + local->op_ret, local->op_errno, + NULL); } return 0; } @@ -1248,31 +1429,25 @@ afr_setxattr_unwind (call_frame_t *frame, xlator_t *this) int afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - int call_count = -1; - int need_unwind = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int need_unwind = 0; + int child_index = (long) cookie; local = frame->local; priv = this->private; LOCK (&frame->lock); { - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - } - local->success_count++; - - if (local->success_count == priv->child_count) { - need_unwind = 1; - } + __inode_write_fop_cbk (frame, child_index, -1, this, + &op_ret, &op_errno, NULL, NULL, + xdata); + if (local->success_count == priv->child_count) { + need_unwind = 1; } - - local->op_errno = op_errno; } UNLOCK (&frame->lock); @@ -1292,16 +1467,16 @@ afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int afr_setxattr_wind (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - int call_count = -1; - int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int i = 0; local = frame->local; priv = this->private; - call_count = afr_up_children_count (priv->child_count, local->child_up); + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); if (call_count == 0) { local->transaction.resume (frame, this); @@ -1311,14 +1486,15 @@ afr_setxattr_wind (call_frame_t *frame, xlator_t *this) local->call_count = call_count; for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { + if (local->transaction.pre_op[i]) { STACK_WIND_COOKIE (frame, afr_setxattr_wind_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->setxattr, &local->loc, local->cont.setxattr.dict, - local->cont.setxattr.flags); + local->cont.setxattr.flags, + NULL); if (!--call_count) break; @@ -1332,7 +1508,7 @@ afr_setxattr_wind (call_frame_t *frame, xlator_t *this) int afr_setxattr_done (call_frame_t *frame, xlator_t *this) { - afr_local_t * local = frame->local; + afr_local_t *local = frame->local; local->transaction.unwind (frame, this); @@ -1343,41 +1519,40 @@ afr_setxattr_done (call_frame_t *frame, xlator_t *this) int afr_setxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *dict, int32_t flags) + loc_t *loc, dict_t *dict, int32_t flags, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = EINVAL; - int ret = -1; + VALIDATE_OR_GOTO (this, out); - int op_ret = -1; - int op_errno = 0; + GF_IF_INTERNAL_XATTR_GOTO ("trusted.afr.*", dict, + op_errno, out); + + GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.afr.*", dict, + op_errno, out); VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); VALIDATE_OR_GOTO (this->private, out); priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } - + QUORUM_CHECK(setxattr,out); transaction_frame = copy_frame (frame); if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); + op_errno = ENOMEM; goto out; } - transaction_frame->local = local; + AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); + local = transaction_frame->local; - local->op_ret = -1; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) + goto out; local->cont.setxattr.dict = dict_ref (dict); local->cont.setxattr.flags = flags; @@ -1392,14 +1567,211 @@ afr_setxattr (call_frame_t *frame, xlator_t *this, local->transaction.start = LLONG_MAX - 1; local->transaction.len = 0; - afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - op_ret = 0; + ret = 0; +out: + if (ret < 0) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); + } + + return 0; +} + +/* {{{ fsetxattr */ + + +int +afr_fsetxattr_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + AFR_STACK_UNWIND (fsetxattr, main_frame, + local->op_ret, local->op_errno, + NULL); + } + return 0; +} + + +int +afr_fsetxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int need_unwind = 0; + int child_index = (long) cookie; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + + __inode_write_fop_cbk (frame, child_index, -1, this, + &op_ret, &op_errno, NULL, NULL, + xdata); + if (local->success_count == priv->child_count) { + need_unwind = 1; + } + } + UNLOCK (&frame->lock); + + if (need_unwind) + local->transaction.unwind (frame, this); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_fsetxattr_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i]) { + STACK_WIND_COOKIE (frame, afr_fsetxattr_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->fsetxattr, + local->fd, + local->cont.fsetxattr.dict, + local->cont.fsetxattr.flags, + NULL); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_fsetxattr_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + +int +afr_fsetxattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, dict_t *dict, int32_t flags, dict_t *xdata) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = EINVAL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + GF_IF_INTERNAL_XATTR_GOTO ("trusted.afr.*", dict, + op_errno, out); + + GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.afr.*", dict, + op_errno, out); + + priv = this->private; + + if (afr_is_split_brain (this, fd->inode)) { + op_errno = EIO; + goto out; + } + + QUORUM_CHECK(fsetxattr,out); + + AFR_LOCAL_ALLOC_OR_GOTO (local, out); + + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) + goto out; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + goto out; + } + + transaction_frame->local = local; + + local->op_ret = -1; + + local->cont.fsetxattr.dict = dict_ref (dict); + local->cont.fsetxattr.flags = flags; + + local->transaction.fop = afr_fsetxattr_wind; + local->transaction.done = afr_fsetxattr_done; + local->transaction.unwind = afr_fsetxattr_unwind; + + local->fd = fd_ref (fd); + + local->transaction.main_frame = frame; + local->transaction.start = LLONG_MAX - 1; + local->transaction.len = 0; + + ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + ret = 0; out: - if (op_ret == -1) { + if (ret < 0) { if (transaction_frame) AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (setxattr, frame, op_ret, op_errno); + AFR_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL); } return 0; @@ -1407,6 +1779,7 @@ out: /* }}} */ + /* {{{ removexattr */ @@ -1428,7 +1801,8 @@ afr_removexattr_unwind (call_frame_t *frame, xlator_t *this) if (main_frame) { AFR_STACK_UNWIND (removexattr, main_frame, - local->op_ret, local->op_errno) + local->op_ret, local->op_errno, + NULL); } return 0; } @@ -1436,31 +1810,25 @@ afr_removexattr_unwind (call_frame_t *frame, xlator_t *this) int afr_removexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - int call_count = -1; - int need_unwind = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int need_unwind = 0; + int child_index = (long) cookie; local = frame->local; priv = this->private; LOCK (&frame->lock); { - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - } - local->success_count++; - - if (local->success_count == priv->wait_count) { - need_unwind = 1; - } + __inode_write_fop_cbk (frame, child_index, -1, this, + &op_ret, &op_errno, NULL, NULL, + xdata); + if (local->success_count == priv->wait_count) { + need_unwind = 1; } - - local->op_errno = op_errno; } UNLOCK (&frame->lock); @@ -1482,14 +1850,14 @@ afr_removexattr_wind (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; int i = 0; local = frame->local; priv = this->private; - call_count = afr_up_children_count (priv->child_count, local->child_up); + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); if (call_count == 0) { local->transaction.resume (frame, this); @@ -1499,13 +1867,14 @@ afr_removexattr_wind (call_frame_t *frame, xlator_t *this) local->call_count = call_count; for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { + if (local->transaction.pre_op[i]) { STACK_WIND_COOKIE (frame, afr_removexattr_wind_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->removexattr, &local->loc, - local->cont.removexattr.name); + local->cont.removexattr.name, + NULL); if (!--call_count) break; @@ -1531,34 +1900,227 @@ afr_removexattr_done (call_frame_t *frame, xlator_t *this) int afr_removexattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name) + loc_t *loc, const char *name, dict_t *xdata) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (this, out); + + GF_IF_NATIVE_XATTR_GOTO ("trusted.afr.*", + name, op_errno, out); + + GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.afr.*", + name, op_errno, out); + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this->private, out); + VALIDATE_OR_GOTO (loc, out); + + priv = this->private; + + QUORUM_CHECK(removexattr,out); + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + op_errno = ENOMEM; + goto out; + } + + AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); + local = transaction_frame->local; + + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) + goto out; + + local->cont.removexattr.name = gf_strdup (name); + + local->transaction.fop = afr_removexattr_wind; + local->transaction.done = afr_removexattr_done; + local->transaction.unwind = afr_removexattr_unwind; + + loc_copy (&local->loc, loc); + + local->transaction.main_frame = frame; + local->transaction.start = LLONG_MAX - 1; + local->transaction.len = 0; + + ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + ret = 0; +out: + if (ret < 0) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (removexattr, frame, -1, op_errno, NULL); + } + + return 0; +} + +/* ffremovexattr */ +int +afr_fremovexattr_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + AFR_STACK_UNWIND (fremovexattr, main_frame, + local->op_ret, local->op_errno, + NULL); + } + return 0; +} + + +int +afr_fremovexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + int call_count = -1; + int need_unwind = 0; + int child_index = (long) cookie; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + __inode_write_fop_cbk (frame, child_index, -1, this, + &op_ret, &op_errno, NULL, NULL, + xdata); + + if (local->success_count == priv->wait_count) { + need_unwind = 1; + } + } + UNLOCK (&frame->lock); + + if (need_unwind) + local->transaction.unwind (frame, this); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + + +int32_t +afr_fremovexattr_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i]) { + STACK_WIND_COOKIE (frame, afr_fremovexattr_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->fremovexattr, + local->fd, + local->cont.removexattr.name, + NULL); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_fremovexattr_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_fremovexattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *name, dict_t *xdata) { afr_private_t * priv = NULL; afr_local_t * local = NULL; call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_ret = -1; int op_errno = 0; - VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); + + GF_IF_NATIVE_XATTR_GOTO ("trusted.afr.*", + name, op_errno, out); + + GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.afr.*", + name, op_errno, out); + + VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (loc, out); priv = this->private; + if (afr_is_split_brain (this, fd->inode)) { + op_errno = EIO; + goto out; + } + + QUORUM_CHECK(fremovexattr, out); transaction_frame = copy_frame (frame); if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); goto out; } - ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (local, out); - ret = AFR_LOCAL_INIT (local, priv); + ret = afr_local_init (local, priv, &op_errno); if (ret < 0) { op_errno = -ret; goto out; @@ -1570,25 +2132,730 @@ afr_removexattr (call_frame_t *frame, xlator_t *this, local->cont.removexattr.name = gf_strdup (name); - local->transaction.fop = afr_removexattr_wind; - local->transaction.done = afr_removexattr_done; - local->transaction.unwind = afr_removexattr_unwind; + local->transaction.fop = afr_fremovexattr_wind; + local->transaction.done = afr_fremovexattr_done; + local->transaction.unwind = afr_fremovexattr_unwind; - loc_copy (&local->loc, loc); + local->fd = fd_ref (fd); local->transaction.main_frame = frame; local->transaction.start = LLONG_MAX - 1; local->transaction.len = 0; - afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + op_ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + if (op_ret < 0) { + op_errno = -op_ret; + goto out; + } + + op_ret = 0; +out: + if (op_ret < 0) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (fremovexattr, frame, op_ret, op_errno, NULL); + } + + return 0; +} + +static int +afr_fallocate_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + AFR_STACK_UNWIND (fallocate, main_frame, local->op_ret, + local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, + NULL); + } + return 0; +} + +static int +afr_fallocate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + int child_index = (long) cookie; + int call_count = -1; + int need_unwind = 0; + int read_child = 0; + + local = frame->local; + priv = this->private; + + read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); + + LOCK (&frame->lock); + { + if (child_index == read_child) { + local->read_child_returned = _gf_true; + } + + __inode_write_fop_cbk (frame, child_index, read_child, this, + &op_ret, &op_errno, prebuf, postbuf, + xdata); + + if ((local->success_count >= priv->wait_count) + && local->read_child_returned) { + need_unwind = 1; + } + } + UNLOCK (&frame->lock); + + if (need_unwind) + local->transaction.unwind (frame, this); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + +static int +afr_fallocate_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i]) { + STACK_WIND_COOKIE (frame, afr_fallocate_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->fallocate, + local->fd, + local->cont.fallocate.mode, + local->cont.fallocate.offset, + local->cont.fallocate.len, + NULL); + + if (!--call_count) + break; + } + } + + return 0; +} + +static int +afr_fallocate_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + +static int +afr_do_fallocate (call_frame_t *frame, xlator_t *this) +{ + call_frame_t * transaction_frame = NULL; + afr_local_t * local = NULL; + int op_ret = -1; + int op_errno = 0; + + local = frame->local; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + goto out; + } + + transaction_frame->local = local; + frame->local = NULL; + + local->op = GF_FOP_FALLOCATE; + + local->transaction.fop = afr_fallocate_wind; + local->transaction.done = afr_fallocate_done; + local->transaction.unwind = afr_fallocate_unwind; + + local->transaction.main_frame = frame; + + local->transaction.start = local->cont.fallocate.offset; + local->transaction.len = 0; + + /* fallocate can modify the file size */ + op_ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + if (op_ret < 0) { + op_errno = -op_ret; + goto out; + } + + op_ret = 0; +out: + if (op_ret < 0) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (fallocate, frame, op_ret, op_errno, NULL, + NULL, NULL); + } + + return 0; +} + +int +afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + if (afr_is_split_brain (this, fd->inode)) { + op_errno = EIO; + goto out; + } + QUORUM_CHECK(fallocate,out); + + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; + + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) + goto out; + + local->cont.fallocate.mode = mode; + local->cont.fallocate.offset = offset; + local->cont.fallocate.len = len; + + local->fd = fd_ref (fd); + + afr_open_fd_fix (fd, this); + + afr_do_fallocate (frame, this); + + ret = 0; +out: + if (ret < 0) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL); + } + + return 0; +} + +/* }}} */ + +/* {{{ discard */ + +static int +afr_discard_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + AFR_STACK_UNWIND (discard, main_frame, local->op_ret, + local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, + NULL); + } + return 0; +} + +static int +afr_discard_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + int child_index = (long) cookie; + int call_count = -1; + int need_unwind = 0; + int read_child = 0; + + local = frame->local; + priv = this->private; + + read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); + + LOCK (&frame->lock); + { + if (child_index == read_child) { + local->read_child_returned = _gf_true; + } + + __inode_write_fop_cbk (frame, child_index, read_child, this, + &op_ret, &op_errno, prebuf, postbuf, + xdata); + + if ((local->success_count >= priv->wait_count) + && local->read_child_returned) { + need_unwind = 1; + } + } + UNLOCK (&frame->lock); + + if (need_unwind) + local->transaction.unwind (frame, this); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + +static int +afr_discard_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i]) { + STACK_WIND_COOKIE (frame, afr_discard_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->discard, + local->fd, + local->cont.discard.offset, + local->cont.discard.len, + NULL); + + if (!--call_count) + break; + } + } + + return 0; +} + +static int +afr_discard_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + +static int +afr_do_discard (call_frame_t *frame, xlator_t *this) +{ + call_frame_t * transaction_frame = NULL; + afr_local_t * local = NULL; + int op_ret = -1; + int op_errno = 0; + + local = frame->local; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + goto out; + } + + transaction_frame->local = local; + frame->local = NULL; + + local->op = GF_FOP_DISCARD; + + local->transaction.fop = afr_discard_wind; + local->transaction.done = afr_discard_done; + local->transaction.unwind = afr_discard_unwind; + + local->transaction.main_frame = frame; + + local->transaction.start = local->cont.discard.offset; + local->transaction.len = 0; + + op_ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + if (op_ret < 0) { + op_errno = -op_ret; + goto out; + } op_ret = 0; out: - if (op_ret == -1) { + if (op_ret < 0) { if (transaction_frame) AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (removexattr, frame, op_ret, op_errno); + AFR_STACK_UNWIND (discard, frame, op_ret, op_errno, NULL, + NULL, NULL); + } + + return 0; +} + +int +afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + if (afr_is_split_brain (this, fd->inode)) { + op_errno = EIO; + goto out; + } + QUORUM_CHECK(discard, out); + + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; + + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) + goto out; + + local->cont.discard.offset = offset; + local->cont.discard.len = len; + + local->fd = fd_ref (fd); + + afr_open_fd_fix (fd, this); + + afr_do_discard(frame, this); + + ret = 0; +out: + if (ret < 0) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL); + } + + return 0; +} + + +/* {{{ zerofill */ + +static int +afr_zerofill_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) { + main_frame = local->transaction.main_frame; + } + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + AFR_STACK_UNWIND (zerofill, main_frame, local->op_ret, + local->op_errno, + &local->cont.zerofill.prebuf, + &local->cont.zerofill.postbuf, + NULL); + } + return 0; +} + +static int +afr_zerofill_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int child_index = (long) cookie; + int call_count = -1; + int need_unwind = 0; + int read_child = 0; + + local = frame->local; + priv = this->private; + + read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); + + LOCK (&frame->lock); + { + if (child_index == read_child) { + local->read_child_returned = _gf_true; + } + + if (afr_fop_failed (op_ret, op_errno)) { + afr_transaction_fop_failed (frame, this, child_index); + } + + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + local->cont.zerofill.prebuf = *prebuf; + local->cont.zerofill.postbuf = *postbuf; + } + + if (child_index == read_child) { + local->cont.zerofill.prebuf = *prebuf; + local->cont.zerofill.postbuf = *postbuf; + } + + local->success_count++; + + if ((local->success_count >= priv->wait_count) + && local->read_child_returned) { + need_unwind = 1; + } + } + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + if (need_unwind) { + local->transaction.unwind (frame, this); + } + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + +static int +afr_zerofill_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i]) { + STACK_WIND_COOKIE (frame, afr_zerofill_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->zerofill, + local->fd, + local->cont.zerofill.offset, + local->cont.zerofill.len, + NULL); + + if (!--call_count) + break; + } + } + + return 0; +} + +static int +afr_zerofill_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + +static int +afr_do_zerofill(call_frame_t *frame, xlator_t *this) +{ + call_frame_t *transaction_frame = NULL; + afr_local_t *local = NULL; + int op_ret = -1; + int op_errno = 0; + + local = frame->local; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + goto out; + } + + transaction_frame->local = local; + frame->local = NULL; + + local->op = GF_FOP_ZEROFILL; + + local->transaction.fop = afr_zerofill_wind; + local->transaction.done = afr_zerofill_done; + local->transaction.unwind = afr_zerofill_unwind; + + local->transaction.main_frame = frame; + + local->transaction.start = local->cont.zerofill.offset; + local->transaction.len = 0; + + op_ret = afr_transaction (transaction_frame, this, + AFR_DATA_TRANSACTION); + if (op_ret < 0) { + op_errno = -op_ret; + goto out; + } + + op_ret = 0; +out: + if (op_ret < 0) { + if (transaction_frame) { + AFR_STACK_DESTROY (transaction_frame); + } + AFR_STACK_UNWIND (zerofill, frame, op_ret, op_errno, NULL, + NULL, NULL); } return 0; } + +int +afr_zerofill (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + if (afr_is_split_brain (this, fd->inode)) { + op_errno = EIO; + goto out; + } + QUORUM_CHECK(zerofill, out); + + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; + + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) { + goto out; + } + local->cont.zerofill.offset = offset; + local->cont.zerofill.len = len; + + local->fd = fd_ref (fd); + + afr_open_fd_fix (fd, this); + + afr_do_zerofill(frame, this); + + ret = 0; +out: + if (ret < 0) { + if (transaction_frame) { + AFR_STACK_DESTROY (transaction_frame); + } + AFR_STACK_UNWIND (zerofill, frame, -1, op_errno, NULL, + NULL, NULL); + } + + return 0; +} + +/* }}} */ + + diff --git a/xlators/cluster/afr/src/afr-inode-write.h b/xlators/cluster/afr/src/afr-inode-write.h index 1c6b87a97..8e93ca44a 100644 --- a/xlators/cluster/afr/src/afr-inode-write.h +++ b/xlators/cluster/afr/src/afr-inode-write.h @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2009 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef __INODE_WRITE_H__ @@ -22,51 +13,70 @@ int32_t afr_chmod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode); + loc_t *loc, mode_t mode, dict_t *xdata); int32_t afr_chown (call_frame_t *frame, xlator_t *this, - loc_t *loc, uid_t uid, gid_t gid); + loc_t *loc, uid_t uid, gid_t gid, dict_t *xdata); int afr_fchown (call_frame_t *frame, xlator_t *this, - fd_t *fd, uid_t uid, gid_t gid); + fd_t *fd, uid_t uid, gid_t gid, dict_t *xdata); int32_t afr_fchmod (call_frame_t *frame, xlator_t *this, - fd_t *fd, mode_t mode); + fd_t *fd, mode_t mode, dict_t *xdata); int32_t -afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, +afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, int32_t count, off_t offset, - struct iobref *iobref); + uint32_t flags, struct iobref *iobref, dict_t *xdata); int32_t afr_truncate (call_frame_t *frame, xlator_t *this, - loc_t *loc, off_t offset); + loc_t *loc, off_t offset, dict_t *xdata); int32_t afr_ftruncate (call_frame_t *frame, xlator_t *this, - fd_t *fd, off_t offset); + fd_t *fd, off_t offset, dict_t *xdata); int32_t afr_utimens (call_frame_t *frame, xlator_t *this, - loc_t *loc, struct timespec tv[2]); + loc_t *loc, struct timespec tv[2], dict_t *xdata); int afr_setattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, struct iatt *buf, int32_t valid); + loc_t *loc, struct iatt *buf, int32_t valid, dict_t *xdata); int afr_fsetattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, struct iatt *buf, int32_t valid); + fd_t *fd, struct iatt *buf, int32_t valid, dict_t *xdata); int32_t afr_setxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *dict, int32_t flags); + loc_t *loc, dict_t *dict, int32_t flags, dict_t *xdata); + +int32_t +afr_fsetxattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, dict_t *dict, int32_t flags, dict_t *xdata); int32_t afr_removexattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name); + loc_t *loc, const char *name, dict_t *xdata); + +int32_t +afr_fremovexattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *name, dict_t *xdata); +int +afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata); + +int +afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata); + +int +afr_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata); #endif /* __INODE_WRITE_H__ */ diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c index 69160d72f..060d78f35 100644 --- a/xlators/cluster/afr/src/afr-lk-common.c +++ b/xlators/cluster/afr/src/afr-lk-common.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2009 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #include "dict.h" @@ -31,8 +22,69 @@ #define LOCKED_YES 0x1 /* for DATA, METADATA, ENTRY and higher_path */ #define LOCKED_LOWER 0x2 /* for lower path */ +#define AFR_TRACE_INODELK_IN(frame, this, params ...) \ + do { \ + afr_private_t *_priv = this->private; \ + if (!_priv->inodelk_trace) \ + break; \ + afr_trace_inodelk_in (frame, this, params); \ + } while (0); + +#define AFR_TRACE_INODELK_OUT(frame, this, params ...) \ + do { \ + afr_private_t *_priv = this->private; \ + if (!_priv->inodelk_trace) \ + break; \ + afr_trace_inodelk_out (frame, this, params); \ + } while (0); + +#define AFR_TRACE_ENTRYLK_IN(frame, this, params ...) \ + do { \ + afr_private_t *_priv = this->private; \ + if (!_priv->entrylk_trace) \ + break; \ + afr_trace_entrylk_in (frame, this, params); \ + } while (0); + +#define AFR_TRACE_ENTRYLK_OUT(frame, this, params ...) \ + do { \ + afr_private_t *_priv = this->private; \ + if (!_priv->entrylk_trace) \ + break; \ + afr_trace_entrylk_out (frame, this, params); \ + } while (0); + int -afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index); +afr_entry_lockee_cmp (const void *l1, const void *l2) +{ + const afr_entry_lockee_t *r1 = l1; + const afr_entry_lockee_t *r2 = l2; + int ret = 0; + uuid_t gfid1 = {0}; + uuid_t gfid2 = {0}; + + loc_gfid ((loc_t*)&r1->loc, gfid1); + loc_gfid ((loc_t*)&r2->loc, gfid2); + ret = uuid_compare (gfid1, gfid2); + /*Entrylks with NULL basename are the 'smallest'*/ + if (ret == 0) { + if (!r1->basename) + return -1; + if (!r2->basename) + return 1; + ret = strcmp (r1->basename, r2->basename); + } + + if (ret <= 0) + return -1; + else + return 1; +} + +int afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index); + +static int +afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this); static uint64_t afr_lock_number = 1; @@ -57,14 +109,13 @@ afr_set_lock_number (call_frame_t *frame, xlator_t *this) } void -afr_set_lk_owner (call_frame_t *frame, xlator_t *this) +afr_set_lk_owner (call_frame_t *frame, xlator_t *this, void *lk_owner) { - if (!frame->root->lk_owner) { - gf_log (this->name, GF_LOG_TRACE, - "Setting lk-owner=%llu", - (unsigned long long) (unsigned long)frame->root); - frame->root->lk_owner = (uint64_t) (unsigned long)frame->root; - } + gf_log (this->name, GF_LOG_TRACE, + "Setting lk-owner=%llu", + (unsigned long long) (unsigned long)lk_owner); + + set_lk_owner_from_ptr (&frame->root->lk_owner, lk_owner); } static int @@ -90,29 +141,19 @@ is_afr_lock_selfheal (afr_local_t *local) } int32_t -internal_lock_count (call_frame_t *frame, xlator_t *this, - afr_fd_ctx_t *fd_ctx) +internal_lock_count (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int32_t call_count = 0; int i = 0; local = frame->local; priv = this->private; - if (fd_ctx) { - GF_ASSERT (local->fd); - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] && fd_ctx->opened_on[i]) - ++call_count; - } - } else { - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) - ++call_count; - } + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) + ++call_count; } return call_count; @@ -120,7 +161,7 @@ internal_lock_count (call_frame_t *frame, xlator_t *this, static void afr_print_inodelk (char *str, int size, int cmd, - struct gf_flock *flock, uint64_t owner) + struct gf_flock *flock, gf_lkowner_t *owner) { char *cmd_str = NULL; char *type_str = NULL; @@ -168,11 +209,11 @@ afr_print_inodelk (char *str, int size, int cmd, } snprintf (str, size, "lock=INODELK, cmd=%s, type=%s, " - "start=%llu, len=%llu, pid=%llu, lk-owner=%llu", + "start=%llu, len=%llu, pid=%llu, lk-owner=%s", cmd_str, type_str, (unsigned long long) flock->l_start, (unsigned long long) flock->l_len, (unsigned long long) flock->l_pid, - (unsigned long long) owner); + lkowner_utoa (owner)); } @@ -188,11 +229,11 @@ afr_print_lockee (char *str, int size, loc_t *loc, fd_t *fd, void afr_print_entrylk (char *str, int size, const char *basename, - uint64_t owner) + gf_lkowner_t *owner) { - snprintf (str, size, "Basename=%s, lk-owner=%llu", + snprintf (str, size, "Basename=%s, lk-owner=%s", basename ? basename : "<nul>", - (unsigned long long)owner); + lkowner_utoa (owner)); } static void @@ -246,27 +287,20 @@ afr_set_lock_call_type (afr_lock_call_type_t lock_call_type, } static void -afr_trace_inodelk_out (call_frame_t *frame, afr_lock_call_type_t lock_call_type, +afr_trace_inodelk_out (call_frame_t *frame, xlator_t *this, + afr_lock_call_type_t lock_call_type, afr_lock_op_type_t lk_op_type, struct gf_flock *flock, int op_ret, int op_errno, int32_t child_index) { - xlator_t *this = NULL; afr_internal_lock_t *int_lock = NULL; afr_local_t *local = NULL; - afr_private_t *priv = NULL; char lockee[256]; char lock_call_type_str[256]; char verdict[16]; - this = THIS; local = frame->local; int_lock = &local->internal_lock; - priv = this->private; - - if (!priv->inodelk_trace) { - return; - } afr_print_lockee (lockee, 256, &local->loc, local->fd, child_index); @@ -274,45 +308,37 @@ afr_trace_inodelk_out (call_frame_t *frame, afr_lock_call_type_t lock_call_type, afr_print_verdict (op_ret, op_errno, verdict); - gf_log (this->name, GF_LOG_NORMAL, - "[%s %s] [%s] Lockee={%s} Number={%llu}", + gf_log (this->name, GF_LOG_INFO, + "[%s %s] [%s] lk-owner=%s Lockee={%s} Number={%llu}", lock_call_type_str, lk_op_type == AFR_LOCK_OP ? "LOCK REPLY" : "UNLOCK REPLY", - verdict, - lockee, + verdict, lkowner_utoa (&frame->root->lk_owner), lockee, (unsigned long long) int_lock->lock_number); } static void -afr_trace_inodelk_in (call_frame_t *frame, afr_lock_call_type_t lock_call_type, +afr_trace_inodelk_in (call_frame_t *frame, xlator_t *this, + afr_lock_call_type_t lock_call_type, afr_lock_op_type_t lk_op_type, struct gf_flock *flock, int32_t cmd, int32_t child_index) { - xlator_t *this = NULL; afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; - afr_private_t *priv = NULL; char lock[256]; char lockee[256]; char lock_call_type_str[256]; - this = THIS; local = frame->local; int_lock = &local->internal_lock; - priv = this->private; - - if (!priv->inodelk_trace) { - return; - } - afr_print_inodelk (lock, 256, cmd, flock, frame->root->lk_owner); + afr_print_inodelk (lock, 256, cmd, flock, &frame->root->lk_owner); afr_print_lockee (lockee, 256, &local->loc, local->fd, child_index); afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock); - gf_log (this->name, GF_LOG_NORMAL, + gf_log (this->name, GF_LOG_INFO, "[%s %s] Lock={%s} Lockee={%s} Number={%llu}", lock_call_type_str, lk_op_type == AFR_LOCK_OP ? "LOCK REQUEST" : "UNLOCK REQUEST", @@ -322,20 +348,21 @@ afr_trace_inodelk_in (call_frame_t *frame, afr_lock_call_type_t lock_call_type, } static void -afr_trace_entrylk_in (call_frame_t *frame, afr_lock_call_type_t lock_call_type, +afr_trace_entrylk_in (call_frame_t *frame, xlator_t *this, + afr_lock_call_type_t lock_call_type, afr_lock_op_type_t lk_op_type, const char *basename, - int32_t child_index) + int32_t cookie) { - xlator_t *this = NULL; afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; afr_private_t *priv = NULL; + int child_index = 0; + int lockee_no = 0; char lock[256]; char lockee[256]; char lock_call_type_str[256]; - this = THIS; local = frame->local; int_lock = &local->internal_lock; priv = this->private; @@ -343,36 +370,41 @@ afr_trace_entrylk_in (call_frame_t *frame, afr_lock_call_type_t lock_call_type, if (!priv->entrylk_trace) { return; } + lockee_no = cookie / priv->child_count; + child_index = cookie % priv->child_count; - afr_print_entrylk (lock, 256, basename, frame->root->lk_owner); - afr_print_lockee (lockee, 256, &local->loc, local->fd, child_index); + afr_print_entrylk (lock, 256, basename, &frame->root->lk_owner); + afr_print_lockee (lockee, 256, &int_lock->lockee[lockee_no].loc, local->fd, + child_index); afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock); - gf_log (this->name, GF_LOG_NORMAL, - "[%s %s] Lock={%s} Lockee={%s} Number={%llu}", + gf_log (this->name, GF_LOG_INFO, + "[%s %s] Lock={%s} Lockee={%s} Number={%llu}, Cookie={%d}", lock_call_type_str, lk_op_type == AFR_LOCK_OP ? "LOCK REQUEST" : "UNLOCK REQUEST", lock, lockee, - (unsigned long long) int_lock->lock_number); + (unsigned long long) int_lock->lock_number, + cookie); } static void -afr_trace_entrylk_out (call_frame_t *frame, afr_lock_call_type_t lock_call_type, - afr_lock_op_type_t lk_op_type, const char *basename, int op_ret, - int op_errno, int32_t child_index) +afr_trace_entrylk_out (call_frame_t *frame, xlator_t *this, + afr_lock_call_type_t lock_call_type, + afr_lock_op_type_t lk_op_type, const char *basename, + int op_ret, int op_errno, int32_t cookie) { - xlator_t *this = NULL; afr_internal_lock_t *int_lock = NULL; afr_local_t *local = NULL; afr_private_t *priv = NULL; + int lockee_no = 0; + int child_index = 0; char lock[256]; char lockee[256]; char lock_call_type_str[256]; char verdict[16]; - this = THIS; local = frame->local; int_lock = &local->internal_lock; priv = this->private; @@ -380,20 +412,25 @@ afr_trace_entrylk_out (call_frame_t *frame, afr_lock_call_type_t lock_call_type, if (!priv->entrylk_trace) { return; } + lockee_no = cookie / priv->child_count; + child_index = cookie % priv->child_count; - afr_print_lockee (lockee, 256, &local->loc, local->fd, child_index); + afr_print_entrylk (lock, 256, basename, &frame->root->lk_owner); + afr_print_lockee (lockee, 256, &int_lock->lockee[lockee_no].loc, local->fd, + child_index); afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock); afr_print_verdict (op_ret, op_errno, verdict); - gf_log (this->name, GF_LOG_NORMAL, - "[%s %s] [%s] Lock={%s} Lockee={%s} Number={%llu}", + gf_log (this->name, GF_LOG_INFO, + "[%s %s] [%s] Lock={%s} Lockee={%s} Number={%llu} Cookie={%d}", lock_call_type_str, lk_op_type == AFR_LOCK_OP ? "LOCK REPLY" : "UNLOCK REPLY", verdict, lock, lockee, - (unsigned long long) int_lock->lock_number); + (unsigned long long) int_lock->lock_number, + cookie); } @@ -431,8 +468,8 @@ is_afr_lock_transaction (afr_local_t *local) int ret = 0; switch (local->transaction.type) { - case AFR_DATA_TRANSACTION: - case AFR_METADATA_TRANSACTION: + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: ret = 1; break; @@ -446,6 +483,47 @@ is_afr_lock_transaction (afr_local_t *local) return ret; } +int +afr_init_entry_lockee (afr_entry_lockee_t *lockee, afr_local_t *local, + loc_t *loc, char *basename, int child_count) +{ + int ret = -1; + + loc_copy (&lockee->loc, loc); + lockee->basename = (basename)? gf_strdup (basename): NULL; + if (basename && !lockee->basename) + goto out; + + lockee->locked_count = 0; + lockee->locked_nodes = GF_CALLOC (child_count, + sizeof (*lockee->locked_nodes), + gf_afr_mt_afr_node_character); + + if (!lockee->locked_nodes) + goto out; + + ret = 0; +out: + return ret; + +} + +void +afr_entry_lockee_cleanup (afr_internal_lock_t *int_lock) +{ + int i = 0; + + for (i = 0; i < int_lock->lockee_count; i++) { + loc_wipe (&int_lock->lockee[i].loc); + if (int_lock->lockee[i].basename) + GF_FREE (int_lock->lockee[i].basename); + if (int_lock->lockee[i].locked_nodes) + GF_FREE (int_lock->lockee[i].locked_nodes); + } + + return; +} + static int initialize_entrylk_variables (call_frame_t *frame, xlator_t *this) { @@ -463,8 +541,13 @@ initialize_entrylk_variables (call_frame_t *frame, xlator_t *this) int_lock->lock_op_ret = -1; int_lock->lock_op_errno = 0; - for (i = 0; i < priv->child_count; i++) { - int_lock->entry_locked_nodes[i] = 0; + for (i = 0; i < AFR_LOCKEE_COUNT_MAX; i++) { + if (!int_lock->lockee[i].locked_nodes) + break; + int_lock->lockee[i].locked_count = 0; + memset (int_lock->lockee[i].locked_nodes, 0, + sizeof (*int_lock->lockee[i].locked_nodes) * + priv->child_count); } return 0; @@ -476,20 +559,23 @@ initialize_inodelk_variables (call_frame_t *frame, xlator_t *this) afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; afr_private_t *priv = NULL; - - int i = 0; + afr_inodelk_t *inodelk = NULL; priv = this->private; local = frame->local; int_lock = &local->internal_lock; - int_lock->inodelk_lock_count = 0; - int_lock->lock_op_ret = -1; - int_lock->lock_op_errno = 0; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); - for (i = 0; i < priv->child_count; i++) { - int_lock->inode_locked_nodes[i] = 0; - } + inodelk->lock_count = 0; + int_lock->lk_attempted_count = 0; + int_lock->lock_op_ret = -1; + int_lock->lock_op_errno = 0; + + memset (inodelk->locked_nodes, 0, + sizeof (*inodelk->locked_nodes) * priv->child_count); + memset (int_lock->locked_nodes, 0, + sizeof (*int_lock->locked_nodes) * priv->child_count); return 0; } @@ -497,24 +583,36 @@ initialize_inodelk_variables (call_frame_t *frame, xlator_t *this) loc_t * lower_path (loc_t *l1, const char *b1, loc_t *l2, const char *b2) { - int ret = 0; + int ret = 0; - ret = strcmp (l1->path, l2->path); + ret = uuid_compare (l1->inode->gfid, l2->inode->gfid); - if (ret == 0) - ret = strcmp (b1, b2); + if (ret == 0) + ret = strcmp (b1, b2); - if (ret <= 0) - return l1; - else - return l2; + if (ret <= 0) + return l1; + else + return l2; +} + +int +afr_lockee_locked_nodes_count (afr_internal_lock_t *int_lock) +{ + int call_count = 0; + int i = 0; + + for (i = 0; i < int_lock->lockee_count; i++) + call_count += int_lock->lockee[i].locked_count; + + return call_count; } int afr_locked_nodes_count (unsigned char *locked_nodes, int child_count) { - int i; + int i = 0; int call_count = 0; for (i = 0; i < child_count; i++) { @@ -528,44 +626,63 @@ afr_locked_nodes_count (unsigned char *locked_nodes, int child_count) /* FIXME: What if UNLOCK fails */ static int32_t afr_unlock_common_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t *local = NULL; + afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; - int call_count = 0; + int call_count = 0; - local = frame->local; + local = frame->local; int_lock = &local->internal_lock; - LOCK (&frame->lock); - { - call_count = --int_lock->lk_call_count; - } - UNLOCK (&frame->lock); + LOCK (&frame->lock); + { + call_count = --int_lock->lk_call_count; + } + UNLOCK (&frame->lock); - if (call_count == 0) { + if (call_count == 0) { gf_log (this->name, GF_LOG_TRACE, "All internal locks unlocked"); int_lock->lock_cbk (frame, this); } - return 0; + return 0; } static int32_t afr_unlock_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_trace_inodelk_out (frame, AFR_INODELK_TRANSACTION, + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; + int32_t child_index = (long)cookie; + afr_private_t *priv = NULL; + + local = frame->local; + int_lock = &local->internal_lock; + + AFR_TRACE_INODELK_OUT (frame, this, AFR_INODELK_TRANSACTION, AFR_UNLOCK_OP, NULL, op_ret, - op_errno, (long) cookie); + op_errno, child_index); + + priv = this->private; if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) { - gf_log (this->name, GF_LOG_TRACE, - "Unlock failed for some reason"); + gf_log (this->name, GF_LOG_INFO, "%s: unlock failed on subvolume %s " + "with lock owner %s", local->loc.path, + priv->children[child_index]->name, + lkowner_utoa (&frame->root->lk_owner)); } - afr_unlock_common_cbk (frame, cookie, this, op_ret, op_errno); + + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + inodelk->locked_nodes[child_index] &= LOCKED_NO; + if (local->transaction.eager_lock) + local->transaction.eager_lock[child_index] = 0; + + afr_unlock_common_cbk (frame, cookie, this, op_ret, op_errno, xdata); return 0; @@ -575,22 +692,30 @@ static int afr_unlock_inodelk (call_frame_t *frame, xlator_t *this) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; afr_private_t *priv = NULL; - - struct gf_flock flock; + struct gf_flock flock = {0,}; + struct gf_flock full_flock = {0,}; + struct gf_flock *flock_use = NULL; int call_count = 0; int i = 0; + int piggyback = 0; + afr_fd_ctx_t *fd_ctx = NULL; + local = frame->local; int_lock = &local->internal_lock; priv = this->private; - flock.l_start = int_lock->lk_flock.l_start; - flock.l_len = int_lock->lk_flock.l_len; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + + flock.l_start = inodelk->flock.l_start; + flock.l_len = inodelk->flock.l_len; flock.l_type = F_UNLCK; - call_count = afr_locked_nodes_count (int_lock->inode_locked_nodes, + full_flock.l_type = F_UNLCK; + call_count = afr_locked_nodes_count (inodelk->locked_nodes, priv->child_count); int_lock->lk_call_count = call_count; @@ -602,55 +727,107 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this) goto out; } + if (local->fd) + fd_ctx = afr_fd_ctx_get (local->fd, this); + for (i = 0; i < priv->child_count; i++) { - if (int_lock->inode_locked_nodes[i] & LOCKED_YES) { - if (local->fd) { - afr_trace_inodelk_in (frame, AFR_INODELK_TRANSACTION, - AFR_UNLOCK_OP, &flock, F_SETLK, i); - - STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk, - (void *) (long)i, - priv->children[i], - priv->children[i]->fops->finodelk, - this->name, local->fd, - F_SETLK, &flock); + if ((inodelk->locked_nodes[i] & LOCKED_YES) != LOCKED_YES) + continue; - if (!--call_count) - break; + if (local->fd) { + flock_use = &flock; + if (!local->transaction.eager_lock[i]) { + goto wind; + } - } else { - afr_trace_inodelk_in (frame, AFR_INODELK_TRANSACTION, - AFR_UNLOCK_OP, &flock, F_SETLK, i); + piggyback = 0; - STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk, - (void *) (long)i, - priv->children[i], - priv->children[i]->fops->inodelk, - this->name, &local->loc, - F_SETLK, &flock); + LOCK (&local->fd->lock); + { + if (fd_ctx->lock_piggyback[i]) { + fd_ctx->lock_piggyback[i]--; + piggyback = 1; + } else { + fd_ctx->lock_acquired[i]--; + } + } + UNLOCK (&local->fd->lock); + if (piggyback) { + afr_unlock_inodelk_cbk (frame, (void *) (long) i, + this, 1, 0, NULL); if (!--call_count) break; - + continue; } - } + flock_use = &full_flock; + wind: + AFR_TRACE_INODELK_IN (frame, this, + AFR_INODELK_TRANSACTION, + AFR_UNLOCK_OP, flock_use, F_SETLK, + i); - } + STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk, + (void *) (long)i, + priv->children[i], + priv->children[i]->fops->finodelk, + int_lock->domain, local->fd, + F_SETLK, flock_use, NULL); + + if (!--call_count) + break; + + } else { + AFR_TRACE_INODELK_IN (frame, this, + AFR_INODELK_TRANSACTION, + AFR_UNLOCK_OP, &flock, F_SETLK, i); + + STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk, + (void *) (long)i, + priv->children[i], + priv->children[i]->fops->inodelk, + int_lock->domain, &local->loc, + F_SETLK, &flock, NULL); + if (!--call_count) + break; + } + } out: return 0; } static int32_t afr_unlock_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_trace_entrylk_out (frame, AFR_ENTRYLK_TRANSACTION, - AFR_UNLOCK_OP, NULL, op_ret, - op_errno, (long) cookie); + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_internal_lock_t *int_lock = NULL; + int32_t child_index = 0; + int lockee_no = 0; + + priv = this->private; + lockee_no = (int)((long) cookie) / priv->child_count; + child_index = (int) ((long) cookie) % priv->child_count; + + local = frame->local; + int_lock = &local->internal_lock; + + AFR_TRACE_ENTRYLK_OUT (frame, this, AFR_ENTRYLK_TRANSACTION, + AFR_UNLOCK_OP, + int_lock->lockee[lockee_no].basename, op_ret, + op_errno, (int) ((long)cookie)); + + if (op_ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "%s: unlock failed on %d, reason: %s", + local->loc.path, child_index, strerror (op_errno)); + } - afr_unlock_common_cbk (frame, cookie, this, op_ret, op_errno); + int_lock->lockee[lockee_no].locked_nodes[child_index] &= LOCKED_NO; + afr_unlock_common_cbk (frame, cookie, this, op_ret, op_errno, NULL); return 0; } @@ -658,25 +835,22 @@ afr_unlock_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, static int afr_unlock_entrylk (call_frame_t *frame, xlator_t *this) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - const char *basename = NULL; - loc_t *loc = NULL; - - int call_count = 0; - int i = -1; + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int index = 0; + int lockee_no = 0; + int copies = 0; + int i = -1; local = frame->local; int_lock = &local->internal_lock; priv = this->private; + copies = priv->child_count; - basename = int_lock->lk_basename; - if (int_lock->lk_loc) - loc = int_lock->lk_loc; + call_count = afr_lockee_locked_nodes_count (int_lock); - call_count = afr_locked_nodes_count (int_lock->entry_locked_nodes, - priv->child_count); int_lock->lk_call_count = call_count; if (!call_count){ @@ -686,18 +860,23 @@ afr_unlock_entrylk (call_frame_t *frame, xlator_t *this) goto out; } - for (i = 0; i < priv->child_count; i++) { - if (int_lock->entry_locked_nodes[i] & LOCKED_YES) { - afr_trace_entrylk_in (frame, AFR_ENTRYLK_NB_TRANSACTION, - AFR_UNLOCK_OP, basename, i); + for (i = 0; i < int_lock->lockee_count * priv->child_count; i++) { + lockee_no = i / copies; + index = i % copies; + if (int_lock->lockee[lockee_no].locked_nodes[index] & LOCKED_YES) { + AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_NB_TRANSACTION, + AFR_UNLOCK_OP, + int_lock->lockee[lockee_no].basename, + i); STACK_WIND_COOKIE (frame, afr_unlock_entrylk_cbk, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->entrylk, - this->name, - loc, basename, - ENTRYLK_UNLOCK, ENTRYLK_WRLCK); + priv->children[index], + priv->children[index]->fops->entrylk, + int_lock->domain, + &int_lock->lockee[lockee_no].loc, + int_lock->lockee[lockee_no].basename, + ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL); if (!--call_count) break; @@ -711,155 +890,85 @@ out: static int32_t afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - int done = 0; - int child_index = (long) cookie; - - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - if (op_errno == ENOSYS) { - /* return ENOTSUP */ - gf_log (this->name, GF_LOG_ERROR, - "subvolume does not support locking. " - "please load features/posix-locks xlator on server"); - local->op_ret = op_ret; - int_lock->lock_op_ret = op_ret; - done = 1; - } - - local->child_up[child_index] = 0; - local->op_errno = op_errno; - int_lock->lock_op_errno = op_errno; - } - } - UNLOCK (&frame->lock); - - if ((op_ret == -1) && - (op_errno == ENOSYS)) { - afr_unlock (frame, this); - } else { - if (op_ret == 0) { - int_lock->locked_nodes[child_index] - |= LOCKED_YES; - int_lock->lock_count++; - } - afr_lock_blocking (frame, this, child_index + 1); - } - - return 0; -} - -static int32_t -afr_blocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_trace_inodelk_out (frame, AFR_INODELK_TRANSACTION, - AFR_LOCK_OP, NULL, op_ret, - op_errno, (long) cookie); - - afr_lock_cbk (frame, cookie, this, op_ret, op_errno); - return 0; - -} - -static int32_t -afr_lock_lower_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_internal_lock_t *int_lock = NULL; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - loc_t *lower = NULL; - loc_t *higher = NULL; - const char *lower_name = NULL; - const char *higher_name = NULL; - - int child_index = (long) cookie; + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int cky = (long) cookie; + int child_index = 0; + int lockee_no = 0; priv = this->private; local = frame->local; int_lock = &local->internal_lock; + child_index = ((int)cky) % priv->child_count; + lockee_no = ((int)cky) / priv->child_count; + LOCK (&frame->lock); { if (op_ret == -1) { if (op_errno == ENOSYS) { /* return ENOTSUP */ - gf_log (this->name, GF_LOG_ERROR, "subvolume does not support locking. " - "please load features/posix-locks xlator on server"); - - local->op_ret = op_ret; + "please load features/locks xlator on server"); + local->op_ret = op_ret; + int_lock->lock_op_ret = op_ret; } - local->child_up[child_index] = 0; - local->op_errno = op_errno; + local->op_errno = op_errno; + int_lock->lock_op_errno = op_errno; } + + int_lock->lk_attempted_count++; } UNLOCK (&frame->lock); - if (op_ret != 0) { + if ((op_ret == -1) && + (op_errno == ENOSYS)) { afr_unlock (frame, this); - goto out; } else { - int_lock->lower_locked_nodes[child_index] |= LOCKED_LOWER; - int_lock->lock_count++; + if (op_ret == 0) { + if (local->transaction.type == AFR_ENTRY_TRANSACTION || + local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) { + int_lock->lockee[lockee_no].locked_nodes[child_index] |= LOCKED_YES; + int_lock->lockee[lockee_no].locked_count++; + int_lock->entrylk_lock_count++; + } else { + int_lock->locked_nodes[child_index] |= LOCKED_YES; + int_lock->lock_count++; + } + } + afr_lock_blocking (frame, this, cky + 1); } - /* The lower path has been locked. Now lock the higher path */ - - lower = lower_path (&local->transaction.parent_loc, - local->transaction.basename, - &local->transaction.new_parent_loc, - local->transaction.new_basename); - - lower_name = (lower == &local->transaction.parent_loc ? - local->transaction.basename : - local->transaction.new_basename); - - higher = (lower == &local->transaction.parent_loc ? - &local->transaction.new_parent_loc : - &local->transaction.parent_loc); - - higher_name = (higher == &local->transaction.parent_loc ? - local->transaction.basename : - local->transaction.new_basename); - - afr_trace_entrylk_in (frame, AFR_ENTRYLK_TRANSACTION, - AFR_LOCK_OP, higher_name, child_index); - + return 0; +} - STACK_WIND_COOKIE (frame, afr_lock_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->entrylk, - this->name, higher, higher_name, - ENTRYLK_LOCK, ENTRYLK_WRLCK); +static int32_t +afr_blocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + AFR_TRACE_INODELK_OUT (frame, this, AFR_INODELK_TRANSACTION, + AFR_LOCK_OP, NULL, op_ret, + op_errno, (long) cookie); -out: + afr_lock_cbk (frame, cookie, this, op_ret, op_errno, xdata); return 0; + } static int32_t afr_blocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_trace_entrylk_out (frame, AFR_ENTRYLK_TRANSACTION, + AFR_TRACE_ENTRYLK_OUT (frame, this, AFR_ENTRYLK_TRANSACTION, AFR_LOCK_OP, NULL, op_ret, op_errno, (long)cookie); - afr_lock_cbk (frame, cookie, this, op_ret, op_errno); + afr_lock_cbk (frame, cookie, this, op_ret, op_errno, xdata); return 0; } @@ -867,6 +976,7 @@ static int afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; afr_private_t *priv = NULL; @@ -875,20 +985,18 @@ afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this) int_lock = &local->internal_lock; switch (local->transaction.type) { - case AFR_DATA_TRANSACTION: - case AFR_METADATA_TRANSACTION: - memcpy (int_lock->inode_locked_nodes, - int_lock->locked_nodes, - priv->child_count); - int_lock->inodelk_lock_count = int_lock->lock_count; + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + memcpy (inodelk->locked_nodes, int_lock->locked_nodes, + sizeof (*inodelk->locked_nodes) * priv->child_count); + inodelk->lock_count = int_lock->lock_count; break; case AFR_ENTRY_RENAME_TRANSACTION: case AFR_ENTRY_TRANSACTION: - memcpy (int_lock->entry_locked_nodes, - int_lock->locked_nodes, - priv->child_count); - int_lock->entrylk_lock_count = int_lock->lock_count; + /*entrylk_count is being used in both non-blocking and blocking + * modes */ break; } @@ -896,42 +1004,78 @@ afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this) } +static inline gf_boolean_t +afr_is_entrylk (afr_internal_lock_t *int_lock, + afr_transaction_type trans_type) +{ + gf_boolean_t is_entrylk = _gf_false; + + if ((int_lock->transaction_lk_type == AFR_SELFHEAL_LK) && + int_lock->selfheal_lk_type == AFR_ENTRY_SELF_HEAL_LK) { + + is_entrylk = _gf_true; + + } else if ((int_lock->transaction_lk_type == AFR_TRANSACTION_LK) && + (trans_type == AFR_ENTRY_TRANSACTION || + trans_type == AFR_ENTRY_RENAME_TRANSACTION)) { + + is_entrylk = _gf_true; + + } else { + is_entrylk = _gf_false; + } + + return is_entrylk; +} + +static gf_boolean_t +_is_lock_wind_needed (afr_local_t *local, int child_index) +{ + if (!local->child_up[child_index]) + return _gf_false; + + return _gf_true; +} + int -afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index) +afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_fd_ctx_t *fd_ctx = NULL; - loc_t *lower = NULL; - loc_t *higher = NULL; - const char *lower_name = NULL; - const char *higher_name = NULL; - - struct gf_flock flock; - uint64_t ctx; + afr_private_t *priv = NULL; + struct gf_flock flock = {0,}; + uint64_t ctx = 0; int ret = 0; - - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; - - flock.l_start = int_lock->lk_flock.l_start; - flock.l_len = int_lock->lk_flock.l_len; - flock.l_type = int_lock->lk_flock.l_type; + int child_index = 0; + int lockee_no = 0; + gf_boolean_t is_entrylk = _gf_false; + + local = frame->local; + int_lock = &local->internal_lock; + priv = this->private; + child_index = cookie % priv->child_count; + lockee_no = cookie / priv->child_count; + is_entrylk = afr_is_entrylk (int_lock, local->transaction.type); + + + if (!is_entrylk) { + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + flock.l_start = inodelk->flock.l_start; + flock.l_len = inodelk->flock.l_len; + flock.l_type = inodelk->flock.l_type; + } if (local->fd) { ret = fd_ctx_get (local->fd, this, &ctx); if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, + gf_log (this->name, GF_LOG_INFO, "unable to get fd ctx for fd=%p", local->fd); local->op_ret = -1; int_lock->lock_op_ret = -1; - local->op_errno = EINVAL; - int_lock->lock_op_errno = EINVAL; afr_copy_locked_nodes (frame, this); @@ -939,49 +1083,27 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index) return 0; } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - /* skip over children that or down - or don't have the fd open */ - - while ((child_index < priv->child_count) - && (!local->child_up[child_index] - || !fd_ctx->opened_on[child_index])) - - child_index++; - } else { - /* skip over children that are down */ - while ((child_index < priv->child_count) - && !local->child_up[child_index]) - child_index++; } - if ((child_index == priv->child_count) && - int_lock->lock_count == 0) { - - gf_log (this->name, GF_LOG_DEBUG, - "unable to lock on even one child"); + if (int_lock->lk_expected_count == int_lock->lk_attempted_count) { + if ((is_entrylk && int_lock->entrylk_lock_count == 0) || + (!is_entrylk && int_lock->lock_count == 0)) { + gf_log (this->name, GF_LOG_INFO, + "unable to lock on even one child"); - local->op_ret = -1; - int_lock->lock_op_ret = -1; - local->op_errno = EAGAIN; - int_lock->lock_op_errno = EAGAIN; - - afr_copy_locked_nodes (frame, this); - - afr_unlock(frame, this); + local->op_ret = -1; + int_lock->lock_op_ret = -1; - return 0; + afr_copy_locked_nodes (frame, this); - } + afr_unlock(frame, this); - if ((child_index == priv->child_count) - || (int_lock->lock_count == - afr_up_children_count (priv->child_count, - local->child_up))) { + return 0; + } + } - /* we're done locking */ + if (int_lock->lk_expected_count == int_lock->lk_attempted_count) { + /* we're done locking */ gf_log (this->name, GF_LOG_DEBUG, "we're done locking"); @@ -990,107 +1112,85 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index) int_lock->lock_op_ret = 0; int_lock->lock_cbk (frame, this); - return 0; - } + return 0; + } - switch (local->transaction.type) { - case AFR_DATA_TRANSACTION: - case AFR_METADATA_TRANSACTION: + if (!_is_lock_wind_needed (local, child_index)) { + afr_lock_blocking (frame, this, cookie + 1); + return 0; + } - if (local->fd) { - afr_trace_inodelk_in (frame, AFR_INODELK_TRANSACTION, + switch (local->transaction.type) { + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: + + if (local->fd) { + AFR_TRACE_INODELK_IN (frame, this, + AFR_INODELK_TRANSACTION, AFR_LOCK_OP, &flock, F_SETLKW, child_index); - STACK_WIND_COOKIE (frame, afr_blocking_inodelk_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->finodelk, - this->name, local->fd, - F_SETLKW, &flock); + STACK_WIND_COOKIE (frame, afr_blocking_inodelk_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->finodelk, + int_lock->domain, local->fd, + F_SETLKW, &flock, NULL); - } else { - afr_trace_inodelk_in (frame, AFR_INODELK_TRANSACTION, + } else { + AFR_TRACE_INODELK_IN (frame, this, + AFR_INODELK_TRANSACTION, AFR_LOCK_OP, &flock, F_SETLKW, child_index); - STACK_WIND_COOKIE (frame, afr_blocking_inodelk_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->inodelk, - this->name, &local->loc, - F_SETLKW, &flock); - } - - break; - - case AFR_ENTRY_RENAME_TRANSACTION: - { - lower = lower_path (&local->transaction.parent_loc, - local->transaction.basename, - &local->transaction.new_parent_loc, - local->transaction.new_basename); - - lower_name = (lower == &local->transaction.parent_loc ? - local->transaction.basename : - local->transaction.new_basename); - - higher = (lower == &local->transaction.parent_loc ? - &local->transaction.new_parent_loc : - &local->transaction.parent_loc); - - higher_name = (higher == &local->transaction.parent_loc ? - local->transaction.basename : - local->transaction.new_basename); - - afr_trace_entrylk_in (frame, AFR_ENTRYLK_TRANSACTION, - AFR_LOCK_OP, lower_name, child_index); - - - STACK_WIND_COOKIE (frame, afr_lock_lower_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->entrylk, - this->name, lower, lower_name, - ENTRYLK_LOCK, ENTRYLK_WRLCK); - - break; - } + STACK_WIND_COOKIE (frame, afr_blocking_inodelk_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->inodelk, + int_lock->domain, &local->loc, + F_SETLKW, &flock, NULL); + } - case AFR_ENTRY_TRANSACTION: - if (local->fd) { - afr_trace_entrylk_in (frame, AFR_ENTRYLK_TRANSACTION, - AFR_LOCK_OP, local->transaction.basename, - child_index); + break; - STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->fentrylk, - this->name, local->fd, - local->transaction.basename, - ENTRYLK_LOCK, ENTRYLK_WRLCK); - } else { - afr_trace_entrylk_in (frame, AFR_ENTRYLK_TRANSACTION, + case AFR_ENTRY_RENAME_TRANSACTION: + case AFR_ENTRY_TRANSACTION: + /*Accounting for child_index increments on 'down' + *and 'fd-less' children */ + + if (local->fd) { + AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_TRANSACTION, + AFR_LOCK_OP, + int_lock->lockee[lockee_no].basename, + cookie); + + STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk, + (void *) (long) cookie, + priv->children[child_index], + priv->children[child_index]->fops->fentrylk, + int_lock->domain, local->fd, + int_lock->lockee[lockee_no].basename, + ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); + } else { + AFR_TRACE_ENTRYLK_IN (frame, this, + AFR_ENTRYLK_TRANSACTION, AFR_LOCK_OP, local->transaction.basename, child_index); - STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->entrylk, - this->name, - &local->transaction.parent_loc, - local->transaction.basename, - ENTRYLK_LOCK, ENTRYLK_WRLCK); - } - - break; - } - - return 0; + STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk, + (void *) (long) cookie, + priv->children[child_index], + priv->children[child_index]->fops->entrylk, + int_lock->domain, + &int_lock->lockee[lockee_no].loc, + int_lock->lockee[lockee_no].basename, + ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); + } + break; + } + return 0; } int32_t @@ -1099,19 +1199,25 @@ afr_blocking_lock (call_frame_t *frame, xlator_t *this) afr_internal_lock_t *int_lock = NULL; afr_local_t *local = NULL; afr_private_t *priv = NULL; + int up_count = 0; priv = this->private; local = frame->local; int_lock = &local->internal_lock; switch (local->transaction.type) { - case AFR_DATA_TRANSACTION: - case AFR_METADATA_TRANSACTION: + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: initialize_inodelk_variables (frame, this); break; case AFR_ENTRY_RENAME_TRANSACTION: case AFR_ENTRY_TRANSACTION: + up_count = afr_up_children_count (local->child_up, + priv->child_count); + int_lock->lk_call_count = int_lock->lk_expected_count + = (int_lock->lockee_count * + up_count); initialize_entrylk_variables (frame, this); break; } @@ -1123,60 +1229,68 @@ afr_blocking_lock (call_frame_t *frame, xlator_t *this) static int32_t afr_nonblocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - + afr_local_t *local = NULL; int call_count = 0; - int child_index = (long) cookie; + int child_index = (long) cookie; + int copies = 0; + int index = 0; + int lockee_no = 0; + afr_private_t *priv = NULL; + + priv = this->private; - local = frame->local; + copies = priv->child_count; + index = child_index % copies; + lockee_no = child_index / copies; + + local = frame->local; int_lock = &local->internal_lock; - priv = this->private; - afr_trace_entrylk_out (frame, AFR_ENTRYLK_TRANSACTION, - AFR_LOCK_OP, NULL, op_ret, + AFR_TRACE_ENTRYLK_OUT (frame, this, AFR_ENTRYLK_TRANSACTION, + AFR_LOCK_OP, + int_lock->lockee[lockee_no].basename, op_ret, op_errno, (long) cookie); - LOCK (&frame->lock); - { - call_count = --int_lock->lk_call_count; - } - UNLOCK (&frame->lock); - - if (op_ret < 0 ) { + LOCK (&frame->lock); + { + if (op_ret < 0 ) { if (op_errno == ENOSYS) { - /* return ENOTSUP */ + /* return ENOTSUP */ gf_log (this->name, GF_LOG_ERROR, "subvolume does not support locking. " - "please load features/posix-locks xlator on server"); + "please load features/locks xlator on server"); local->op_ret = op_ret; - int_lock->lock_op_ret = op_ret; + int_lock->lock_op_ret = op_ret; - local->child_up[child_index] = 0; - int_lock->lock_op_errno = op_errno; - local->op_errno = op_errno; - } - } else if (op_ret == 0) { - int_lock->entry_locked_nodes[child_index] - |= LOCKED_YES; - int_lock->entrylk_lock_count++; + int_lock->lock_op_errno = op_errno; + local->op_errno = op_errno; + } + } else if (op_ret == 0) { + int_lock->lockee[lockee_no].locked_nodes[index] |= \ + LOCKED_YES; + int_lock->lockee[lockee_no].locked_count++; + int_lock->entrylk_lock_count++; + } + + call_count = --int_lock->lk_call_count; } + UNLOCK (&frame->lock); if (call_count == 0) { gf_log (this->name, GF_LOG_TRACE, "Last locking reply received"); - /* all locks successfull. Proceed to call FOP */ + /* all locks successful. Proceed to call FOP */ if (int_lock->entrylk_lock_count == - afr_up_children_count (priv->child_count, local->child_up)) { + int_lock->lk_expected_count) { gf_log (this->name, GF_LOG_TRACE, "All servers locked. Calling the cbk"); int_lock->lock_op_ret = 0; int_lock->lock_cbk (frame, this); } - /* Not all locks were successfull. Unlock and try locking + /* Not all locks were successful. Unlock and try locking again, this time with serially blocking locks */ else { gf_log (this->name, GF_LOG_TRACE, @@ -1193,33 +1307,27 @@ afr_nonblocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_fd_ctx_t *fd_ctx = NULL; - const char *basename = NULL; - loc_t *loc = NULL; - - int32_t call_count = 0; + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + int copies = 0; + int index = 0; + int lockee_no = 0; + int32_t call_count = 0; int i = 0; - uint64_t ctx; - int ret = 0; local = frame->local; int_lock = &local->internal_lock; priv = this->private; + copies = priv->child_count; initialize_entrylk_variables (frame, this); - basename = int_lock->lk_basename; - if (int_lock->lk_loc) - loc = int_lock->lk_loc; - if (local->fd) { - ret = fd_ctx_get (local->fd, this, &ctx); - - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, + fd_ctx = afr_fd_ctx_get (local->fd, this); + if (!fd_ctx) { + gf_log (this->name, GF_LOG_INFO, "unable to get fd ctx for fd=%p", local->fd); @@ -1228,113 +1336,147 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this) local->op_errno = EINVAL; int_lock->lock_op_errno = EINVAL; + afr_unlock (frame, this); return -1; } - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - call_count = internal_lock_count (frame, this, fd_ctx); + call_count = int_lock->lockee_count * internal_lock_count (frame, this); int_lock->lk_call_count = call_count; + int_lock->lk_expected_count = call_count; + + if (!call_count) { + gf_log (this->name, GF_LOG_INFO, + "fd not open on any subvolumes. aborting."); + afr_unlock (frame, this); + goto out; + } /* Send non-blocking entrylk calls only on up children and where the fd has been opened */ - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] && fd_ctx->opened_on[i]) { - afr_trace_entrylk_in (frame, AFR_ENTRYLK_NB_TRANSACTION, - AFR_LOCK_OP, basename, i); + for (i = 0; i < int_lock->lockee_count*priv->child_count; i++) { + index = i%copies; + lockee_no = i/copies; + if (local->child_up[index]) { + AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_NB_TRANSACTION, + AFR_LOCK_OP, + int_lock->lockee[lockee_no].basename, + i); STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fentrylk, + priv->children[index], + priv->children[index]->fops->fentrylk, this->name, local->fd, - basename, - ENTRYLK_LOCK_NB, ENTRYLK_WRLCK); + int_lock->lockee[lockee_no].basename, + ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, + NULL); + if (!--call_count) + break; } } } else { - GF_ASSERT (loc); - - call_count = internal_lock_count (frame, this, NULL); + call_count = int_lock->lockee_count * internal_lock_count (frame, this); int_lock->lk_call_count = call_count; + int_lock->lk_expected_count = call_count; - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - afr_trace_entrylk_in (frame, AFR_ENTRYLK_NB_TRANSACTION, - AFR_LOCK_OP, basename, i); + for (i = 0; i < int_lock->lockee_count*priv->child_count; i++) { + index = i%copies; + lockee_no = i/copies; + if (local->child_up[index]) { + AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_NB_TRANSACTION, + AFR_LOCK_OP, + int_lock->lockee[lockee_no].basename, + i); STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->entrylk, - this->name, loc, basename, - ENTRYLK_LOCK, ENTRYLK_WRLCK); + priv->children[index], + priv->children[index]->fops->entrylk, + this->name, &int_lock->lockee[lockee_no].loc, + int_lock->lockee[lockee_no].basename, + ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, + NULL); if (!--call_count) break; - } } } - +out: return 0; } int32_t afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - + afr_inodelk_t *inodelk = NULL; + afr_local_t *local = NULL; int call_count = 0; - int child_index = (long) cookie; + int child_index = (long) cookie; + afr_fd_ctx_t *fd_ctx = NULL; - local = frame->local; + + local = frame->local; int_lock = &local->internal_lock; - priv = this->private; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); - afr_trace_inodelk_out (frame, AFR_INODELK_NB_TRANSACTION, + AFR_TRACE_INODELK_OUT (frame, this, AFR_INODELK_NB_TRANSACTION, AFR_LOCK_OP, NULL, op_ret, op_errno, (long) cookie); + if (local->fd) + fd_ctx = afr_fd_ctx_get (local->fd, this); + LOCK (&frame->lock); { - call_count = --int_lock->lk_call_count; - } - UNLOCK (&frame->lock); - - if (op_ret < 0 ) { + if (op_ret < 0) { if (op_errno == ENOSYS) { /* return ENOTSUP */ gf_log (this->name, GF_LOG_ERROR, "subvolume does not support locking. " - "please load features/posix-locks xlator on server"); + "please load features/locks xlator on " + "server"); local->op_ret = op_ret; - int_lock->lock_op_ret = op_ret; - local->child_up[child_index] = 0; - int_lock->lock_op_errno = op_errno; - local->op_errno = op_errno; - } - } else if (op_ret == 0) { - int_lock->inode_locked_nodes[child_index] - |= LOCKED_YES; - int_lock->inodelk_lock_count++; + int_lock->lock_op_ret = op_ret; + int_lock->lock_op_errno = op_errno; + local->op_errno = op_errno; + } + if (local->transaction.eager_lock) + local->transaction.eager_lock[child_index] = 0; + } else { + inodelk->locked_nodes[child_index] |= LOCKED_YES; + inodelk->lock_count++; + + if (local->transaction.eager_lock && + local->transaction.eager_lock[child_index] && + local->fd) { + /* piggybacked */ + if (op_ret == 1) { + /* piggybacked */ + } else if (op_ret == 0) { + /* lock acquired from server */ + fd_ctx->lock_acquired[child_index]++; + } + } + } + + call_count = --int_lock->lk_call_count; } + UNLOCK (&frame->lock); if (call_count == 0) { gf_log (this->name, GF_LOG_TRACE, "Last inode locking reply received"); - /* all locks successfull. Proceed to call FOP */ - if (int_lock->inodelk_lock_count == - afr_up_children_count (priv->child_count, local->child_up)) { + /* all locks successful. Proceed to call FOP */ + if (inodelk->lock_count == int_lock->lk_expected_count) { gf_log (this->name, GF_LOG_TRACE, "All servers locked. Calling the cbk"); int_lock->lock_op_ret = 0; int_lock->lock_cbk (frame, this); } - /* Not all locks were successfull. Unlock and try locking + /* Not all locks were successful. Unlock and try locking again, this time with serially blocking locks */ else { gf_log (this->name, GF_LOG_TRACE, @@ -1352,31 +1494,36 @@ int afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; afr_private_t *priv = NULL; afr_fd_ctx_t *fd_ctx = NULL; - - int32_t call_count = 0; - uint64_t ctx = 0; - int i = 0; - int ret = 0; - struct gf_flock flock; + int32_t call_count = 0; + int i = 0; + int ret = 0; + struct gf_flock flock = {0,}; + struct gf_flock full_flock = {0,}; + struct gf_flock *flock_use = NULL; + int piggyback = 0; local = frame->local; int_lock = &local->internal_lock; priv = this->private; - flock.l_start = int_lock->lk_flock.l_start; - flock.l_len = int_lock->lk_flock.l_len; - flock.l_type = int_lock->lk_flock.l_type; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + + flock.l_start = inodelk->flock.l_start; + flock.l_len = inodelk->flock.l_len; + flock.l_type = inodelk->flock.l_type; + + full_flock.l_type = inodelk->flock.l_type; initialize_inodelk_variables (frame, this); if (local->fd) { - ret = fd_ctx_get (local->fd, this, &ctx); - - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, + fd_ctx = afr_fd_ctx_get (local->fd, this); + if (!fd_ctx) { + gf_log (this->name, GF_LOG_INFO, "unable to get fd ctx for fd=%p", local->fd); @@ -1385,273 +1532,96 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) local->op_errno = EINVAL; int_lock->lock_op_errno = EINVAL; + afr_unlock (frame, this); ret = -1; goto out; } - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - call_count = internal_lock_count (frame, this, fd_ctx); + call_count = internal_lock_count (frame, this); int_lock->lk_call_count = call_count; + int_lock->lk_expected_count = call_count; + + if (!call_count) { + gf_log (this->name, GF_LOG_INFO, + "fd not open on any subvolumes. aborting."); + afr_unlock (frame, this); + goto out; + } /* Send non-blocking inodelk calls only on up children and where the fd has been opened */ for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] && fd_ctx->opened_on[i]) { - afr_trace_inodelk_in (frame, AFR_INODELK_NB_TRANSACTION, - AFR_LOCK_OP, &flock, F_SETLK, i); - - STACK_WIND_COOKIE (frame, afr_nonblocking_inodelk_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->finodelk, - this->name, local->fd, - F_SETLK, &flock); - - if (!--call_count) - break; + if (!local->child_up[i]) + continue; + flock_use = &flock; + if (!local->transaction.eager_lock_on) { + goto wind; } - } - } else { - call_count = internal_lock_count (frame, this, NULL); - int_lock->lk_call_count = call_count; + piggyback = 0; + local->transaction.eager_lock[i] = 1; - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - afr_trace_inodelk_in (frame, AFR_INODELK_NB_TRANSACTION, - AFR_LOCK_OP, &flock, F_SETLK, i); + afr_set_delayed_post_op (frame, this); - STACK_WIND_COOKIE (frame, afr_nonblocking_inodelk_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->inodelk, - this->name, &local->loc, - F_SETLK, &flock); + LOCK (&local->fd->lock); + { + if (fd_ctx->lock_acquired[i]) { + fd_ctx->lock_piggyback[i]++; + piggyback = 1; + } + } + UNLOCK (&local->fd->lock); + if (piggyback) { + /* (op_ret == 1) => indicate piggybacked lock */ + afr_nonblocking_inodelk_cbk (frame, (void *) (long) i, + this, 1, 0, NULL); if (!--call_count) break; - + continue; } - } - } - -out: - return ret; -} - -static int -__is_lower_locked (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - - int count = 0; - int i = 0; - - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; - - for (i = 0; i < priv->child_count; i++) { - if (int_lock->lower_locked_nodes[i] & LOCKED_LOWER) - count++; - } - - return count; - -} - -static int -__is_higher_locked (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - - int count = 0; - int i = 0; + flock_use = &full_flock; + wind: + AFR_TRACE_INODELK_IN (frame, this, + AFR_INODELK_NB_TRANSACTION, + AFR_LOCK_OP, flock_use, F_SETLK, i); - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; - - for (i = 0; i < priv->child_count; i++) { - if (int_lock->locked_nodes[i] & LOCKED_YES) - count++; - } - - return count; - -} - -static int -afr_unlock_lower_entrylk (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - const char *basename = NULL; - loc_t *loc = NULL; - - int call_count = 0; - int i = -1; - - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; - - basename = int_lock->lk_basename; - if (int_lock->lk_loc) - loc = int_lock->lk_loc; - - call_count = __is_lower_locked (frame, this); - int_lock->lk_call_count = call_count; + STACK_WIND_COOKIE (frame, afr_nonblocking_inodelk_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->finodelk, + int_lock->domain, local->fd, + F_SETLK, flock_use, NULL); - if (!call_count){ - gf_log (this->name, GF_LOG_TRACE, - "No internal locks unlocked"); - int_lock->lock_cbk (frame, this); - goto out; - } + if (!--call_count) + break; + } + } else { + call_count = internal_lock_count (frame, this); + int_lock->lk_call_count = call_count; + int_lock->lk_expected_count = call_count; - for (i = 0; i < priv->child_count; i++) { - if (int_lock->lower_locked_nodes[i] & LOCKED_LOWER) { - afr_trace_entrylk_in (frame, AFR_ENTRYLK_NB_TRANSACTION, - AFR_UNLOCK_OP, basename, i); + for (i = 0; i < priv->child_count; i++) { + if (!local->child_up[i]) + continue; + AFR_TRACE_INODELK_IN (frame, this, + AFR_INODELK_NB_TRANSACTION, + AFR_LOCK_OP, &flock, F_SETLK, i); - STACK_WIND_COOKIE (frame, afr_unlock_entrylk_cbk, + STACK_WIND_COOKIE (frame, afr_nonblocking_inodelk_cbk, (void *) (long) i, priv->children[i], - priv->children[i]->fops->entrylk, - this->name, - loc, basename, - ENTRYLK_UNLOCK, ENTRYLK_WRLCK); + priv->children[i]->fops->inodelk, + int_lock->domain, &local->loc, + F_SETLK, &flock, NULL); if (!--call_count) break; - } } - out: - return 0; - -} - - -static int -afr_post_unlock_higher_cbk (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - - local = frame->local; - - local->transaction.done (frame, this); - return 0; -} - -static int -afr_post_unlock_lower_cbk (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - loc_t *lower = NULL; - loc_t *higher = NULL; - const char *lower_name = NULL; - const char *higher_name = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - - lower = lower_path (&local->transaction.parent_loc, - local->transaction.basename, - &local->transaction.new_parent_loc, - local->transaction.new_basename); - - lower_name = (lower == &local->transaction.parent_loc ? - local->transaction.basename : - local->transaction.new_basename); - - higher = (lower == &local->transaction.parent_loc ? - &local->transaction.new_parent_loc : - &local->transaction.parent_loc); - - higher_name = (higher == &local->transaction.parent_loc ? - local->transaction.basename : - local->transaction.new_basename); - - if (__is_higher_locked (frame, this)) { - gf_log (this->name, GF_LOG_DEBUG, - "unlocking higher"); - int_lock->lk_basename = higher_name; - int_lock->lk_loc = higher; - int_lock->lock_cbk = afr_post_unlock_higher_cbk; - - afr_unlock_entrylk (frame, this); - } else - local->transaction.done (frame, this); - - return 0; -} - -static int -afr_rename_unlock (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - loc_t *lower = NULL; - loc_t *higher = NULL; - const char *lower_name = NULL; - const char *higher_name = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - - lower = lower_path (&local->transaction.parent_loc, - local->transaction.basename, - &local->transaction.new_parent_loc, - local->transaction.new_basename); - - lower_name = (lower == &local->transaction.parent_loc ? - local->transaction.basename : - local->transaction.new_basename); - - higher = (lower == &local->transaction.parent_loc ? - &local->transaction.new_parent_loc : - &local->transaction.parent_loc); - - higher_name = (higher == &local->transaction.parent_loc ? - local->transaction.basename : - local->transaction.new_basename); - - - if (__is_lower_locked (frame, this)) { - gf_log (this->name, GF_LOG_DEBUG, - "unlocking lower"); - int_lock->lk_basename = lower_name; - int_lock->lk_loc = lower; - int_lock->lock_cbk = afr_post_unlock_lower_cbk; - - afr_unlock_lower_entrylk (frame, this); - } else - afr_post_unlock_lower_cbk (frame, this); - - return 0; -} - -static int -afr_rename_transaction (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - - local = frame->local; - - return (local->transaction.type == - AFR_ENTRY_RENAME_TRANSACTION); - + return ret; } int32_t @@ -1665,10 +1635,8 @@ afr_unlock (call_frame_t *frame, xlator_t *this) if (is_afr_lock_transaction (local)) afr_unlock_inodelk (frame, this); else - if (!afr_rename_transaction (frame, this)) - afr_unlock_entrylk (frame, this); - else - afr_rename_unlock (frame, this); + afr_unlock_entrylk (frame, this); + } else { if (is_afr_lock_selfheal (local)) afr_unlock_inodelk (frame, this); @@ -1690,11 +1658,16 @@ afr_mark_locked_nodes (xlator_t *this, fd_t *fd, priv = this->private; - afr_fd_ctx_set (this, fd); - if (ret < 0) + ret = afr_fd_ctx_set (this, fd); + if (ret) goto out; ret = fd_ctx_get (fd, this, &tmp); + if (ret) { + gf_log (this->name, GF_LOG_INFO, + "failed to get the fd ctx"); + goto out; + } fdctx = (afr_fd_ctx_t *) (long) tmp; GF_ASSERT (fdctx->locked_on); @@ -1737,8 +1710,6 @@ __afr_save_locked_fd (xlator_t *this, fd_t *fd) locked_fd = GF_CALLOC (1, sizeof (*locked_fd), gf_afr_mt_locked_fd); if (!locked_fd) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); ret = -1; goto out; } @@ -1770,8 +1741,8 @@ afr_save_locked_fd (xlator_t *this, fd_t *fd) ret = __afr_save_locked_fd (this, fd); if (ret) { - gf_log (this->name, GF_LOG_DEBUG, - "fd=%p could not be saved"); + gf_log (this->name, GF_LOG_INFO, + "fd=%p could not be saved", fd); goto unlock; } } @@ -1821,12 +1792,10 @@ afr_get_source_lock_recovery (xlator_t *this, fd_t *fd) for (i = 0; i < priv->child_count; i++) { if (fdctx->locked_on[i]) { gf_log (this->name, GF_LOG_DEBUG, - "Found lock recovery source=%d", - i); + "Found lock recovery source=%d", i); source_child = i; break; } - } out: @@ -1836,10 +1805,12 @@ out: int32_t afr_get_locks_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct gf_flock *lock); + int32_t op_ret, int32_t op_errno, struct gf_flock *lock, + dict_t *xdata); int32_t afr_recover_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct gf_flock *lock) + int32_t op_ret, int32_t op_errno, struct gf_flock *lock, + dict_t *xdata) { afr_local_t *local = NULL; afr_private_t *priv = NULL; @@ -1850,7 +1821,7 @@ afr_recover_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, priv = this->private; if (op_ret) { - gf_log (this->name, GF_LOG_DEBUG, + gf_log (this->name, GF_LOG_INFO, "lock recovery failed"); goto cleanup; } @@ -1863,7 +1834,7 @@ afr_recover_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, (void *) (long) source_child, priv->children[source_child], priv->children[source_child]->fops->lk, - local->fd, F_GETLK_FD, &flock); + local->fd, F_GETLK_FD, &flock, NULL); return 0; @@ -1891,7 +1862,7 @@ afr_recover_lock (call_frame_t *frame, xlator_t *this, (void *) (long) lock_recovery_child, priv->children[lock_recovery_child], priv->children[lock_recovery_child]->fops->lk, - local->fd, F_SETLK, flock); + local->fd, F_SETLK, flock, NULL); return 0; } @@ -1909,10 +1880,11 @@ is_afr_lock_eol (struct gf_flock *lock) int32_t afr_get_locks_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct gf_flock *lock) + int32_t op_ret, int32_t op_errno, struct gf_flock *lock, + dict_t *xdata) { if (op_ret) { - gf_log (this->name, GF_LOG_DEBUG, + gf_log (this->name, GF_LOG_INFO, "Failed to get locks on fd"); goto cleanup; } @@ -1921,7 +1893,7 @@ afr_get_locks_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this, "Got a lock on fd"); if (is_afr_lock_eol (lock)) { - gf_log (this->name, GF_LOG_DEBUG, + gf_log (this->name, GF_LOG_INFO, "Reached EOL on locks on fd"); goto cleanup; } @@ -1969,7 +1941,7 @@ afr_lock_recovery (call_frame_t *frame, xlator_t *this) (void *) (long) source_child, priv->children[source_child], priv->children[source_child]->fops->lk, - local->fd, F_GETLK_FD, &flock); + local->fd, F_GETLK_FD, &flock, NULL); out: return ret; @@ -1989,7 +1961,7 @@ afr_mark_fd_opened (xlator_t *this, fd_t *fd, int32_t child_index) fdctx = (afr_fd_ctx_t *) (long) tmp; - fdctx->opened_on[child_index] = 1; + fdctx->opened_on[child_index] = AFR_FD_OPENED; out: return ret; @@ -1997,13 +1969,14 @@ out: int32_t afr_lock_recovery_preopen_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd) + int32_t op_ret, int32_t op_errno, fd_t *fd, + dict_t *xdata) { int32_t child_index = (long )cookie; int ret = 0; if (op_ret) { - gf_log (this->name, GF_LOG_DEBUG, + gf_log (this->name, GF_LOG_INFO, "Reopen during lock-recovery failed"); goto cleanup; } @@ -2013,14 +1986,14 @@ afr_lock_recovery_preopen_cbk (call_frame_t *frame, void *cookie, xlator_t *this ret = afr_lock_recovery (frame, this); if (ret) { - gf_log (this->name, GF_LOG_DEBUG, + gf_log (this->name, GF_LOG_INFO, "Lock recovery failed"); goto cleanup; } ret = afr_mark_fd_opened (this, fd, child_index); if (ret) { - gf_log (this->name, GF_LOG_DEBUG, + gf_log (this->name, GF_LOG_INFO, "Marking fd open failed"); goto cleanup; } @@ -2049,7 +2022,12 @@ afr_lock_recovery_preopen (call_frame_t *frame, xlator_t *this) GF_ASSERT (local && local->fd); ret = fd_ctx_get (local->fd, this, &tmp); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to get the context of fd", + uuid_utoa (local->fd->inode->gfid)); fdctx = (afr_fd_ctx_t *) (long) tmp; + /* TODO: instead we should return from the function */ GF_ASSERT (fdctx); child_index = local->lock_recovery_child; @@ -2064,8 +2042,7 @@ afr_lock_recovery_preopen (call_frame_t *frame, xlator_t *this) (void *)(long) child_index, priv->children[child_index], priv->children[child_index]->fops->open, - &loc, fdctx->flags, local->fd, - fdctx->wbflags); + &loc, fdctx->flags, local->fd, NULL); return 0; } @@ -2083,7 +2060,7 @@ is_fd_opened (fd_t *fd, int32_t child_index) fdctx = (afr_fd_ctx_t *) (long) tmp; - if (fdctx->opened_on[child_index]) + if (fdctx->opened_on[child_index] == AFR_FD_OPENED) ret = 1; out: @@ -2093,13 +2070,14 @@ out: int afr_attempt_lock_recovery (xlator_t *this, int32_t child_index) { - call_frame_t *frame = NULL; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_locked_fd_t *locked_fd = NULL; - afr_locked_fd_t *tmp = NULL; - int ret = 0; - struct list_head locks_list; + call_frame_t *frame = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_locked_fd_t *locked_fd = NULL; + afr_locked_fd_t *tmp = NULL; + int ret = -1; + struct list_head locks_list = {0,}; + int32_t op_errno = 0; priv = this->private; @@ -2109,25 +2087,14 @@ afr_attempt_lock_recovery (xlator_t *this, int32_t child_index) frame = create_frame (this, this->ctx->pool); if (!frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); ret = -1; goto out; } - local = GF_CALLOC (1, sizeof (*local), - gf_afr_mt_afr_local_t); - if (!local) { - gf_log (this->name, GF_LOG_DEBUG, - "Out of memory"); - ret = -1; - goto out; - } - - AFR_LOCAL_INIT (local, priv); - if (!local) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) { ret = -1; goto out; } @@ -2165,5 +2132,43 @@ afr_attempt_lock_recovery (xlator_t *this, int32_t child_index) } out: + if ((ret < 0) && frame) + AFR_STACK_DESTROY (frame); + return ret; +} + +int +afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom, + unsigned int child_count) +{ + afr_local_t *dst_local = NULL; + afr_local_t *src_local = NULL; + afr_internal_lock_t *dst_lock = NULL; + afr_internal_lock_t *src_lock = NULL; + afr_inodelk_t *dst_inodelk = NULL; + afr_inodelk_t *src_inodelk = NULL; + int ret = -1; + + src_local = src->local; + src_lock = &src_local->internal_lock; + src_inodelk = afr_get_inodelk (src_lock, dom); + dst_local = dst->local; + dst_lock = &dst_local->internal_lock; + dst_inodelk = afr_get_inodelk (dst_lock, dom); + if (!dst_inodelk || !src_inodelk) + goto out; + if (src_inodelk->locked_nodes) { + memcpy (dst_inodelk->locked_nodes, src_inodelk->locked_nodes, + sizeof (*dst_inodelk->locked_nodes) * child_count); + memset (src_inodelk->locked_nodes, 0, + sizeof (*src_inodelk->locked_nodes) * child_count); + } + + dst_lock->transaction_lk_type = src_lock->transaction_lk_type; + dst_lock->selfheal_lk_type = src_lock->selfheal_lk_type; + dst_inodelk->lock_count = src_inodelk->lock_count; + src_inodelk->lock_count = 0; + ret = 0; +out: return ret; } diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h index b4937efa2..73594f265 100644 --- a/xlators/cluster/afr/src/afr-mem-types.h +++ b/xlators/cluster/afr/src/afr-mem-types.h @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2009 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ @@ -26,7 +17,6 @@ enum gf_afr_mem_types_ { gf_afr_mt_iovec = gf_common_mt_end + 1, gf_afr_mt_afr_fd_ctx_t, - gf_afr_mt_afr_local_t, gf_afr_mt_afr_private_t, gf_afr_mt_int32_t, gf_afr_mt_char, @@ -42,6 +32,19 @@ enum gf_afr_mem_types_ { gf_afr_mt_entry_name, gf_afr_mt_pump_priv, gf_afr_mt_locked_fd, + gf_afr_mt_inode_ctx_t, + gf_afr_fd_paused_call_t, + gf_afr_mt_crawl_data_t, + gf_afr_mt_brick_pos_t, + gf_afr_mt_shd_bool_t, + gf_afr_mt_shd_timer_t, + gf_afr_mt_shd_event_t, + gf_afr_mt_time_t, + gf_afr_mt_pos_data_t, + gf_afr_mt_reply_t, + gf_afr_mt_stats_t, + gf_afr_mt_shd_crawl_event_t, + gf_afr_mt_uint64_t, gf_afr_mt_end }; #endif diff --git a/xlators/cluster/afr/src/afr-open.c b/xlators/cluster/afr/src/afr-open.c index 5569dd1e2..643a5d692 100644 --- a/xlators/cluster/afr/src/afr-open.c +++ b/xlators/cluster/afr/src/afr-open.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2009 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #include <libgen.h> @@ -55,419 +46,337 @@ #include "afr-self-heal.h" #include "afr-self-heal-common.h" - int -afr_open_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) +afr_stale_child_up (afr_local_t *local, xlator_t *this) { - afr_local_t * local = frame->local; - - AFR_STACK_UNWIND (open, frame, local->op_ret, local->op_errno, - local->fd); - return 0; -} - - -int -afr_open_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - fd_t *fd) -{ - afr_local_t * local = NULL; - - int child_index = (long) cookie; - - uint64_t ctx; - afr_fd_ctx_t *fd_ctx; - - int ret = 0; - - int call_count = -1; - - local = frame->local; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - local->op_errno = op_errno; - } - - if (op_ret >= 0) { - local->op_ret = op_ret; - local->success_count++; - - ret = afr_fd_ctx_set (this, fd); - - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "could not set fd ctx for fd=%p", - fd); - - local->op_ret = -1; - local->op_errno = -ret; - goto unlock; - } - - ret = fd_ctx_get (fd, this, &ctx); + int i = 0; + afr_private_t *priv = NULL; + int up = -1; - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "could not get fd ctx for fd=%p", fd); - local->op_ret = -1; - local->op_errno = -ret; - goto unlock; - } + priv = this->private; - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - fd_ctx->opened_on[child_index] = 1; - fd_ctx->flags = local->cont.open.flags; - fd_ctx->wbflags = local->cont.open.wbflags; - } - } -unlock: - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); + if (!local->fresh_children) + local->fresh_children = afr_children_create (priv->child_count); + if (!local->fresh_children) + goto out; - if (call_count == 0) { - if ((local->cont.open.flags & O_TRUNC) - && (local->op_ret >= 0)) { - STACK_WIND (frame, afr_open_ftruncate_cbk, - this, this->fops->ftruncate, - fd, 0); - } else { - AFR_STACK_UNWIND (open, frame, local->op_ret, - local->op_errno, local->fd); - } - } + afr_inode_get_read_ctx (this, local->fd->inode, local->fresh_children); + if (priv->child_count == afr_get_children_count (local->fresh_children, + priv->child_count)) + goto out; - return 0; + for (i = 0; i < priv->child_count; i++) { + if (!local->child_up[i]) + continue; + if (afr_is_child_present (local->fresh_children, + priv->child_count, i)) + continue; + up = i; + break; + } +out: + return up; } - -int -afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - fd_t *fd, int32_t wbflags) +void +afr_perform_data_self_heal (call_frame_t *frame, xlator_t *this) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - - int i = 0; - int ret = -1; - - int32_t call_count = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - int32_t wind_flags = flags & (~O_TRUNC); - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (loc, out); - - priv = this->private; - - if (afr_is_split_brain (this, loc->inode)) { - /* self-heal failed */ - op_errno = EIO; - goto out; - } - - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - frame->local = local; - call_count = local->call_count; - - loc_copy (&local->loc, loc); + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + inode_t *inode = NULL; + int st_child = -1; + char reason[64] = {0}; - local->cont.open.flags = flags; - local->cont.open.wbflags = wbflags; + local = frame->local; + sh = &local->self_heal; + inode = local->fd->inode; - local->fd = fd_ref (fd); + if (!IA_ISREG (inode->ia_type)) + goto out; - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_open_cbk, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->open, - loc, wind_flags, fd, wbflags); + st_child = afr_stale_child_up (local, this); + if (st_child < 0) + goto out; - if (!--call_count) - break; - } - } + sh->do_data_self_heal = _gf_true; + sh->do_metadata_self_heal = _gf_true; + sh->do_gfid_self_heal = _gf_true; + sh->do_missing_entry_self_heal = _gf_true; - op_ret = 0; + snprintf (reason, sizeof (reason), "stale subvolume %d detected", + st_child); + afr_launch_self_heal (frame, this, inode, _gf_true, inode->ia_type, + reason, NULL, NULL); out: - if (op_ret == -1) { - AFR_STACK_UNWIND (open, frame, op_ret, op_errno, fd); - } - - return 0; + return; } - int -afr_openfd_sh_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd) +afr_open_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + afr_local_t * local = frame->local; + afr_private_t *priv = NULL; - int ret = 0; + priv = this->private; + if (afr_open_only_data_self_heal (priv->data_self_heal)) + afr_perform_data_self_heal (frame, this); + AFR_STACK_UNWIND (open, frame, local->op_ret, local->op_errno, + local->fd, xdata); + return 0; +} - uint64_t ctx; - afr_fd_ctx_t *fd_ctx; - int call_count = 0; - int child_index = (long) cookie; +int +afr_open_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + fd_t *fd, dict_t *xdata) +{ + afr_local_t * local = NULL; + int ret = 0; + int call_count = -1; + int child_index = (long) cookie; + afr_private_t *priv = NULL; - priv = this->private; - local = frame->local; - int_lock = &local->internal_lock; + priv = this->private; + local = frame->local; LOCK (&frame->lock); { + if (op_ret == -1) { + local->op_errno = op_errno; + } + if (op_ret >= 0) { - ret = fd_ctx_get (fd, this, &ctx); + local->op_ret = op_ret; + local->success_count++; - if (ret < 0) { - goto out; + ret = afr_child_fd_ctx_set (this, fd, child_index, + local->cont.open.flags); + if (ret) { + local->op_ret = -1; + local->op_errno = -ret; + goto unlock; } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - fd_ctx->opened_on[child_index] = 1; - - gf_log (this->name, GF_LOG_TRACE, - "fd for %s opened successfully on subvolume %s", - local->loc.path, priv->children[child_index]->name); } } -out: +unlock: UNLOCK (&frame->lock); call_count = afr_frame_return (frame); if (call_count == 0) { - int_lock->lock_cbk = local->transaction.done; - local->transaction.resume (frame, this); + if ((local->cont.open.flags & O_TRUNC) + && (local->op_ret >= 0)) { + STACK_WIND (frame, afr_open_ftruncate_cbk, + this, this->fops->ftruncate, + fd, 0, NULL); + } else { + if (afr_open_only_data_self_heal (priv->data_self_heal)) + afr_perform_data_self_heal (frame, this); + AFR_STACK_UNWIND (open, frame, local->op_ret, + local->op_errno, local->fd, xdata); + } } return 0; } - -static int -__unopened_count (int child_count, unsigned int *opened_on, unsigned char *child_up) -{ - int i; - int count = 0; - - for (i = 0; i < child_count; i++) { - if (!opened_on[i] && child_up[i]) - count++; - } - - return count; -} - - int -afr_openfd_sh_unwind (call_frame_t *frame, xlator_t *this) +afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - uint64_t ctx; - afr_fd_ctx_t *fd_ctx; + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + int i = 0; + int ret = -1; + int32_t call_count = 0; + int32_t op_errno = 0; + int32_t wind_flags = flags & (~O_TRUNC); + //We can't let truncation to happen outside transaction. + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + VALIDATE_OR_GOTO (loc, out); + + priv = this->private; + + if (flags & (O_CREAT|O_TRUNC)) { + QUORUM_CHECK(open,out); + } - int abandon = 0; - int ret = 0; - int i; - int call_count = 0; + if (afr_is_split_brain (this, loc->inode)) { + /* self-heal failed */ + gf_log (this->name, GF_LOG_WARNING, + "failed to open as split brain seen, returning EIO"); + op_errno = EIO; + goto out; + } - priv = this->private; + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); local = frame->local; - /* - * Some subvolumes might have come up on which we never - * opened this fd in the first place. Re-open fd's on those - * subvolumes now. - */ - - ret = fd_ctx_get (local->fd, this, &ctx); - - if (ret < 0) { - abandon = 1; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) goto out; - } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - LOCK (&local->fd->lock); - { - call_count = __unopened_count (priv->child_count, - fd_ctx->opened_on, - local->child_up); - for (i = 0; i < priv->child_count; i++) { - fd_ctx->pre_op_done[i] = 0; - fd_ctx->pre_op_piggyback[i] = 0; - } - } - UNLOCK (&local->fd->lock); + call_count = local->call_count; + loc_copy (&local->loc, loc); - if (call_count == 0) { - abandon = 1; - goto out; - } + local->cont.open.flags = flags; - local->call_count = call_count; + local->fd = fd_ref (fd); for (i = 0; i < priv->child_count; i++) { - if (!fd_ctx->opened_on[i] && local->child_up[i]) { - gf_log (this->name, GF_LOG_TRACE, - "opening fd for %s on subvolume %s", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_openfd_sh_open_cbk, - (void *)(long) i, + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_open_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->open, - &local->loc, fd_ctx->flags, local->fd, - fd_ctx->wbflags); + loc, wind_flags, fd, xdata); if (!--call_count) break; } } + ret = 0; out: - if (abandon) - local->transaction.resume (frame, this); + if (ret < 0) + AFR_STACK_UNWIND (open, frame, -1, op_errno, fd, xdata); return 0; } - int -afr_openfd_sh (call_frame_t *frame, xlator_t *this) +afr_openfd_fix_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, + dict_t *xdata) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - char sh_type_str[256] = {0,}; - - local = frame->local; - sh = &local->self_heal; - - inode_path (local->fd->inode, NULL, (char **)&local->loc.path); - local->loc.name = strrchr (local->loc.path, '/'); - local->loc.inode = inode_ref (local->fd->inode); - local->loc.parent = inode_parent (local->fd->inode, 0, NULL); - - /* forcibly trigger missing-entries self-heal */ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + int call_count = 0; + int child_index = (long) cookie; - local->success_count = 1; - local->enoent_count = 1; - - sh->data_lock_held = _gf_true; - sh->need_data_self_heal = _gf_true; - sh->type = local->fd->inode->ia_type; - sh->background = _gf_false; - sh->unwind = afr_openfd_sh_unwind; - - afr_self_heal_type_str_get(&local->self_heal, - sh_type_str, - sizeof(sh_type_str)); - gf_log (this->name, GF_LOG_NORMAL, "%s self-heal triggered. " - "path: %s, reason: Replicate up down flush, data lock is held", - sh_type_str, local->loc.path); - - afr_self_heal (frame, this); - - return 0; -} - - -int -afr_openfd_flush_done (call_frame_t *frame, xlator_t *this) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - - uint64_t ctx; - afr_fd_ctx_t * fd_ctx = NULL; + priv = this->private; + local = frame->local; - int _ret = -1; + if (op_ret >= 0) { + gf_log (this->name, GF_LOG_DEBUG, "fd for %s opened " + "successfully on subvolume %s", local->loc.path, + priv->children[child_index]->name); + } else { + gf_log (this->name, GF_LOG_ERROR, "Failed to open %s " + "on subvolume %s", local->loc.path, + priv->children[child_index]->name); + } - priv = this->private; - local = frame->local; + fd_ctx = afr_fd_ctx_get (local->fd, this); + if (!fd_ctx) { + gf_log (this->name, GF_LOG_WARNING, + "failed to get fd context, %p", local->fd); + goto out; + } LOCK (&local->fd->lock); { - _ret = __fd_ctx_get (local->fd, this, &ctx); - - if (_ret < 0) { - goto out; + if (op_ret >= 0) { + fd_ctx->opened_on[child_index] = AFR_FD_OPENED; + } else { + fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED; } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - fd_ctx->down_count = priv->down_count; - fd_ctx->up_count = priv->up_count; } -out: UNLOCK (&local->fd->lock); +out: + call_count = afr_frame_return (frame); + if (call_count == 0) + AFR_STACK_DESTROY (frame); - afr_local_transaction_cleanup (local, this); - - gf_log (this->name, GF_LOG_TRACE, - "The up/down flush is over"); + return 0; +} - fd_unref (local->fd); - local->openfd_flush_cbk (frame, this); +void +afr_fix_open (xlator_t *this, fd_t *fd, size_t need_open_count, int *need_open) +{ + afr_private_t *priv = NULL; + int i = 0; + call_frame_t *frame = NULL; + afr_local_t *local = NULL; + int ret = -1; + int32_t op_errno = 0; + afr_fd_ctx_t *fd_ctx = NULL; - return 0; -} + priv = this->private; + if (!afr_is_fd_fixable (fd) || !need_open || !need_open_count) + goto out; -int -afr_openfd_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) -{ - afr_local_t * local = NULL; + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) { + ret = -1; + goto out; + } - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + frame = create_frame (this, this->ctx->pool); + if (!frame) { + ret = -1; + goto out; + } + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); local = frame->local; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) + goto out; - local->op = GF_FOP_FLUSH; + local->loc.inode = inode_ref (fd->inode); + ret = loc_path (&local->loc, NULL); + if (ret < 0) + goto out; local->fd = fd_ref (fd); + local->call_count = need_open_count; + + gf_log (this->name, GF_LOG_DEBUG, "need open count: %zd", + need_open_count); - local->transaction.fop = afr_openfd_sh; - local->transaction.done = afr_openfd_flush_done; + for (i = 0; i < priv->child_count; i++) { + if (!need_open[i]) + continue; - local->transaction.start = 0; - local->transaction.len = 0; + if (IA_IFDIR == fd->inode->ia_type) { + gf_log (this->name, GF_LOG_DEBUG, + "opening fd for dir %s on subvolume %s", + local->loc.path, priv->children[i]->name); - gf_log (this->name, GF_LOG_TRACE, - "doing up/down flush on fd=%p", - fd); + STACK_WIND_COOKIE (frame, afr_openfd_fix_open_cbk, + (void*) (long) i, + priv->children[i], + priv->children[i]->fops->opendir, + &local->loc, local->fd, + NULL); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "opening fd for file %s on subvolume %s", + local->loc.path, priv->children[i]->name); - afr_transaction (frame, this, AFR_DATA_TRANSACTION); + STACK_WIND_COOKIE (frame, afr_openfd_fix_open_cbk, + (void *)(long) i, + priv->children[i], + priv->children[i]->fops->open, + &local->loc, + fd_ctx->flags & (~O_TRUNC), + local->fd, NULL); + } + } + op_errno = 0; + ret = 0; out: - return 0; + if (op_errno) + ret = -1; //For handling ALLOC_OR_GOTO + if (ret && frame) + AFR_STACK_DESTROY (frame); } - diff --git a/xlators/cluster/afr/src/afr-self-heal-algorithm.c b/xlators/cluster/afr/src/afr-self-heal-algorithm.c index 804783201..83846f152 100644 --- a/xlators/cluster/afr/src/afr-self-heal-algorithm.c +++ b/xlators/cluster/afr/src/afr-self-heal-algorithm.c @@ -1,23 +1,15 @@ /* - Copyright (c) 2009 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ +#include <openssl/md5.h> #include "glusterfs.h" #include "afr.h" #include "xlator.h" @@ -33,7 +25,6 @@ #include "compat-errno.h" #include "compat.h" #include "byte-order.h" -#include "md5.h" #include "afr-transaction.h" #include "afr-self-heal.h" @@ -44,751 +35,624 @@ This file contains the various self-heal algorithms */ +static int +sh_loop_driver (call_frame_t *sh_frame, xlator_t *this, + gf_boolean_t is_first_call, call_frame_t *old_loop_frame); +static int +sh_loop_return (call_frame_t *sh_frame, xlator_t *this, call_frame_t *loop_frame, + int32_t op_ret, int32_t op_errno); +static int +sh_destroy_frame (call_frame_t *frame, xlator_t *this) +{ + if (!frame) + goto out; -/* - The "full" algorithm. Copies the entire file from - source to sinks. -*/ - + AFR_STACK_DESTROY (frame); +out: + return 0; +} static void -sh_full_private_cleanup (call_frame_t *frame, xlator_t *this) +sh_private_cleanup (call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - afr_self_heal_t * sh = NULL; - afr_sh_algo_full_private_t *sh_priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_sh_algo_private_t *sh_priv = NULL; local = frame->local; sh = &local->self_heal; sh_priv = sh->private; - - if (sh_priv) - GF_FREE (sh_priv); + GF_FREE (sh_priv); } - -static int -sh_full_loop_driver (call_frame_t *frame, xlator_t *this); - static int -sh_full_loop_return (call_frame_t *rw_frame, xlator_t *this, off_t offset) +sh_number_of_writes_needed (unsigned char *write_needed, int child_count) { - afr_local_t * rw_local = NULL; - afr_self_heal_t * rw_sh = NULL; - - call_frame_t *sh_frame = NULL; - afr_local_t * sh_local = NULL; - afr_self_heal_t *sh = NULL; - afr_sh_algo_full_private_t *sh_priv = NULL; - - rw_local = rw_frame->local; - rw_sh = &rw_local->self_heal; - - sh_frame = rw_sh->sh_frame; - sh_local = sh_frame->local; - sh = &sh_local->self_heal; - sh_priv = sh->private; + int writes = 0; + int i = 0; - LOCK (&sh_priv->lock); - { - sh_priv->loops_running--; + for (i = 0; i < child_count; i++) { + if (write_needed[i]) + writes++; } - UNLOCK (&sh_priv->lock); - - gf_log (this->name, GF_LOG_TRACE, - "loop for offset %"PRId64" returned", offset); - AFR_STACK_DESTROY (rw_frame); - - sh_full_loop_driver (sh_frame, this); - - return 0; + return writes; } static int -sh_full_write_cbk (call_frame_t *rw_frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) +sh_loop_driver_done (call_frame_t *sh_frame, xlator_t *this, + call_frame_t *last_loop_frame) { - afr_private_t * priv = NULL; - afr_local_t * rw_local = NULL; - afr_self_heal_t *rw_sh = NULL; - - call_frame_t *sh_frame = NULL; - afr_local_t * sh_local = NULL; - afr_self_heal_t *sh = NULL; - - int child_index = (long) cookie; - int call_count = 0; - - priv = this->private; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_sh_algo_private_t *sh_priv = NULL; + int32_t total_blocks = 0; + int32_t diff_blocks = 0; + + local = sh_frame->local; + sh = &local->self_heal; + sh_priv = sh->private; + if (sh_priv) { + total_blocks = sh_priv->total_blocks; + diff_blocks = sh_priv->diff_blocks; + } - rw_local = rw_frame->local; - rw_sh = &rw_local->self_heal; + sh_private_cleanup (sh_frame, this); + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { + GF_ASSERT (!last_loop_frame); + //loop_finish should have happened and the old_loop should be NULL + gf_log (this->name, GF_LOG_DEBUG, + "self-heal aborting on %s", + local->loc.path); - sh_frame = rw_sh->sh_frame; - sh_local = sh_frame->local; - sh = &sh_local->self_heal; + local->self_heal.algo_abort_cbk (sh_frame, this); + } else { + GF_ASSERT (last_loop_frame); + if (diff_blocks == total_blocks) { + gf_log (this->name, GF_LOG_DEBUG, "full self-heal " + "completed on %s",local->loc.path); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "diff self-heal on %s: completed. " + "(%d blocks of %d were different (%.2f%%))", + local->loc.path, diff_blocks, total_blocks, + ((diff_blocks * 1.0)/total_blocks) * 100); + } - gf_log (this->name, GF_LOG_TRACE, - "wrote %d bytes of data from %s to child %d, offset %"PRId64"", - op_ret, sh_local->loc.path, child_index, - rw_sh->offset - op_ret); + sh->old_loop_frame = last_loop_frame; + local->self_heal.algo_completion_cbk (sh_frame, this); + } - LOCK (&sh_frame->lock); - { - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "write to %s failed on subvolume %s (%s)", - sh_local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); + return 0; +} - sh->op_failed = 1; - } - } - UNLOCK (&sh_frame->lock); +int +sh_loop_finish (call_frame_t *loop_frame, xlator_t *this) +{ + afr_local_t *loop_local = NULL; + afr_self_heal_t *loop_sh = NULL; - call_count = afr_frame_return (rw_frame); + if (!loop_frame) + goto out; - if (call_count == 0) { - sh_full_loop_return (rw_frame, this, rw_sh->offset - op_ret); - } + loop_local = loop_frame->local; + if (loop_local) { + loop_sh = &loop_local->self_heal; + } - return 0; + if (loop_sh && loop_sh->data_lock_held) { + afr_sh_data_unlock (loop_frame, this, this->name, + sh_destroy_frame); + } else { + sh_destroy_frame (loop_frame, this); + } +out: + return 0; } - static int -sh_full_read_cbk (call_frame_t *rw_frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iovec *vector, int32_t count, struct iatt *buf, - struct iobref *iobref) +sh_loop_lock_success (call_frame_t *loop_frame, xlator_t *this) { - afr_private_t * priv = NULL; - afr_local_t * rw_local = NULL; - afr_self_heal_t *rw_sh = NULL; - - call_frame_t *sh_frame = NULL; - afr_local_t * sh_local = NULL; - afr_self_heal_t *sh = NULL; - - int i = 0; - int call_count = 0; - - off_t offset = (long) cookie; - - priv = this->private; - rw_local = rw_frame->local; - rw_sh = &rw_local->self_heal; - - sh_frame = rw_sh->sh_frame; - sh_local = sh_frame->local; - sh = &sh_local->self_heal; - - call_count = sh->active_sinks; - - rw_local->call_count = call_count; - - gf_log (this->name, GF_LOG_TRACE, - "read %d bytes of data from %s, offset %"PRId64"", - op_ret, sh_local->loc.path, offset); - - if (op_ret <= 0) { - sh->op_failed = 1; - - sh_full_loop_return (rw_frame, this, offset); - return 0; - } + afr_local_t *loop_local = NULL; + afr_self_heal_t *loop_sh = NULL; - rw_sh->offset += op_ret; + loop_local = loop_frame->local; + loop_sh = &loop_local->self_heal; - if (sh->file_has_holes) { - if (iov_0filled (vector, count) == 0) { - /* the iter function depends on the - sh->offset already being updated - above - */ + sh_loop_finish (loop_sh->old_loop_frame, this); + loop_sh->old_loop_frame = NULL; - sh_full_loop_return (rw_frame, this, offset); - goto out; - } - } - - for (i = 0; i < priv->child_count; i++) { - if (sh->sources[i] || !sh_local->child_up[i]) - continue; - - /* this is a sink, so write to it */ + gf_log (this->name, GF_LOG_DEBUG, "Acquired lock for range %"PRIu64 + " %"PRIu64, loop_sh->offset, loop_sh->block_size); + loop_sh->data_lock_held = _gf_true; + loop_sh->sh_data_algo_start (loop_frame, this); + return 0; +} - STACK_WIND_COOKIE (rw_frame, sh_full_write_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->writev, - sh->healing_fd, vector, count, offset, - iobref); +static int +sh_loop_lock_failure (call_frame_t *loop_frame, xlator_t *this) +{ + call_frame_t *sh_frame = NULL; + afr_local_t *loop_local = NULL; + afr_self_heal_t *loop_sh = NULL; + + loop_local = loop_frame->local; + loop_sh = &loop_local->self_heal; + sh_frame = loop_sh->sh_frame; + + gf_log (this->name, GF_LOG_ERROR, "failed lock for range %"PRIu64 + " %"PRIu64, loop_sh->offset, loop_sh->block_size); + sh_loop_finish (loop_sh->old_loop_frame, this); + loop_sh->old_loop_frame = NULL; + sh_loop_return (sh_frame, this, loop_frame, -1, ENOTCONN); + return 0; +} - if (!--call_count) - break; - } +static int +sh_loop_frame_create (call_frame_t *sh_frame, xlator_t *this, + call_frame_t *old_loop_frame, call_frame_t **loop_frame) +{ + call_frame_t *new_loop_frame = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_local_t *new_loop_local = NULL; + afr_self_heal_t *new_loop_sh = NULL; + afr_private_t *priv = NULL; + + GF_ASSERT (sh_frame); + GF_ASSERT (loop_frame); + + *loop_frame = NULL; + local = sh_frame->local; + sh = &local->self_heal; + priv = this->private; + + new_loop_frame = copy_frame (sh_frame); + if (!new_loop_frame) + goto out; + //We want the frame to have same lk_owner as sh_frame + //so that locks translator allows conflicting locks + new_loop_local = afr_self_heal_local_init (local, this); + if (!new_loop_local) + goto out; + new_loop_frame->local = new_loop_local; + new_loop_sh = &new_loop_local->self_heal; + new_loop_sh->sources = memdup (sh->sources, + priv->child_count * sizeof (*sh->sources)); + if (!new_loop_sh->sources) + goto out; + new_loop_sh->write_needed = GF_CALLOC (priv->child_count, + sizeof (*new_loop_sh->write_needed), + gf_afr_mt_char); + if (!new_loop_sh->write_needed) + goto out; + new_loop_sh->checksum = GF_CALLOC (priv->child_count, MD5_DIGEST_LENGTH, + gf_afr_mt_uint8_t); + if (!new_loop_sh->checksum) + goto out; + new_loop_sh->inode = inode_ref (sh->inode); + new_loop_sh->sh_data_algo_start = sh->sh_data_algo_start; + new_loop_sh->source = sh->source; + new_loop_sh->active_sinks = sh->active_sinks; + new_loop_sh->healing_fd = fd_ref (sh->healing_fd); + new_loop_sh->file_has_holes = sh->file_has_holes; + new_loop_sh->old_loop_frame = old_loop_frame; + new_loop_sh->sh_frame = sh_frame; + *loop_frame = new_loop_frame; + return 0; out: - return 0; + sh_destroy_frame (new_loop_frame, this); + return -ENOMEM; } - static int -sh_full_read_write (call_frame_t *frame, xlator_t *this, off_t offset) +sh_loop_start (call_frame_t *sh_frame, xlator_t *this, off_t offset, + call_frame_t *old_loop_frame) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_local_t * rw_local = NULL; - afr_self_heal_t *rw_sh = NULL; - afr_self_heal_t *sh = NULL; + call_frame_t *new_loop_frame = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_local_t *new_loop_local = NULL; + afr_self_heal_t *new_loop_sh = NULL; + int ret = 0; - call_frame_t *rw_frame = NULL; + GF_ASSERT (sh_frame); - int32_t op_errno = 0; + local = sh_frame->local; + sh = &local->self_heal; - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - rw_frame = copy_frame (frame); - if (!rw_frame) + ret = sh_loop_frame_create (sh_frame, this, old_loop_frame, + &new_loop_frame); + if (ret) goto out; - - ALLOC_OR_GOTO (rw_local, afr_local_t, out); - - rw_frame->local = rw_local; - rw_sh = &rw_local->self_heal; - - rw_sh->offset = offset; - rw_sh->sh_frame = frame; - - STACK_WIND_COOKIE (rw_frame, sh_full_read_cbk, - (void *) (long) offset, - priv->children[sh->source], - priv->children[sh->source]->fops->readv, - sh->healing_fd, sh->block_size, - offset); + new_loop_local = new_loop_frame->local; + new_loop_sh = &new_loop_local->self_heal; + new_loop_sh->offset = offset; + new_loop_sh->block_size = sh->block_size; + afr_sh_data_lock (new_loop_frame, this, offset, new_loop_sh->block_size, + _gf_true, this->name, sh_loop_lock_success, sh_loop_lock_failure); return 0; - out: - sh->op_failed = 1; - - sh_full_loop_driver (frame, this); - - return 0; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + if (old_loop_frame) + sh_loop_finish (old_loop_frame, this); + sh_loop_return (sh_frame, this, new_loop_frame, -1, ENOMEM); + return 0; } - static int -sh_full_loop_driver (call_frame_t *frame, xlator_t *this) +sh_loop_driver (call_frame_t *sh_frame, xlator_t *this, + gf_boolean_t is_first_call, call_frame_t *old_loop_frame) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_self_heal_t *sh = NULL; - afr_sh_algo_full_private_t *sh_priv = NULL; - - int loop = 0; - int recurse = 0; - - off_t offset = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; + afr_local_t * local = NULL; + afr_self_heal_t * sh = NULL; + afr_sh_algo_private_t *sh_priv = NULL; + gf_boolean_t is_driver_done = _gf_false; + blksize_t block_size = 0; + int loop = 0; + off_t offset = 0; + afr_private_t *priv = NULL; + + priv = this->private; + local = sh_frame->local; + sh = &local->self_heal; sh_priv = sh->private; - if (sh->op_failed) { - if (sh_priv->loops_running == 0) { - gf_log (this->name, GF_LOG_TRACE, - "full self-heal aborting on %s", - local->loc.path); - - sh_full_private_cleanup (frame, this); - local->self_heal.algo_abort_cbk (frame, this); - } - - goto out; - } - - if (sh_priv->offset >= sh->file_size) { - if (sh_priv->loops_running == 0) { - - gf_log (this->name, GF_LOG_TRACE, - "full self-heal completed on %s", - local->loc.path); - - sh_full_private_cleanup (frame, this); - local->self_heal.algo_completion_cbk (frame, this); - } - - goto out; - } - -spawn: - loop = 0; - recurse = 0; - LOCK (&sh_priv->lock); { - if ((sh_priv->loops_running < priv->data_self_heal_window_size) - && (sh_priv->offset < sh->file_size)) { - - gf_log (this->name, GF_LOG_TRACE, - "spawning a loop for offset %"PRId64, - sh_priv->offset); - - offset = sh_priv->offset; - sh_priv->offset += sh->block_size; - + if (!is_first_call) + sh_priv->loops_running--; + offset = sh_priv->offset; + block_size = sh->block_size; + while ((!sh->eof_reached) && + (!is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) && + (sh_priv->loops_running < priv->data_self_heal_window_size) + && (sh_priv->offset < sh->file_size)) { + + loop++; + sh_priv->offset += block_size; sh_priv->loops_running++; - loop = 1; - - if (sh_priv->offset < sh->file_size) - recurse = 1; + if (!is_first_call) + break; + } + if (0 == sh_priv->loops_running) { + is_driver_done = _gf_true; } } UNLOCK (&sh_priv->lock); - if (loop) { - sh_full_read_write (frame, this, offset); - if (recurse) - goto spawn; + if (0 == loop) { + //loop finish does unlock, but the erasing of the pending + //xattrs needs to happen before that so do not finish the loop + if (is_driver_done && + !is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) + goto driver_done; + if (old_loop_frame) { + sh_loop_finish (old_loop_frame, this); + old_loop_frame = NULL; + } } -out: - return 0; -} - - -int -afr_sh_algo_full (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = NULL; - afr_self_heal_t * sh = NULL; - afr_sh_algo_full_private_t *sh_priv = NULL; - - local = frame->local; - sh = &local->self_heal; - - sh_priv = GF_CALLOC (1, sizeof (*sh_priv), - gf_afr_mt_afr_private_t); - - LOCK_INIT (&sh_priv->lock); - - sh->private = sh_priv; - - local->call_count = 0; - - sh_full_loop_driver (frame, this); - return 0; -} - - -/* - * The "diff" algorithm. Copies only those blocks whose checksums - * don't match with those of source. - */ - - -static void -sh_diff_private_cleanup (call_frame_t *frame, xlator_t *this) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_self_heal_t * sh = NULL; - afr_sh_algo_diff_private_t *sh_priv = NULL; - - int i; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - sh_priv = sh->private; - - for (i = 0; i < priv->data_self_heal_window_size; i++) { - if (sh_priv->loops[i]) { - if (sh_priv->loops[i]->write_needed) - GF_FREE (sh_priv->loops[i]->write_needed); - - if (sh_priv->loops[i]->checksum) - GF_FREE (sh_priv->loops[i]->checksum); + //If we have more loops to form we should finish previous loop after + //the next loop lock + while (loop--) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { + // op failed in other loop, stop spawning more loops + if (old_loop_frame) { + sh_loop_finish (old_loop_frame, this); + old_loop_frame = NULL; + } + sh_loop_driver (sh_frame, this, _gf_false, NULL); + } else { + gf_log (this->name, GF_LOG_TRACE, "spawning a loop " + "for offset %"PRId64, offset); - GF_FREE (sh_priv->loops[i]); + sh_loop_start (sh_frame, this, offset, old_loop_frame); + old_loop_frame = NULL; + offset += block_size; } } - if (sh_priv) { - if (sh_priv->loops) - GF_FREE (sh_priv->loops); - - GF_FREE (sh_priv); +driver_done: + if (is_driver_done) { + sh_loop_driver_done (sh_frame, this, old_loop_frame); } - - -} - - -static uint32_t -__make_cookie (int loop_index, int child_index) -{ - uint32_t ret = (loop_index << 16) | child_index; - return ret; -} - - -static int -__loop_index (uint32_t cookie) -{ - return (cookie & 0xFFFF0000) >> 16; + return 0; } - static int -__child_index (uint32_t cookie) +sh_loop_return (call_frame_t *sh_frame, xlator_t *this, call_frame_t *loop_frame, + int32_t op_ret, int32_t op_errno) { - return (cookie & 0x0000FFFF); -} - + afr_local_t * loop_local = NULL; + afr_self_heal_t * loop_sh = NULL; + afr_local_t * sh_local = NULL; + afr_self_heal_t *sh = NULL; -static void -sh_diff_loop_state_reset (struct sh_diff_loop_state *loop_state, int child_count) -{ - loop_state->active = _gf_false; -// loop_state->offset = 0; - - memset (loop_state->write_needed, - 0, sizeof (*loop_state->write_needed) * child_count); - - memset (loop_state->checksum, - 0, MD5_DIGEST_LEN * child_count); -} - - -static int -sh_diff_number_of_writes_needed (unsigned char *write_needed, int child_count) -{ - int writes = 0; - int i; + sh_local = sh_frame->local; + sh = &sh_local->self_heal; - for (i = 0; i < child_count; i++) { - if (write_needed[i]) - writes++; + if (loop_frame) { + loop_local = loop_frame->local; + if (loop_local) + loop_sh = &loop_local->self_heal; + if (loop_sh) + gf_log (this->name, GF_LOG_TRACE, "loop for offset " + "%"PRId64" returned", loop_sh->offset); } - return writes; -} - + if (op_ret == -1) { + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + afr_sh_set_error (sh, op_errno); + if (loop_frame) { + sh_loop_finish (loop_frame, this); + loop_frame = NULL; + } + } -static int -sh_diff_loop_driver (call_frame_t *frame, xlator_t *this); + sh_loop_driver (sh_frame, this, _gf_false, loop_frame); + return 0; +} static int -sh_diff_loop_return (call_frame_t *rw_frame, xlator_t *this, - struct sh_diff_loop_state *loop_state) +sh_loop_write_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + struct iatt *postbuf, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * rw_local = NULL; - afr_self_heal_t * rw_sh = NULL; - - call_frame_t *sh_frame = NULL; - afr_local_t * sh_local = NULL; - afr_self_heal_t *sh = NULL; - afr_sh_algo_diff_private_t *sh_priv = NULL; - - priv = this->private; - - rw_local = rw_frame->local; - rw_sh = &rw_local->self_heal; - - sh_frame = rw_sh->sh_frame; + afr_private_t * priv = NULL; + afr_local_t * loop_local = NULL; + afr_self_heal_t * loop_sh = NULL; + call_frame_t *sh_frame = NULL; + afr_local_t * sh_local = NULL; + afr_self_heal_t *sh = NULL; + int call_count = 0; + int child_index = 0; + + priv = this->private; + loop_local = loop_frame->local; + loop_sh = &loop_local->self_heal; + + sh_frame = loop_sh->sh_frame; sh_local = sh_frame->local; sh = &sh_local->self_heal; - sh_priv = sh->private; + + child_index = (long) cookie; gf_log (this->name, GF_LOG_TRACE, - "loop for offset %"PRId64" returned", loop_state->offset); + "wrote %d bytes of data from %s to child %d, offset %"PRId64"", + op_ret, sh_local->loc.path, child_index, loop_sh->offset); - LOCK (&sh_priv->lock); - { - sh_priv->loops_running--; - sh_diff_loop_state_reset (loop_state, priv->child_count); + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "write to %s failed on subvolume %s (%s)", + sh_local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + afr_sh_set_error (loop_sh, op_errno); + } else if (op_ret < loop_local->cont.writev.vector->iov_len) { + gf_log (this->name, GF_LOG_ERROR, + "incomplete write to %s on subvolume %s " + "(expected %lu, returned %d)", sh_local->loc.path, + priv->children[child_index]->name, + loop_local->cont.writev.vector->iov_len, op_ret); + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); } - UNLOCK (&sh_priv->lock); - AFR_STACK_DESTROY (rw_frame); + call_count = afr_frame_return (loop_frame); + + if (call_count == 0) { + iobref_unref(loop_local->cont.writev.iobref); - sh_diff_loop_driver (sh_frame, this); + sh_loop_return (sh_frame, this, loop_frame, + loop_sh->op_ret, loop_sh->op_errno); + } return 0; } - -static int -sh_diff_write_cbk (call_frame_t *rw_frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf, - struct iatt *postbuf) +static void +sh_prune_writes_needed (call_frame_t *sh_frame, call_frame_t *loop_frame, + afr_private_t *priv) { - afr_private_t * priv = NULL; - afr_local_t * rw_local = NULL; - afr_self_heal_t * rw_sh = NULL; - - call_frame_t *sh_frame = NULL; - afr_local_t * sh_local = NULL; - afr_self_heal_t *sh = NULL; + afr_local_t *sh_local = NULL; + afr_self_heal_t *sh = NULL; + afr_local_t *loop_local = NULL; + afr_self_heal_t *loop_sh = NULL; + int i = 0; - afr_sh_algo_diff_private_t *sh_priv; - struct sh_diff_loop_state *loop_state; + sh_local = sh_frame->local; + sh = &sh_local->self_heal; - int call_count = 0; - int child_index = 0; - int loop_index = 0; + if (!strcmp (sh->algo->name, "diff")) + return; - priv = this->private; - rw_local = rw_frame->local; - rw_sh = &rw_local->self_heal; + loop_local = loop_frame->local; + loop_sh = &loop_local->self_heal; - sh_frame = rw_sh->sh_frame; - sh_local = sh_frame->local; - sh = &sh_local->self_heal; - sh_priv = sh->private; - - child_index = __child_index ((uint32_t) (long) cookie); - loop_index = __loop_index ((uint32_t) (long) cookie); - loop_state = sh_priv->loops[loop_index]; - - gf_log (this->name, GF_LOG_TRACE, - "wrote %d bytes of data from %s to child %d, offset %"PRId64"", - op_ret, sh_local->loc.path, child_index, - loop_state->offset); - - LOCK (&sh_frame->lock); - { - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "write to %s failed on subvolume %s (%s)", - sh_local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - - sh->op_failed = 1; - } - } - UNLOCK (&sh_frame->lock); - - call_count = afr_frame_return (rw_frame); - - if (call_count == 0) { - sh_diff_loop_return (rw_frame, this, loop_state); - } - - return 0; + /* full self-heal guarantees there exists atleast 1 file with size 0 + * That means for other files we can preserve holes that come after + * its size before 'trim' + */ + for (i = 0; i < priv->child_count; i++) { + if (loop_sh->write_needed[i] && + ((loop_sh->offset + 1) > sh->buf[i].ia_size)) + loop_sh->write_needed[i] = 0; + } } - static int -sh_diff_read_cbk (call_frame_t *rw_frame, void *cookie, +sh_loop_read_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iovec *vector, int32_t count, struct iatt *buf, - struct iobref *iobref) + struct iobref *iobref, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * rw_local = NULL; - afr_self_heal_t * rw_sh = NULL; - - afr_sh_algo_diff_private_t * sh_priv = NULL; - - call_frame_t *sh_frame = NULL; - afr_local_t * sh_local = NULL; - afr_self_heal_t *sh = NULL; - - int loop_index; - struct sh_diff_loop_state *loop_state; - - uint32_t wcookie; - - int i = 0; - int call_count = 0; - - priv = this->private; - rw_local = rw_frame->local; - rw_sh = &rw_local->self_heal; - - sh_frame = rw_sh->sh_frame; + afr_private_t * priv = NULL; + afr_local_t * loop_local = NULL; + afr_self_heal_t * loop_sh = NULL; + call_frame_t *sh_frame = NULL; + int i = 0; + int call_count = 0; + afr_local_t * sh_local = NULL; + afr_self_heal_t * sh = NULL; + + priv = this->private; + loop_local = loop_frame->local; + loop_sh = &loop_local->self_heal; + + sh_frame = loop_sh->sh_frame; sh_local = sh_frame->local; sh = &sh_local->self_heal; - sh_priv = sh->private; - - loop_index = __loop_index ((uint32_t) (long) cookie); - loop_state = sh_priv->loops[loop_index]; - - call_count = sh_diff_number_of_writes_needed (loop_state->write_needed, - priv->child_count); - rw_local->call_count = call_count; - - gf_log (this->name, GF_LOG_TRACE, - "read %d bytes of data from %s, offset %"PRId64"", - op_ret, sh_local->loc.path, loop_state->offset); - - if ((op_ret <= 0) || - (call_count == 0)) { - sh_diff_loop_return (rw_frame, this, loop_state); + gf_log (this->name, GF_LOG_TRACE, + "read %d bytes of data from %s, offset %"PRId64"", + op_ret, loop_local->loc.path, loop_sh->offset); + + if (op_ret <= 0) { + if (op_ret < 0) { + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + gf_log (this->name, GF_LOG_ERROR, "read failed on %d " + "for %s reason :%s", sh->source, + sh_local->loc.path, strerror (errno)); + } else { + sh->eof_reached = _gf_true; + gf_log (this->name, GF_LOG_DEBUG, "Eof reached for %s", + sh_local->loc.path); + } + sh_loop_return (sh_frame, this, loop_frame, op_ret, op_errno); + goto out; + } - return 0; - } + if (loop_sh->file_has_holes && iov_0filled (vector, count) == 0) + sh_prune_writes_needed (sh_frame, loop_frame, priv); - if (sh->file_has_holes) { - if (iov_0filled (vector, count) == 0) { + call_count = sh_number_of_writes_needed (loop_sh->write_needed, + priv->child_count); + if (call_count == 0) { + sh_loop_return (sh_frame, this, loop_frame, 0, 0); + goto out; + } - sh_diff_loop_return (rw_frame, this, loop_state); - goto out; - } - } + loop_local->call_count = call_count; - for (i = 0; i < priv->child_count; i++) { - if (loop_state->write_needed[i]) { - wcookie = __make_cookie (loop_index, i); + /* + * We only really need the request size at the moment, but the buffer + * is required if we want to issue a retry in the event of a short write. + * Therefore, we duplicate the vector and ref the iobref here... + */ + loop_local->cont.writev.vector = iov_dup(vector, count); + loop_local->cont.writev.iobref = iobref_ref(iobref); - STACK_WIND_COOKIE (rw_frame, sh_diff_write_cbk, - (void *) (long) wcookie, - priv->children[i], - priv->children[i]->fops->writev, - sh->healing_fd, vector, count, - loop_state->offset, iobref); + for (i = 0; i < priv->child_count; i++) { + if (!loop_sh->write_needed[i]) + continue; + STACK_WIND_COOKIE (loop_frame, sh_loop_write_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->writev, + loop_sh->healing_fd, vector, count, + loop_sh->offset, 0, iobref, NULL); - if (!--call_count) - break; - } + if (!--call_count) + break; } out: - return 0; + return 0; } static int -sh_diff_read (call_frame_t *rw_frame, xlator_t *this, - int loop_index) +sh_loop_read (call_frame_t *loop_frame, xlator_t *this) { - afr_private_t * priv = NULL; - afr_local_t * rw_local = NULL; - afr_self_heal_t * rw_sh = NULL; - - afr_sh_algo_diff_private_t * sh_priv = NULL; - struct sh_diff_loop_state *loop_state; - - call_frame_t *sh_frame = NULL; - afr_local_t * sh_local = NULL; - afr_self_heal_t *sh = NULL; - - uint32_t cookie; - - priv = this->private; - rw_local = rw_frame->local; - rw_sh = &rw_local->self_heal; - - sh_frame = rw_sh->sh_frame; - sh_local = sh_frame->local; - sh = &sh_local->self_heal; - sh_priv = sh->private; - - loop_state = sh_priv->loops[loop_index]; + afr_private_t *priv = NULL; + afr_local_t *loop_local = NULL; + afr_self_heal_t *loop_sh = NULL; - cookie = __make_cookie (loop_index, sh->source); + priv = this->private; + loop_local = loop_frame->local; + loop_sh = &loop_local->self_heal; - STACK_WIND_COOKIE (rw_frame, sh_diff_read_cbk, - (void *) (long) cookie, - priv->children[sh->source], - priv->children[sh->source]->fops->readv, - sh->healing_fd, sh_priv->block_size, - loop_state->offset); + STACK_WIND_COOKIE (loop_frame, sh_loop_read_cbk, + (void *) (long) loop_sh->source, + priv->children[loop_sh->source], + priv->children[loop_sh->source]->fops->readv, + loop_sh->healing_fd, loop_sh->block_size, + loop_sh->offset, 0, NULL); - return 0; + return 0; } static int -sh_diff_checksum_cbk (call_frame_t *rw_frame, void *cookie, xlator_t *this, +sh_diff_checksum_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - uint32_t weak_checksum, uint8_t *strong_checksum) + uint32_t weak_checksum, uint8_t *strong_checksum, + dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * rw_local = NULL; - afr_self_heal_t *rw_sh = NULL; - - call_frame_t *sh_frame = NULL; - afr_local_t * sh_local = NULL; - afr_self_heal_t *sh = NULL; - - afr_sh_algo_diff_private_t * sh_priv = NULL; + afr_private_t *priv = NULL; + afr_local_t *loop_local = NULL; + afr_self_heal_t *loop_sh = NULL; + call_frame_t *sh_frame = NULL; + afr_local_t *sh_local = NULL; + afr_self_heal_t *sh = NULL; + afr_sh_algo_private_t *sh_priv = NULL; + int child_index = 0; + int call_count = 0; + int i = 0; + int write_needed = 0; - int loop_index = 0; - int child_index = 0; - struct sh_diff_loop_state *loop_state; - - int call_count = 0; - int i = 0; - int write_needed = 0; - - priv = this->private; + priv = this->private; - rw_local = rw_frame->local; - rw_sh = &rw_local->self_heal; + loop_local = loop_frame->local; + loop_sh = &loop_local->self_heal; - sh_frame = rw_sh->sh_frame; + sh_frame = loop_sh->sh_frame; sh_local = sh_frame->local; sh = &sh_local->self_heal; sh_priv = sh->private; - child_index = __child_index ((uint32_t) (long) cookie); - loop_index = __loop_index ((uint32_t) (long) cookie); - - loop_state = sh_priv->loops[loop_index]; + child_index = (long) cookie; if (op_ret < 0) { gf_log (this->name, GF_LOG_ERROR, "checksum on %s failed on subvolume %s (%s)", sh_local->loc.path, priv->children[child_index]->name, strerror (op_errno)); - - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); } else { - memcpy (loop_state->checksum + child_index * MD5_DIGEST_LEN, - strong_checksum, - MD5_DIGEST_LEN); + memcpy (loop_sh->checksum + child_index * MD5_DIGEST_LENGTH, + strong_checksum, MD5_DIGEST_LENGTH); } - call_count = afr_frame_return (rw_frame); + call_count = afr_frame_return (loop_frame); if (call_count == 0) { for (i = 0; i < priv->child_count; i++) { if (sh->sources[i] || !sh_local->child_up[i]) continue; - if (memcmp (loop_state->checksum + (i * MD5_DIGEST_LEN), - loop_state->checksum + (sh->source * MD5_DIGEST_LEN), - MD5_DIGEST_LEN)) { + if (memcmp (loop_sh->checksum + (i * MD5_DIGEST_LENGTH), + loop_sh->checksum + (sh->source * MD5_DIGEST_LENGTH), + MD5_DIGEST_LENGTH)) { /* - Checksums differ, so this block - must be written to this sink + Checksums differ, so this block + must be written to this sink */ - gf_log (this->name, GF_LOG_TRACE, + gf_log (this->name, GF_LOG_DEBUG, "checksum on subvolume %s at offset %" PRId64" differs from that on source", - priv->children[i]->name, loop_state->offset); + priv->children[i]->name, loop_sh->offset); - write_needed = loop_state->write_needed[i] = 1; + write_needed = loop_sh->write_needed[i] = 1; } } @@ -800,270 +664,171 @@ sh_diff_checksum_cbk (call_frame_t *rw_frame, void *cookie, xlator_t *this, } UNLOCK (&sh_priv->lock); - if (write_needed && !sh->op_failed) { - sh_diff_read (rw_frame, this, loop_index); + if (write_needed && + !is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { + sh_loop_read (loop_frame, this); } else { - sh->offset += sh_priv->block_size; - - sh_diff_loop_return (rw_frame, this, loop_state); + sh_loop_return (sh_frame, this, loop_frame, + op_ret, op_errno); } } return 0; } - -static int -sh_diff_find_unused_loop (afr_sh_algo_diff_private_t *sh_priv, int max) -{ - int i; - - LOCK (&sh_priv->lock); - { - for (i = 0; i < max; i++) { - if (sh_priv->loops[i]->active == _gf_false) { - sh_priv->loops[i]->active = _gf_true; - break; - } - } - } - UNLOCK (&sh_priv->lock); - - if (i == max) { - gf_log ("[sh-diff]", GF_LOG_ERROR, - "no free loops found! This shouldn't happen. Please" - " report this to gluster-devel@nongnu.org"); - } - - return i; -} - - static int -sh_diff_checksum (call_frame_t *frame, xlator_t *this, off_t offset) +sh_diff_checksum (call_frame_t *loop_frame, xlator_t *this) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_local_t * rw_local = NULL; - afr_self_heal_t * sh = NULL; - afr_self_heal_t * rw_sh = NULL; - - afr_sh_algo_diff_private_t * sh_priv = NULL; - - call_frame_t *rw_frame = NULL; - - uint32_t cookie; - int loop_index = 0; - struct sh_diff_loop_state *loop_state = NULL; - - int32_t op_errno = 0; - - int call_count = 0; - int i = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; + afr_private_t *priv = NULL; + afr_local_t *loop_local = NULL; + afr_self_heal_t *loop_sh = NULL; + int call_count = 0; + int i = 0; - sh_priv = sh->private; - - rw_frame = copy_frame (frame); - if (!rw_frame) - goto out; - - ALLOC_OR_GOTO (rw_local, afr_local_t, out); - - rw_frame->local = rw_local; - rw_sh = &rw_local->self_heal; - - rw_sh->offset = sh->offset; - rw_sh->sh_frame = frame; + priv = this->private; + loop_local = loop_frame->local; + loop_sh = &loop_local->self_heal; - call_count = sh->active_sinks + 1; /* sinks and source */ + call_count = loop_sh->active_sinks + 1; /* sinks and source */ - rw_local->call_count = call_count; + loop_local->call_count = call_count; - loop_index = sh_diff_find_unused_loop (sh_priv, priv->data_self_heal_window_size); - - loop_state = sh_priv->loops[loop_index]; - loop_state->offset = offset; - - /* we need to send both the loop index and child index, - so squeeze them both into a 32-bit number */ - - cookie = __make_cookie (loop_index, sh->source); - - STACK_WIND_COOKIE (rw_frame, sh_diff_checksum_cbk, - (void *) (long) cookie, - priv->children[sh->source], - priv->children[sh->source]->fops->rchecksum, - sh->healing_fd, - offset, sh_priv->block_size); + STACK_WIND_COOKIE (loop_frame, sh_diff_checksum_cbk, + (void *) (long) loop_sh->source, + priv->children[loop_sh->source], + priv->children[loop_sh->source]->fops->rchecksum, + loop_sh->healing_fd, + loop_sh->offset, loop_sh->block_size, NULL); for (i = 0; i < priv->child_count; i++) { - if (sh->sources[i] || !local->child_up[i]) + if (loop_sh->sources[i] || !loop_local->child_up[i]) continue; - cookie = __make_cookie (loop_index, i); - - STACK_WIND_COOKIE (rw_frame, sh_diff_checksum_cbk, - (void *) (long) cookie, + STACK_WIND_COOKIE (loop_frame, sh_diff_checksum_cbk, + (void *) (long) i, priv->children[i], priv->children[i]->fops->rchecksum, - sh->healing_fd, - offset, sh_priv->block_size); + loop_sh->healing_fd, + loop_sh->offset, loop_sh->block_size, NULL); if (!--call_count) break; } return 0; - -out: - sh->op_failed = 1; - - sh_diff_loop_driver (frame, this); - - return 0; } - static int -sh_diff_loop_driver (call_frame_t *frame, xlator_t *this) +sh_full_read_write_to_sinks (call_frame_t *loop_frame, xlator_t *this) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_self_heal_t * sh = NULL; - afr_sh_algo_diff_private_t *sh_priv = NULL; - - int loop = 0; - int recurse = 0; - - off_t offset = 0; - char sh_type_str[256] = {0,}; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - sh_priv = sh->private; - - afr_self_heal_type_str_get(sh, sh_type_str, sizeof(sh_type_str)); - - if (sh->op_failed) { - if (sh_priv->loops_running == 0) { - gf_log (this->name, GF_LOG_ERROR, - "diff %s self-heal aborting on %s", - sh_type_str, local->loc.path); - - sh_diff_private_cleanup (frame, this); - local->self_heal.algo_abort_cbk (frame, this); - } - - goto out; - } - - if (sh_priv->offset >= sh->file_size) { - if (sh_priv->loops_running == 0) { - gf_log (this->name, GF_LOG_TRACE, - "diff %s self-heal completed on %s", - sh_type_str, local->loc.path); + afr_private_t *priv = NULL; + afr_local_t *loop_local = NULL; + afr_self_heal_t *loop_sh = NULL; + int i = 0; + priv = this->private; + loop_local = loop_frame->local; + loop_sh = &loop_local->self_heal; - gf_log (this->name, GF_LOG_NORMAL, - "diff %s self-heal on %s: %d blocks of %d were different (%.2f%%)", - sh_type_str, local->loc.path, - sh_priv->diff_blocks, sh_priv->total_blocks, - ((sh_priv->diff_blocks * 1.0)/sh_priv->total_blocks) * 100); - - sh_diff_private_cleanup (frame, this); - local->self_heal.algo_completion_cbk (frame, this); - } - - goto out; - } - -spawn: - loop = 0; - recurse = 0; - - LOCK (&sh_priv->lock); - { - if ((sh_priv->loops_running < priv->data_self_heal_window_size) - && (sh_priv->offset < sh->file_size)) { - - gf_log (this->name, GF_LOG_TRACE, - "spawning a loop for offset %"PRId64, - sh_priv->offset); - - offset = sh_priv->offset; - sh_priv->offset += sh_priv->block_size; - - sh_priv->loops_running++; - - loop = 1; - - if (sh_priv->offset < sh->file_size) - recurse = 1; - } - } - UNLOCK (&sh_priv->lock); - - if (loop) { - sh_diff_checksum (frame, this, offset); - if (recurse) - goto spawn; + for (i = 0; i < priv->child_count; i++) { + if (loop_sh->sources[i] || !loop_local->child_up[i]) + continue; + loop_sh->write_needed[i] = 1; } - -out: + sh_loop_read (loop_frame, this); return 0; } - -int -afr_sh_algo_diff (call_frame_t *frame, xlator_t *this) +afr_sh_algo_private_t* +afr_sh_priv_init () { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_self_heal_t * sh = NULL; - afr_sh_algo_diff_private_t *sh_priv = NULL; - - int i; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; + afr_sh_algo_private_t *sh_priv = NULL; sh_priv = GF_CALLOC (1, sizeof (*sh_priv), gf_afr_mt_afr_private_t); - - sh_priv->block_size = this->ctx->page_size; - - sh->private = sh_priv; + if (!sh_priv) + goto out; LOCK_INIT (&sh_priv->lock); +out: + return sh_priv; +} - local->call_count = 0; +int +afr_sh_transfer_lock (call_frame_t *dst, call_frame_t *src, char *dom, + unsigned int child_count) +{ + afr_local_t *dst_local = NULL; + afr_self_heal_t *dst_sh = NULL; + afr_local_t *src_local = NULL; + afr_self_heal_t *src_sh = NULL; + int ret = -1; + + dst_local = dst->local; + dst_sh = &dst_local->self_heal; + src_local = src->local; + src_sh = &src_local->self_heal; + GF_ASSERT (src_sh->data_lock_held); + GF_ASSERT (!dst_sh->data_lock_held); + ret = afr_lk_transfer_datalock (dst, src, dom, child_count); + if (ret) + return ret; + src_sh->data_lock_held = _gf_false; + dst_sh->data_lock_held = _gf_true; + return 0; +} - sh_priv->loops = GF_CALLOC (priv->data_self_heal_window_size, - sizeof (*sh_priv->loops), - gf_afr_mt_sh_diff_loop_state); +int +afr_sh_start_loops (call_frame_t *sh_frame, xlator_t *this, + afr_sh_algo_fn sh_data_algo_start) +{ + call_frame_t *first_loop_frame = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int ret = 0; + afr_private_t *priv = NULL; - for (i = 0; i < priv->data_self_heal_window_size; i++) { - sh_priv->loops[i] = GF_CALLOC (1, sizeof (*sh_priv->loops[i]), - gf_afr_mt_sh_diff_loop_state); + local = sh_frame->local; + sh = &local->self_heal; + priv = this->private; - sh_priv->loops[i]->checksum = GF_CALLOC (priv->child_count, - MD5_DIGEST_LEN, gf_afr_mt_uint8_t); - sh_priv->loops[i]->write_needed = GF_CALLOC (priv->child_count, - sizeof (*sh_priv->loops[i]->write_needed), - gf_afr_mt_char); + sh->sh_data_algo_start = sh_data_algo_start; + local->call_count = 0; + ret = sh_loop_frame_create (sh_frame, this, NULL, &first_loop_frame); + if (ret) + goto out; + ret = afr_sh_transfer_lock (first_loop_frame, sh_frame, this->name, + priv->child_count); + if (ret) + goto out; + sh->private = afr_sh_priv_init (); + if (!sh->private) { + ret = -1; + goto out; } + sh_loop_driver (sh_frame, this, _gf_true, first_loop_frame); + ret = 0; +out: + if (ret) { + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + sh_loop_driver_done (sh_frame, this, NULL); + } + return 0; +} - sh_diff_loop_driver (frame, this); - +int +afr_sh_algo_diff (call_frame_t *sh_frame, xlator_t *this) +{ + afr_sh_start_loops (sh_frame, this, sh_diff_checksum); return 0; } +int +afr_sh_algo_full (call_frame_t *sh_frame, xlator_t *this) +{ + afr_sh_start_loops (sh_frame, this, sh_full_read_write_to_sinks); + return 0; +} struct afr_sh_algorithm afr_self_heal_algorithms[] = { {.name = "full", .fn = afr_sh_algo_full}, diff --git a/xlators/cluster/afr/src/afr-self-heal-algorithm.h b/xlators/cluster/afr/src/afr-self-heal-algorithm.h index 4b27d2afe..6b20789b1 100644 --- a/xlators/cluster/afr/src/afr-self-heal-algorithm.h +++ b/xlators/cluster/afr/src/afr-self-heal-algorithm.h @@ -1,26 +1,16 @@ /* - Copyright (c) 2009 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef __AFR_SELF_HEAL_ALGORITHM_H__ #define __AFR_SELF_HEAL_ALGORITHM_H__ - typedef int (*afr_sh_algo_fn) (call_frame_t *frame, xlator_t *this); @@ -30,31 +20,13 @@ struct afr_sh_algorithm { }; extern struct afr_sh_algorithm afr_self_heal_algorithms[3]; - typedef struct { gf_lock_t lock; unsigned int loops_running; off_t offset; -} afr_sh_algo_full_private_t; - -struct sh_diff_loop_state { - off_t offset; - unsigned char *write_needed; - uint8_t *checksum; - gf_boolean_t active; -}; - -typedef struct { - size_t block_size; - - gf_lock_t lock; - unsigned int loops_running; - off_t offset; int32_t total_blocks; int32_t diff_blocks; - - struct sh_diff_loop_state **loops; -} afr_sh_algo_diff_private_t; +} afr_sh_algo_private_t; #endif /* __AFR_SELF_HEAL_ALGORITHM_H__ */ diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index 83be5ed8c..ef92b4205 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2009 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #include "glusterfs.h" @@ -27,6 +18,67 @@ #include "afr-self-heal.h" #include "pump.h" +#define ADD_FMT_STRING(msg, off, sh_str, status, print_log) \ + do { \ + if (AFR_SELF_HEAL_NOT_ATTEMPTED != status) { \ + off += snprintf (msg + off, sizeof (msg) - off, \ + " "sh_str" self heal %s,", \ + get_sh_completion_status (status));\ + print_log = 1; \ + } \ + } while (0) + +#define ADD_FMT_STRING_SYNC(msg, off, sh_str, status, print_log) \ + do { \ + if (AFR_SELF_HEAL_SYNC_BEGIN == status || \ + AFR_SELF_HEAL_FAILED == status) { \ + off += snprintf (msg + off, sizeof (msg) - off, \ + " "sh_str" self heal %s,", \ + get_sh_completion_status (status));\ + print_log = 1; \ + } \ + } while (0) + + +void +afr_sh_reset (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + memset (sh->child_errno, 0, + sizeof (*sh->child_errno) * priv->child_count); + memset (sh->buf, 0, sizeof (*sh->buf) * priv->child_count); + memset (sh->parentbufs, 0, + sizeof (*sh->parentbufs) * priv->child_count); + memset (sh->success, 0, sizeof (*sh->success) * priv->child_count); + memset (sh->locked_nodes, 0, + sizeof (*sh->locked_nodes) * priv->child_count); + sh->active_sinks = 0; + + afr_reset_xattr (sh->xattr, priv->child_count); +} + +//Intersection[child]=1 if child is part of intersection +void +afr_children_intersection_get (int32_t *set1, int32_t *set2, + int *intersection, unsigned int child_count) +{ + int i = 0; + + memset (intersection, 0, sizeof (*intersection) * child_count); + for (i = 0; i < child_count; i++) { + intersection[i] = afr_is_child_present (set1, child_count, i) + && afr_is_child_present (set2, child_count, + i); + } +} + /** * select_source - select a source and return it */ @@ -34,118 +86,215 @@ int afr_sh_select_source (int sources[], int child_count) { - int i; - for (i = 0; i < child_count; i++) - if (sources[i]) - return i; + int i = 0; + for (i = 0; i < child_count; i++) + if (sources[i]) + return i; - return -1; + return -1; } +void +afr_sh_mark_source_sinks (call_frame_t *frame, xlator_t *this) +{ + int i = 0; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int active_sinks = 0; -/** - * sink_count - return number of sinks in sources array - */ + local = frame->local; + sh = &local->self_heal; + priv = this->private; -int -afr_sh_sink_count (int sources[], int child_count) -{ - int i; - int sinks = 0; - for (i = 0; i < child_count; i++) - if (!sources[i]) - sinks++; - return sinks; + for (i = 0; i < priv->child_count; i++) { + if (sh->sources[i] == 0 && local->child_up[i] == 1) { + active_sinks++; + sh->success[i] = 1; + } else if (sh->sources[i] == 1 && local->child_up[i] == 1) { + sh->success[i] = 1; + } + } + sh->active_sinks = active_sinks; } int afr_sh_source_count (int sources[], int child_count) { - int i; - int nsource = 0; + int i = 0; + int nsource = 0; - for (i = 0; i < child_count; i++) - if (sources[i]) - nsource++; - return nsource; + for (i = 0; i < child_count; i++) + if (sources[i]) + nsource++; + return nsource; } +void +afr_sh_set_error (afr_self_heal_t *sh, int32_t op_errno) +{ + sh->op_ret = -1; + sh->op_errno = afr_most_important_error(sh->op_errno, op_errno, + _gf_false); +} -int -afr_sh_supress_errenous_children (int sources[], int child_errno[], - int child_count) +void +afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this) { - int i = 0; + afr_private_t * priv = this->private; + char *buf = NULL; + char *ptr = NULL; + int i = 0; + int j = 0; - for (i = 0; i < child_count; i++) { - if (child_errno[i] && sources[i]) { - sources[i] = 0; - } - } + /* 10 digits per entry + 1 space + '[' and ']' */ + buf = GF_MALLOC (priv->child_count * 11 + 8, gf_afr_mt_char); - return 0; + for (i = 0; i < priv->child_count; i++) { + ptr = buf; + ptr += sprintf (ptr, "[ "); + for (j = 0; j < priv->child_count; j++) { + ptr += sprintf (ptr, "%d ", pending_matrix[i][j]); + } + sprintf (ptr, "]"); + gf_log (this->name, GF_LOG_DEBUG, "pending_matrix: %s", buf); + } + + GF_FREE (buf); } +char* +afr_get_pending_matrix_str (int32_t *pending_matrix[], xlator_t *this) +{ + afr_private_t * priv = this->private; + char *buf = NULL; + char *ptr = NULL; + int i = 0; + int j = 0; + int child_count = priv->child_count; + char *matrix_begin = "[ [ "; + char *matrix_end = "] ]"; + char *seperator = "] [ "; + int pending_entry_strlen = 12; //Including space after entry + int matrix_begin_strlen = 0; + int matrix_end_strlen = 0; + int seperator_strlen = 0; + int string_length = 0; + char *msg = "- Pending matrix: "; + + /* + * for a list of lists of [ [ a b ] [ c d ] ] + * */ + + matrix_begin_strlen = strlen (matrix_begin); + matrix_end_strlen = strlen (matrix_end); + seperator_strlen = strlen (seperator); + string_length = matrix_begin_strlen + matrix_end_strlen + + (child_count -1) * seperator_strlen + + (child_count * child_count * pending_entry_strlen); + + buf = GF_CALLOC (1, 1 + strlen (msg) + string_length , gf_afr_mt_char); + if (!buf) + goto out; + + ptr = buf; + ptr += sprintf (ptr, "%s", msg); + ptr += sprintf (ptr, "%s", matrix_begin); + for (i = 0; i < priv->child_count; i++) { + for (j = 0; j < priv->child_count; j++) { + ptr += sprintf (ptr, "%d ", pending_matrix[i][j]); + } + if (i < priv->child_count -1) + ptr += sprintf (ptr, "%s", seperator); + } + + ptr += sprintf (ptr, "%s", matrix_end); + +out: + return buf; +} void -afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this) +afr_sh_print_split_brain_log (int32_t *pending_matrix[], xlator_t *this, + const char *loc) { - afr_private_t * priv = this->private; + char *buf = NULL; + char *free_ptr = NULL; - char *buf = NULL; - char *ptr = NULL; + buf = afr_get_pending_matrix_str (pending_matrix, this); + if (buf) + free_ptr = buf; + else + buf = ""; - int i, j; - /* 10 digits per entry + 1 space + '[' and ']' */ - buf = GF_MALLOC (priv->child_count * 11 + 8, gf_afr_mt_char); + gf_log (this->name, GF_LOG_ERROR, "Unable to self-heal contents of '%s'" + " (possible split-brain). Please delete the file from all but " + "the preferred subvolume.%s", loc, buf); + GF_FREE (free_ptr); + return; +} - for (i = 0; i < priv->child_count; i++) { - ptr = buf; - ptr += sprintf (ptr, "[ "); - for (j = 0; j < priv->child_count; j++) { - ptr += sprintf (ptr, "%d ", pending_matrix[i][j]); - } - sprintf (ptr, "]"); - gf_log (this->name, GF_LOG_TRACE, - "pending_matrix: %s", buf); - } - GF_FREE (buf); -} +void +afr_init_pending_matrix (int32_t **pending_matrix, size_t child_count) +{ + int i = 0; + int j = 0; + + GF_ASSERT (pending_matrix); + for (i = 0; i < child_count; i++) { + for (j = 0; j < child_count; j++) { + pending_matrix[i][j] = 0; + } + } +} void -afr_sh_build_pending_matrix (afr_private_t *priv, - int32_t *pending_matrix[], dict_t *xattr[], - int child_count, afr_transaction_type type) +afr_mark_ignorant_subvols_as_pending (int32_t **pending_matrix, + unsigned char *ignorant_subvols, + size_t child_count) { - int i, j, k; + int i = 0; + int j = 0; - /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */ - int32_t pending[3]; - void *pending_raw = NULL; - int ret = -1; + GF_ASSERT (pending_matrix); + GF_ASSERT (ignorant_subvols); - unsigned char *ignorant_subvols = NULL; + for (i = 0; i < child_count; i++) { + if (ignorant_subvols[i]) { + for (j = 0; j < child_count; j++) { + if (!ignorant_subvols[j]) + pending_matrix[j][i] += 1; + } + } + } +} - ignorant_subvols = GF_CALLOC (sizeof (*ignorant_subvols), child_count, - gf_afr_mt_char); +int +afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix, + unsigned char *ignorant_subvols, + dict_t *xattr[], afr_transaction_type type, + size_t child_count) +{ + /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */ + int32_t pending[3] = {0,}; + void *pending_raw = NULL; + int ret = -1; + int i = 0; + int j = 0; + int k = 0; - /* start clean */ - for (i = 0; i < child_count; i++) { - for (j = 0; j < child_count; j++) { - pending_matrix[i][j] = 0; - } - } + afr_init_pending_matrix (pending_matrix, child_count); - for (i = 0; i < child_count; i++) { - pending_raw = NULL; + for (i = 0; i < child_count; i++) { + pending_raw = NULL; for (j = 0; j < child_count; j++) { - ret = dict_get_ptr (xattr[i], priv->pending_key[j], + ret = dict_get_ptr (xattr[i], pending_key[j], &pending_raw); - + if (ret != 0) { /* * There is no xattr present. This means this @@ -153,63 +302,26 @@ afr_sh_build_pending_matrix (afr_private_t *priv, * subvolume. */ - ignorant_subvols[i] = 1; + if (ignorant_subvols) + ignorant_subvols[i] = 1; continue; } - memcpy (pending, pending_raw, sizeof(pending)); + memcpy (pending, pending_raw, sizeof(pending)); k = afr_index_for_transaction_type (type); - - pending_matrix[i][j] = ntoh32 (pending[k]); - } - } - /* - * Make all non-ignorant subvols point towards the ignorant - * subvolumes. - */ - - for (i = 0; i < child_count; i++) { - if (ignorant_subvols[i]) { - for (j = 0; j < child_count; j++) { - if (!ignorant_subvols[j]) - pending_matrix[j][i] += 1; - } + pending_matrix[i][j] = ntoh32 (pending[k]); } } - GF_FREE (ignorant_subvols); + return ret; } - -/** - * mark_sources: Mark all 'source' nodes and return number of source - * nodes found - * - * A node (a row in the pending matrix) belongs to one of - * three categories: - * - * M is the pending matrix. - * - * 'innocent' - M[i] is all zeroes - * 'fool' - M[i] has i'th element = 1 (self-reference) - * 'wise' - M[i] has i'th element = 0, others are 1 or 0. - * - * All 'innocent' nodes are sinks. If all nodes are innocent, no self-heal is - * needed. - * - * A 'wise' node can be a source. If two 'wise' nodes conflict, it is - * a split-brain. If one wise node refers to the other but the other doesn't - * refer back, the referrer is a source. - * - * All fools are sinks, unless there are no 'wise' nodes. In that case, - * one of the fools is made a source. - */ - typedef enum { + AFR_NODE_INVALID, AFR_NODE_INNOCENT, AFR_NODE_FOOL, - AFR_NODE_WISE + AFR_NODE_WISE, } afr_node_type; typedef struct { @@ -250,7 +362,7 @@ afr_sh_is_wise (int32_t *array, int i, int child_count) static int -afr_sh_all_nodes_innocent (afr_node_character *characters, +afr_sh_all_nodes_innocent (afr_node_character *characters, int child_count) { int i = 0; @@ -286,10 +398,10 @@ afr_sh_wise_nodes_exist (afr_node_character *characters, int child_count) /* * The 'wisdom' of a wise node is 0 if any other wise node accuses it. - * It is 1 if no other wise node accuses it. + * It is 1 if no other wise node accuses it. * Only wise nodes with wisdom 1 are sources. * - * If no nodes with wisdom 1 exist, a split-brain has occured. + * If no nodes with wisdom 1 exist, a split-brain has occurred. */ static void @@ -306,7 +418,7 @@ afr_sh_compute_wisdom (int32_t *pending_matrix[], for (j = 0; j < child_count; j++) { if ((characters[j].type == AFR_NODE_WISE) && pending_matrix[j][i]) { - + characters[i].wisdom = 0; } } @@ -316,7 +428,7 @@ afr_sh_compute_wisdom (int32_t *pending_matrix[], static int -afr_sh_wise_nodes_conflict (afr_node_character *characters, +afr_sh_wise_nodes_conflict (afr_node_character *characters, int child_count) { int i = 0; @@ -337,13 +449,12 @@ afr_sh_wise_nodes_conflict (afr_node_character *characters, static int -afr_sh_mark_wisest_as_sources (int sources[], - afr_node_character *characters, +afr_sh_mark_wisest_as_sources (int sources[], + afr_node_character *characters, int child_count) { int nsources = 0; - - int i = 0; + int i = 0; for (i = 0; i < child_count; i++) { if (characters[i].wisdom == 1) { @@ -355,1108 +466,1778 @@ afr_sh_mark_wisest_as_sources (int sources[], return nsources; } - -static int -afr_sh_mark_if_size_differs (afr_self_heal_t *sh, int child_count) +static void +afr_compute_witness_of_fools (int32_t *witnesses, int32_t **pending_matrix, + afr_node_character *characters, + int32_t child_count) { - int32_t ** pending_matrix; - int i, j; + int i = 0; + int j = 0; + int witness = 0; - int size_differs = 0; - - pending_matrix = sh->pending_matrix; + GF_ASSERT (witnesses); + GF_ASSERT (pending_matrix); + GF_ASSERT (characters); + GF_ASSERT (child_count > 0); for (i = 0; i < child_count; i++) { + if (characters[i].type != AFR_NODE_FOOL) + continue; + + witness = 0; for (j = 0; j < child_count; j++) { - if (!sh->buf) - break; + if (i == j) + continue; + witness += pending_matrix[i][j]; + } + witnesses[i] = witness; + } +} - if (SIZE_DIFFERS (&sh->buf[i], &sh->buf[j]) - && (pending_matrix[i][j] == 0) - && (pending_matrix[j][i] == 0)) { +static int32_t +afr_find_biggest_witness_among_fools (int32_t *witnesses, + afr_node_character *characters, + int32_t child_count) +{ + int i = 0; + int biggest_witness = -1; + int biggest_witness_idx = -1; + int biggest_witness_cnt = -1; - pending_matrix[i][j] = 1; - pending_matrix[j][i] = 1; + GF_ASSERT (witnesses); + GF_ASSERT (characters); + GF_ASSERT (child_count > 0); - size_differs = 1; - } - } + for (i = 0; i < child_count; i++) { + if (characters[i].type != AFR_NODE_FOOL) + continue; + + if (biggest_witness < witnesses[i]) { + biggest_witness = witnesses[i]; + biggest_witness_idx = i; + biggest_witness_cnt = 1; + continue; + } + + if (biggest_witness == witnesses[i]) + biggest_witness_cnt++; } - return size_differs; + if (biggest_witness_cnt != 1) + return -1; + + return biggest_witness_idx; } - -static int -afr_sh_mark_biggest_fool_as_source (afr_self_heal_t *sh, - afr_node_character *characters, - int child_count) +int +afr_mark_fool_as_source_by_witness (int32_t *sources, int32_t *witnesses, + afr_node_character *characters, + int32_t child_count, int32_t witness) { - int i = 0; - int biggest = 0; + int i = 0; + int nsources = 0; - for (i = 0; i < child_count; i++) { - if (characters[i].type == AFR_NODE_FOOL) { - biggest = i; - break; - } - } + GF_ASSERT (sources); + GF_ASSERT (witnesses); + GF_ASSERT (characters); + GF_ASSERT (child_count > 0); for (i = 0; i < child_count; i++) { if (characters[i].type != AFR_NODE_FOOL) continue; - if (!sh->buf) - break; - - if (SIZE_GREATER (&sh->buf[i], &sh->buf[biggest])) { - biggest = i; + if (witness == witnesses[i]) { + sources[i] = 1; + nsources++; } } + return nsources; +} + + +int +afr_mark_fool_as_source_by_idx (int32_t *sources, int child_count, int idx) +{ + if (idx >= 0 && idx < child_count) { + sources[idx] = 1; + return 1; + } + return 0; +} + + +static int +afr_find_largest_file_size (struct iatt *bufs, int32_t *success_children, + int child_count) +{ + int idx = -1; + int i = -1; + int child = -1; + uint64_t max_size = 0; + uint64_t min_size = 0; + int num_children = 0; + + for (i = 0; i < child_count; i++) { + if (success_children[i] == -1) + break; + + child = success_children[i]; + if (bufs[child].ia_size > max_size) { + max_size = bufs[child].ia_size; + idx = child; + } + + if ((num_children == 0) || (bufs[child].ia_size < min_size)) { + min_size = bufs[child].ia_size; + } + + num_children++; + } + + /* If sizes are same for all of them, finding sources will have to + * happen with pending changelog. So return -1 + */ + if ((num_children > 1) && (min_size == max_size)) + return -1; + return idx; +} + + +static int +afr_find_newest_file (struct iatt *bufs, int32_t *success_children, + int child_count) +{ + int idx = -1; + int i = -1; + int child = -1; + uint64_t max_ctime = 0; + + for (i = 0; i < child_count; i++) { + if (success_children[i] == -1) + break; - sh->sources[biggest] = 1; + child = success_children[i]; + if (bufs[child].ia_ctime > max_ctime) { + max_ctime = bufs[child].ia_ctime; + idx = child; + } + } - return 1; + return idx; } static int -afr_sh_mark_biggest_as_source (afr_self_heal_t *sh, int child_count) +afr_mark_biggest_of_fools_as_source (int32_t *sources, int32_t **pending_matrix, + afr_node_character *characters, + int32_t *success_children, + int child_count, struct iatt *bufs) +{ + int32_t biggest_witness = 0; + int nsources = 0; + int32_t *witnesses = NULL; + + GF_ASSERT (child_count > 0); + + biggest_witness = afr_find_largest_file_size (bufs, success_children, + child_count); + if (biggest_witness != -1) + goto found; + + witnesses = GF_CALLOC (child_count, sizeof (*witnesses), + gf_afr_mt_int32_t); + if (NULL == witnesses) { + nsources = -1; + goto out; + } + + afr_compute_witness_of_fools (witnesses, pending_matrix, characters, + child_count); + biggest_witness = afr_find_biggest_witness_among_fools (witnesses, + characters, + child_count); + if (biggest_witness != -1) + goto found; + + biggest_witness = afr_find_newest_file (bufs, success_children, + child_count); + +found: + nsources = afr_mark_fool_as_source_by_idx (sources, child_count, + biggest_witness); +out: + GF_FREE (witnesses); + return nsources; +} + +int +afr_mark_child_as_source_by_uid (int32_t *sources, struct iatt *bufs, + int32_t *success_children, + unsigned int child_count, uint32_t uid) { - int biggest = 0; - int i; + int i = 0; + int nsources = 0; + int child = 0; for (i = 0; i < child_count; i++) { - if (!sh->buf) + if (-1 == success_children[i]) break; - if (SIZE_GREATER (&sh->buf[i], &sh->buf[biggest])) { - biggest = i; + child = success_children[i]; + if (uid == bufs[child].ia_uid) { + sources[child] = 1; + nsources++; } } + return nsources; +} - sh->sources[biggest] = 1; +int +afr_get_child_with_lowest_uid (struct iatt *bufs, int32_t *success_children, + unsigned int child_count) +{ + int i = 0; + int smallest = -1; + int child = 0; - return 1; + for (i = 0; i < child_count; i++) { + if (-1 == success_children[i]) + break; + child = success_children[i]; + if ((smallest == -1) || + (bufs[child].ia_uid < bufs[smallest].ia_uid)) { + smallest = child; + } + } + return smallest; } - static int -afr_sh_mark_loweia_uid_as_source (afr_self_heal_t *sh, int child_count) +afr_sh_mark_lowest_uid_as_source (struct iatt *bufs, int32_t *success_children, + int child_count, int32_t *sources) { - uid_t smallest = 0; - int i; + int nsources = 0; + int smallest = 0; - for (i = 0; i < child_count; i++) { - if (!sh->buf) + smallest = afr_get_child_with_lowest_uid (bufs, success_children, + child_count); + if (smallest < 0) { + nsources = -1; + goto out; + } + nsources = afr_mark_child_as_source_by_uid (sources, bufs, + success_children, child_count, + bufs[smallest].ia_uid); +out: + return nsources; +} + +int +afr_get_no_xattr_dir_read_child (xlator_t *this, int32_t *success_children, + struct iatt *bufs) +{ + afr_private_t *priv = NULL; + int i = 0; + int child = -1; + int read_child = -1; + + priv = this->private; + for (i = 0; i < priv->child_count; i++) { + child = success_children[i]; + if (child < 0) break; + if (read_child < 0) + read_child = child; + else if (bufs[read_child].ia_size < bufs[child].ia_size) + read_child = child; + } + return read_child; +} - if (sh->buf[i].ia_uid < sh->buf[smallest].ia_uid) { - smallest = i; +int +afr_sh_mark_zero_size_file_as_sink (struct iatt *bufs, int32_t *success_children, + int child_count, int32_t *sources) +{ + int nsources = 0; + int i = 0; + int child = 0; + gf_boolean_t sink_exists = _gf_false; + gf_boolean_t source_exists = _gf_false; + int source = -1; + + for (i = 0; i < child_count; i++) { + child = success_children[i]; + if (child < 0) + break; + if (!bufs[child].ia_size) { + sink_exists = _gf_true; + continue; + } + if (!source_exists) { + source_exists = _gf_true; + source = child; + continue; + } + if (bufs[source].ia_size != bufs[child].ia_size) { + nsources = -1; + goto out; } } + if (!source_exists && !sink_exists) { + nsources = -1; + goto out; + } + + if (!source_exists || !sink_exists) + goto out; - sh->sources[smallest] = 1; + for (i = 0; i < child_count; i++) { + child = success_children[i]; + if (child < 0) + break; + if (bufs[child].ia_size) { + sources[child] = 1; + nsources++; + } + } +out: + return nsources; +} - return 1; +char * +afr_get_character_str (afr_node_type type) +{ + char *character = NULL; + + switch (type) { + case AFR_NODE_INNOCENT: + character = "innocent"; + break; + case AFR_NODE_FOOL: + character = "fool"; + break; + case AFR_NODE_WISE: + character = "wise"; + break; + default: + character = "invalid"; + break; + } + return character; } +afr_node_type +afr_find_child_character_type (int32_t *pending_row, int32_t child, + unsigned int child_count) +{ + afr_node_type type = AFR_NODE_INVALID; + + GF_ASSERT ((child >= 0) && (child < child_count)); + + if (afr_sh_is_innocent (pending_row, child_count)) + type = AFR_NODE_INNOCENT; + else if (afr_sh_is_fool (pending_row, child, child_count)) + type = AFR_NODE_FOOL; + else if (afr_sh_is_wise (pending_row, child, child_count)) + type = AFR_NODE_WISE; + return type; +} int -afr_sh_mark_sources (afr_self_heal_t *sh, int child_count, - afr_self_heal_type type) +afr_build_sources (xlator_t *this, dict_t **xattr, struct iatt *bufs, + int32_t **pending_matrix, int32_t *sources, + int32_t *success_children, afr_transaction_type type, + int32_t *subvol_status, gf_boolean_t ignore_ignorant) { - int i = 0; + afr_private_t *priv = NULL; + afr_self_heal_type sh_type = AFR_SELF_HEAL_INVALID; + int nsources = -1; + unsigned char *ignorant_subvols = NULL; + unsigned int child_count = 0; - int32_t ** pending_matrix; - int * sources; + priv = this->private; + child_count = priv->child_count; - int size_differs = 0; + if (afr_get_children_count (success_children, priv->child_count) == 0) + goto out; - pending_matrix = sh->pending_matrix; - sources = sh->sources; + if (!ignore_ignorant) { + ignorant_subvols = GF_CALLOC (sizeof (*ignorant_subvols), + child_count, gf_afr_mt_char); + if (NULL == ignorant_subvols) + goto out; + } - int nsources = 0; + afr_build_pending_matrix (priv->pending_key, pending_matrix, + ignorant_subvols, xattr, type, + priv->child_count); - /* stores the 'characters' (innocent, fool, wise) of the nodes */ - afr_node_character * - characters = GF_CALLOC (sizeof (afr_node_character), - child_count, - gf_afr_mt_afr_node_character) ; + if (!ignore_ignorant) + afr_mark_ignorant_subvols_as_pending (pending_matrix, + ignorant_subvols, + priv->child_count); + sh_type = afr_self_heal_type_for_transaction (type); + if (AFR_SELF_HEAL_INVALID == sh_type) + goto out; - /* start clean */ - for (i = 0; i < child_count; i++) { - sources[i] = 0; - } - - for (i = 0; i < child_count; i++) { - if (afr_sh_is_innocent (pending_matrix[i], child_count)) { - characters[i].type = AFR_NODE_INNOCENT; + afr_sh_print_pending_matrix (pending_matrix, this); - } else if (afr_sh_is_fool (pending_matrix[i], i, child_count)) { - characters[i].type = AFR_NODE_FOOL; + nsources = afr_mark_sources (this, sources, pending_matrix, bufs, + sh_type, success_children, subvol_status); +out: + GF_FREE (ignorant_subvols); + return nsources; +} - } else if (afr_sh_is_wise (pending_matrix[i], i, child_count)) { - characters[i].type = AFR_NODE_WISE; +void +afr_find_character_types (afr_node_character *characters, + int32_t **pending_matrix, int32_t *success_children, + unsigned int child_count) +{ + afr_node_type type = AFR_NODE_INVALID; + int child = 0; + int i = 0; - } else { - gf_log ("[module:replicate]", GF_LOG_ERROR, - "Could not determine the state of subvolume %d!" - " (This message should never appear." - " Please file a bug report to " - "<gluster-devel@nongnu.org>.)", i); - } + for (i = 0; i < child_count; i++) { + child = success_children[i]; + if (child == -1) + break; + type = afr_find_child_character_type (pending_matrix[child], + child, child_count); + characters[child].type = type; } +} - if (type == AFR_SELF_HEAL_DATA) { - size_differs = afr_sh_mark_if_size_differs (sh, child_count); +void +afr_mark_success_children_sources (int32_t *sources, int32_t *success_children, + unsigned int child_count) +{ + int i = 0; + for (i = 0; i < child_count; i++) { + if (success_children[i] == -1) + break; + sources[success_children[i]] = 1; } +} +/** + * mark_sources: Mark all 'source' nodes and return number of source + * nodes found + * + * A node (a row in the pending matrix) belongs to one of + * three categories: + * + * M is the pending matrix. + * + * 'innocent' - M[i] is all zeroes + * 'fool' - M[i] has i'th element = 1 (self-reference) + * 'wise' - M[i] has i'th element = 0, others are 1 or 0. + * + * All 'innocent' nodes are sinks. If all nodes are innocent, no self-heal is + * needed. + * + * A 'wise' node can be a source. If two 'wise' nodes conflict, it is + * a split-brain. If one wise node refers to the other but the other doesn't + * refer back, the referrer is a source. + * + * All fools are sinks, unless there are no 'wise' nodes. In that case, + * one of the fools is made a source. + */ - if ((type == AFR_SELF_HEAL_METADATA) - && afr_sh_all_nodes_innocent (characters, child_count)) { +int +afr_mark_sources (xlator_t *this, int32_t *sources, int32_t **pending_matrix, + struct iatt *bufs, afr_self_heal_type type, + int32_t *success_children, int32_t *subvol_status) +{ + /* stores the 'characters' (innocent, fool, wise) of the nodes */ + afr_node_character *characters = NULL; + int nsources = -1; + unsigned int child_count = 0; + afr_private_t *priv = NULL; - nsources = afr_sh_mark_loweia_uid_as_source (sh, child_count); + priv = this->private; + child_count = priv->child_count; + characters = GF_CALLOC (sizeof (afr_node_character), + child_count, gf_afr_mt_afr_node_character); + if (!characters) goto out; - } + this = THIS; + + /* start clean */ + memset (sources, 0, sizeof (*sources) * child_count); + nsources = 0; + afr_find_character_types (characters, pending_matrix, success_children, + child_count); if (afr_sh_all_nodes_innocent (characters, child_count)) { - if (size_differs) { - nsources = afr_sh_mark_biggest_as_source (sh, - child_count); + switch (type) { + case AFR_SELF_HEAL_METADATA: + nsources = afr_sh_mark_lowest_uid_as_source (bufs, + success_children, + child_count, + sources); + break; + case AFR_SELF_HEAL_DATA: + nsources = afr_sh_mark_zero_size_file_as_sink (bufs, + success_children, + child_count, + sources); + if ((nsources < 0) && subvol_status) + *subvol_status |= SPLIT_BRAIN; + break; + default: + break; } + goto out; + } - } else if (afr_sh_wise_nodes_exist (characters, child_count)) { + if (afr_sh_wise_nodes_exist (characters, child_count)) { afr_sh_compute_wisdom (pending_matrix, characters, child_count); if (afr_sh_wise_nodes_conflict (characters, child_count)) { - /* split-brain */ - + if (subvol_status) + *subvol_status |= SPLIT_BRAIN; nsources = -1; - goto out; - } else { - nsources = afr_sh_mark_wisest_as_sources (sources, + nsources = afr_sh_mark_wisest_as_sources (sources, characters, child_count); } } else { - nsources = afr_sh_mark_biggest_fool_as_source (sh, characters, - child_count); + if (subvol_status) + *subvol_status |= ALL_FOOLS; + nsources = afr_mark_biggest_of_fools_as_source (sources, + pending_matrix, + characters, + success_children, + child_count, bufs); } out: + if (nsources == 0) + afr_mark_success_children_sources (sources, success_children, + child_count); GF_FREE (characters); - return nsources; + gf_log (this->name, GF_LOG_DEBUG, "Number of sources: %d", nsources); + return nsources; } - void afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr, - int32_t *delta_matrix[], int success[], + int32_t *delta_matrix[], unsigned char success[], int child_count, afr_transaction_type type) { - int i = 0; - int j = 0; - int k = 0; - - /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */ - int32_t pending[3]; - void * pending_raw = NULL; - int ret = 0; - - /* start clean */ - for (i = 0; i < child_count; i++) { - for (j = 0; j < child_count; j++) { - delta_matrix[i][j] = 0; - } - } - - for (i = 0; i < child_count; i++) { - pending_raw = NULL; - - for (j = 0; j < child_count; j++) { - ret = dict_get_ptr (xattr[i], priv->pending_key[j], - &pending_raw); - if (ret < 0) - gf_log ("afr_sh_pending_to_delta", - GF_LOG_WARNING, - "Unable to get dict value."); + int tgt = 0; + int src = 0; + int value = 0; - if (!success[j]) - continue; - - k = afr_index_for_transaction_type (type); + afr_build_pending_matrix (priv->pending_key, delta_matrix, NULL, + xattr, type, priv->child_count); - if (pending_raw) { - memcpy (pending, pending_raw, sizeof(pending)); - delta_matrix[i][j] = -(ntoh32 (pending[k])); - } else { - delta_matrix[i][j] = 0; + /* + * The algorithm here has two parts. First, for each subvol indexed + * as tgt, we try to figure out what count everyone should have for it. + * If the self-heal succeeded, that's easy; the value is zero. + * Otherwise, the value is the maximum of the succeeding nodes' counts. + * Once we know the value, we loop through (possibly for a second time) + * setting each count to the difference so that when we're done all + * succeeding nodes will have the same count for tgt. + */ + for (tgt = 0; tgt < priv->child_count; ++tgt) { + value = 0; + if (!success[tgt]) { + /* Find the maximum. */ + for (src = 0; src < priv->child_count; ++src) { + if (!success[src]) { + continue; + } + if (delta_matrix[src][tgt] > value) { + value = delta_matrix[src][tgt]; + } } - } - } + /* Force everyone who succeeded to the chosen value. */ + for (src = 0; src < priv->child_count; ++src) { + if (success[src]) { + delta_matrix[src][tgt] = value + - delta_matrix[src][tgt]; + } + else { + delta_matrix[src][tgt] = 0; + } + } + } } int -afr_sh_delta_to_xattr (afr_private_t *priv, +afr_sh_delta_to_xattr (xlator_t *this, int32_t *delta_matrix[], dict_t *xattr[], - int child_count, afr_transaction_type type) + int child_count, afr_transaction_type type) { - int i = 0; - int j = 0; - int k = 0; - - int ret = 0; + int i = 0; + int j = 0; + int k = 0; + int ret = 0; + int32_t *pending = NULL; + int32_t *local_pending = NULL; + afr_private_t *priv = NULL; - int32_t *pending = 0; - - for (i = 0; i < child_count; i++) { - if (!xattr[i]) - continue; + priv = this->private; + for (i = 0; i < child_count; i++) { + if (!xattr[i]) + continue; - for (j = 0; j < child_count; j++) { + local_pending = NULL; + for (j = 0; j < child_count; j++) { pending = GF_CALLOC (sizeof (int32_t), 3, gf_afr_mt_int32_t); + + if (!pending) { + gf_log (this->name, GF_LOG_ERROR, + "failed to allocate pending entry " + "for %s[%d] on %s", + priv->pending_key[j], type, + priv->children[i]->name); + continue; + } /* 3 = data+metadata+entry */ k = afr_index_for_transaction_type (type); - pending[k] = hton32 (delta_matrix[i][j]); + pending[k] = hton32 (delta_matrix[i][j]); + if (j == i) { + local_pending = pending; + continue; + } ret = dict_set_bin (xattr[i], priv->pending_key[j], pending, - 3 * sizeof (int32_t)); - if (ret < 0) - gf_log ("afr_sh_delta_to_xattr", - GF_LOG_WARNING, + AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, "Unable to set dict value."); - } - } - return 0; + GF_FREE (pending); + } + } + if (local_pending) { + ret = dict_set_bin (xattr[i], priv->pending_key[i], + local_pending, + AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "Unable to set dict value."); + GF_FREE (local_pending); + } + } + } + return 0; } int -afr_sh_has_metadata_pending (dict_t *xattr, int child_count, xlator_t *this) +afr_sh_missing_entries_done (call_frame_t *frame, xlator_t *this) { - afr_private_t *priv = NULL; - /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */ - int32_t pending[3]; - void *pending_raw = NULL; - - int ret = -1; - int i = 0; - int j = 0; - - priv = this->private; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; - for (i = 0; i < priv->child_count; i++) { - ret = dict_get_ptr (xattr, priv->pending_key[i], - &pending_raw); + local = frame->local; + sh = &local->self_heal; - if (ret != 0) - return 0; + afr_sh_reset (frame, this); - memcpy (pending, pending_raw, sizeof(pending)); - j = afr_index_for_transaction_type (AFR_METADATA_TRANSACTION); + if (local->unhealable) { + gf_log (this->name, GF_LOG_DEBUG, + "split brain found, aborting selfheal of %s", + local->loc.path); + } - if (pending[j]) - return 1; + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { + sh->completion_cbk (frame, this); + } else { + gf_log (this->name, GF_LOG_TRACE, + "proceeding to metadata check on %s", + local->loc.path); + afr_self_heal_metadata (frame, this); } - return 0; + return 0; } -int -afr_sh_has_data_pending (dict_t *xattr, int child_count, xlator_t *this) +static int +afr_sh_missing_entries_finish (call_frame_t *frame, xlator_t *this) { - afr_private_t *priv = NULL; - /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */ - int32_t pending[3]; - void *pending_raw = NULL; - - int ret = -1; - int i = 0; - int j = 0; - - priv = this->private; - - for (i = 0; i < priv->child_count; i++) { - ret = dict_get_ptr (xattr, priv->pending_key[i], - &pending_raw); + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; - if (ret != 0) - return 0; - - memcpy (pending, pending_raw, sizeof(pending)); - j = afr_index_for_transaction_type (AFR_DATA_TRANSACTION); + local = frame->local; + int_lock = &local->internal_lock; - if (pending[j]) - return 1; - } + int_lock->lock_cbk = afr_sh_missing_entries_done; + afr_unlock (frame, this); - return 0; + return 0; } - int -afr_sh_has_entry_pending (dict_t *xattr, int child_count, xlator_t *this) +afr_sh_common_create (afr_self_heal_t *sh, unsigned int child_count) { - afr_private_t *priv = NULL; - /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */ - int32_t pending[3]; - void *pending_raw = NULL; - - int ret = -1; - int i = 0; - int j = 0; - - priv = this->private; - - for (i = 0; i < priv->child_count; i++) { - ret = dict_get_ptr (xattr, priv->pending_key[i], - &pending_raw); + int ret = -ENOMEM; + sh->buf = GF_CALLOC (child_count, sizeof (*sh->buf), + gf_afr_mt_iatt); + if (!sh->buf) + goto out; + sh->parentbufs = GF_CALLOC (child_count, sizeof (*sh->parentbufs), + gf_afr_mt_iatt); + if (!sh->parentbufs) + goto out; + sh->child_errno = GF_CALLOC (child_count, sizeof (*sh->child_errno), + gf_afr_mt_int); + if (!sh->child_errno) + goto out; + sh->success_children = afr_children_create (child_count); + if (!sh->success_children) + goto out; + sh->fresh_children = afr_children_create (child_count); + if (!sh->fresh_children) + goto out; + sh->xattr = GF_CALLOC (child_count, sizeof (*sh->xattr), + gf_afr_mt_dict_t); + if (!sh->xattr) + goto out; + ret = 0; +out: + return ret; +} - if (ret != 0) - return 0; - - memcpy (pending, pending_raw, sizeof(pending)); - j = afr_index_for_transaction_type (AFR_ENTRY_TRANSACTION); +void +afr_sh_common_lookup_resp_handler (call_frame_t *frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct iatt *buf, + dict_t *xattr, struct iatt *postparent, + loc_t *loc) +{ + int child_index = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_self_heal_t *sh = NULL; - if (pending[j]) - return 1; + local = frame->local; + priv = this->private; + sh = &local->self_heal; + child_index = (long) cookie; + + LOCK (&frame->lock); + { + if (op_ret == 0) { + sh->buf[child_index] = *buf; + sh->parentbufs[child_index] = *postparent; + sh->success_children[sh->success_count] = child_index; + sh->success_count++; + sh->xattr[child_index] = dict_ref (xattr); + } else { + gf_log (this->name, GF_LOG_DEBUG, "path %s on subvolume" + " %s => -1 (%s)", loc->path, + priv->children[child_index]->name, + strerror (op_errno)); + local->self_heal.child_errno[child_index] = op_errno; + } } - - return 0; + UNLOCK (&frame->lock); + return; } - -/** - * is_matrix_zero - return true if pending matrix is all zeroes - */ - -int -afr_sh_is_matrix_zero (int32_t *pending_matrix[], int child_count) +gf_boolean_t +afr_valid_ia_type (ia_type_t ia_type) { - int i, j; - - for (i = 0; i < child_count; i++) - for (j = 0; j < child_count; j++) - if (pending_matrix[i][j]) - return 0; - return 1; + switch (ia_type) { + case IA_IFSOCK: + case IA_IFREG: + case IA_IFBLK: + case IA_IFCHR: + case IA_IFIFO: + case IA_IFLNK: + case IA_IFDIR: + return _gf_true; + default: + return _gf_false; + } + return _gf_false; } - int -afr_sh_missing_entries_done (call_frame_t *frame, xlator_t *this) +afr_impunge_frame_create (call_frame_t *frame, xlator_t *this, + int active_source, call_frame_t **impunge_frame) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int i = 0; + afr_local_t *local = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + int32_t op_errno = 0; + afr_private_t *priv = NULL; + int ret = 0; + call_frame_t *new_frame = NULL; + + op_errno = ENOMEM; + priv = this->private; + new_frame = copy_frame (frame); + if (!new_frame) { + goto out; + } + + AFR_LOCAL_ALLOC_OR_GOTO (impunge_local, out); - local = frame->local; - sh = &local->self_heal; - priv = this->private; + local = frame->local; + new_frame->local = impunge_local; + impunge_sh = &impunge_local->self_heal; + impunge_sh->sh_frame = frame; + impunge_sh->active_source = active_source; + impunge_local->child_up = memdup (local->child_up, + sizeof (*local->child_up) * + priv->child_count); + if (!impunge_local->child_up) + goto out; -// memset (sh->child_errno, 0, sizeof (int) * priv->child_count); - memset (sh->buf, 0, sizeof (struct iatt) * priv->child_count); + impunge_local->pending = afr_matrix_create (priv->child_count, + AFR_NUM_CHANGE_LOGS); + if (!impunge_local->pending) + goto out; - for (i = 0; i < priv->child_count; i++) { - sh->locked_nodes[i] = 0; + ret = afr_sh_common_create (impunge_sh, priv->child_count); + if (ret) { + op_errno = -ret; + goto out; } + op_errno = 0; + *impunge_frame = new_frame; +out: + if (op_errno && new_frame) + AFR_STACK_DESTROY (new_frame); + return -op_errno; +} - for (i = 0; i < priv->child_count; i++) { - if (sh->xattr[i]) - dict_unref (sh->xattr[i]); - sh->xattr[i] = NULL; - } +void +afr_sh_missing_entry_call_impunge_recreate (call_frame_t *frame, xlator_t *this, + struct iatt *buf, + struct iatt *postparent, + afr_impunge_done_cbk_t impunge_done) +{ + call_frame_t *impunge_frame = NULL; + afr_local_t *local = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *sh = NULL; + afr_self_heal_t *impunge_sh = NULL; + int ret = 0; + unsigned int enoent_count = 0; + afr_private_t *priv = NULL; + int i = 0; + int32_t op_errno = 0; - if (local->govinda_gOvinda) { - gf_log (this->name, GF_LOG_TRACE, - "aborting selfheal of %s", - local->loc.path); - sh->completion_cbk (frame, this); - } else { - gf_log (this->name, GF_LOG_TRACE, - "proceeding to metadata check on %s", - local->loc.path); - afr_self_heal_metadata (frame, this); - } + local = frame->local; + sh = &local->self_heal; + priv = this->private; - return 0; + enoent_count = afr_errno_count (NULL, sh->child_errno, + priv->child_count, ENOENT); + if (!enoent_count) { + gf_log (this->name, GF_LOG_INFO, + "no missing files - %s. proceeding to metadata check", + local->loc.path); + goto out; + } + sh->impunge_done = impunge_done; + ret = afr_impunge_frame_create (frame, this, sh->source, &impunge_frame); + if (ret) + goto out; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + loc_copy (&impunge_local->loc, &local->loc); + ret = afr_build_parent_loc (&impunge_sh->parent_loc, + &impunge_local->loc, &op_errno); + if (ret) { + ret = -op_errno; + goto out; + } + impunge_local->call_count = enoent_count; + impunge_sh->entrybuf = sh->buf[sh->source]; + impunge_sh->parentbuf = sh->parentbufs[sh->source]; + for (i = 0; i < priv->child_count; i++) { + if (!impunge_local->child_up[i]) { + impunge_sh->child_errno[i] = ENOTCONN; + continue; + } + if (sh->child_errno[i] != ENOENT) { + impunge_sh->child_errno[i] = EEXIST; + continue; + } + } + for (i = 0; i < priv->child_count; i++) { + if (sh->child_errno[i] != ENOENT) + continue; + afr_sh_entry_impunge_create (impunge_frame, this, i); + enoent_count--; + } + GF_ASSERT (!enoent_count); + return; +out: + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "impunge of %s failed, " + "reason: %s", local->loc.path, strerror (-ret)); + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + } + afr_sh_missing_entries_finish (frame, this); } - -static int -sh_missing_entries_finish (call_frame_t *frame, xlator_t *this) +int +afr_sh_create_entry_cbk (call_frame_t *frame, xlator_t *this, + int32_t op_ret, int32_t op_errno) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - - int_lock->lock_cbk = afr_sh_missing_entries_done; - afr_unlock (frame, this); + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; - return 0; + local = frame->local; + sh = &local->self_heal; + if (op_ret < 0) + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + afr_sh_missing_entries_finish (frame, this); + return 0; } - static int -sh_destroy_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int op_errno, - struct iatt *preop, struct iatt *postop) +sh_missing_entries_create (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - - loc_t *parent_loc = cookie; - - int call_count = 0; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int type = 0; + struct iatt *buf = NULL; + struct iatt *postparent = NULL; local = frame->local; + sh = &local->self_heal; - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "setattr on %s failed: %s", - local->loc.path, strerror (op_errno)); - } + buf = &sh->buf[sh->source]; + postparent = &sh->parentbufs[sh->source]; - if (parent_loc) { - loc_wipe (parent_loc); - GF_FREE (parent_loc); + type = buf->ia_type; + if (!afr_valid_ia_type (type)) { + gf_log (this->name, GF_LOG_ERROR, + "%s: unknown file type: 0%o", local->loc.path, type); + afr_set_local_for_unhealable (local); + afr_sh_missing_entries_finish (frame, this); + goto out; } - call_count = afr_frame_return (frame); - - if (call_count == 0) { - STACK_DESTROY (frame->root); - } - - return 0; + afr_sh_missing_entry_call_impunge_recreate (frame, this, + buf, postparent, + afr_sh_create_entry_cbk); +out: + return 0; } +void +afr_sh_missing_entries_lookup_done (call_frame_t *frame, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + ia_type_t ia_type = IA_INVAL; + int32_t nsources = 0; + loc_t *loc = NULL; + int32_t subvol_status = 0; + afr_transaction_type txn_type = AFR_DATA_TRANSACTION; + gf_boolean_t split_brain = _gf_false; + int read_child = -1; -static int -sh_missing_entries_newentry_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - call_frame_t *setattr_frame = NULL; - int call_count = 0; - int child_index = 0; - - loc_t *parent_loc = NULL; - - struct iatt stbuf; - int32_t valid; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - child_index = (long) cookie; - - stbuf.ia_atime = sh->buf[sh->source].ia_atime; - stbuf.ia_atime_nsec = sh->buf[sh->source].ia_atime_nsec; - stbuf.ia_mtime = sh->buf[sh->source].ia_mtime; - stbuf.ia_mtime_nsec = sh->buf[sh->source].ia_mtime_nsec; - - stbuf.ia_uid = sh->buf[sh->source].ia_uid; - stbuf.ia_gid = sh->buf[sh->source].ia_gid; - - valid = GF_SET_ATTR_UID | GF_SET_ATTR_GID | - GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; - - if (op_ret == 0) { - setattr_frame = copy_frame (frame); - - setattr_frame->local = GF_CALLOC (1, sizeof (afr_local_t), - gf_afr_mt_afr_local_t); - - ((afr_local_t *)setattr_frame->local)->call_count = 2; - - gf_log (this->name, GF_LOG_TRACE, - "setattr (%s) on subvolume %s", - local->loc.path, priv->children[child_index]->name); - - STACK_WIND_COOKIE (setattr_frame, sh_destroy_cbk, - (void *) (long) 0, - priv->children[child_index], - priv->children[child_index]->fops->setattr, - &local->loc, &stbuf, valid); - - valid = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; - parent_loc = GF_CALLOC (1, sizeof (*parent_loc), - gf_afr_mt_loc_t); - afr_build_parent_loc (parent_loc, &local->loc); - - STACK_WIND_COOKIE (setattr_frame, sh_destroy_cbk, - (void *) (long) parent_loc, - priv->children[child_index], - priv->children[child_index]->fops->setattr, - parent_loc, &sh->parentbuf, valid); - } + local = frame->local; + sh = &local->self_heal; + priv = this->private; + loc = &local->loc; - call_count = afr_frame_return (frame); + if (op_ret < 0) { + if (op_errno == EIO) { + afr_set_local_for_unhealable (local); + } + // EIO can happen if finding the fresh parent dir failed + goto out; + } - if (call_count == 0) { - sh_missing_entries_finish (frame, this); - } + //now No chance for the ia_type to conflict + ia_type = sh->buf[sh->success_children[0]].ia_type; + txn_type = afr_transaction_type_get (ia_type); + nsources = afr_build_sources (this, sh->xattr, sh->buf, + sh->pending_matrix, sh->sources, + sh->success_children, txn_type, + &subvol_status, _gf_false); + if (nsources < 0) { + gf_log (this->name, GF_LOG_INFO, "No sources for dir of %s," + " in missing entry self-heal, continuing with the rest" + " of the self-heals", local->loc.path); + if (subvol_status & SPLIT_BRAIN) { + split_brain = _gf_true; + switch (txn_type) { + case AFR_DATA_TRANSACTION: + nsources = 1; + sh->sources[sh->success_children[0]] = 1; + break; + case AFR_ENTRY_TRANSACTION: + read_child = afr_get_no_xattr_dir_read_child + (this, + sh->success_children, + sh->buf); + sh->sources[read_child] = 1; + nsources = 1; + break; + default: + op_errno = EIO; + goto out; + } + } else { + op_errno = EIO; + goto out; + } + } - return 0; -} + afr_get_fresh_children (sh->success_children, sh->sources, + sh->fresh_children, priv->child_count); + sh->source = sh->fresh_children[0]; + if (sh->source == -1) { + gf_log (this->name, GF_LOG_DEBUG, "No active sources found."); + op_errno = EIO; + goto out; + } + if (sh->gfid_sh_success_cbk) + sh->gfid_sh_success_cbk (frame, this); + sh->type = sh->buf[sh->source].ia_type; + if (uuid_is_null (loc->inode->gfid)) + uuid_copy (loc->gfid, sh->buf[sh->source].ia_gfid); + if (split_brain) { + afr_sh_missing_entries_finish (frame, this); + } else { + sh_missing_entries_create (frame, this); + } + return; +out: + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + afr_sh_set_error (sh, op_errno); + afr_sh_missing_entries_finish (frame, this); + return; +} static int -sh_missing_entries_mknod (call_frame_t *frame, xlator_t *this) +afr_sh_common_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xattr, + struct iatt *postparent) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int i = 0; - int ret = 0; - int enoent_count = 0; - int call_count = 0; - mode_t st_mode = 0; - dev_t ia_dev = 0; - dict_t *dict = NULL; + int call_count = 0; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; - local = frame->local; - sh = &local->self_heal; - priv = this->private; + local = frame->local; + sh = &local->self_heal; + priv = this->private; - for (i = 0; i < priv->child_count; i++) - if (sh->child_errno[i] == ENOENT) - enoent_count++; + afr_sh_common_lookup_resp_handler (frame, cookie, this, op_ret, + op_errno, inode, buf, xattr, + postparent, &sh->lookup_loc); + call_count = afr_frame_return (frame); - call_count = enoent_count; - local->call_count = call_count; + if (call_count) + goto out; + op_ret = -1; + if (!sh->success_count) { + op_errno = afr_resultant_errno_get (NULL, sh->child_errno, + priv->child_count); + gf_log (this->name, GF_LOG_ERROR, "Failed to lookup %s, " + "reason %s", sh->lookup_loc.path, + strerror (op_errno)); + goto done; + } - st_mode = st_mode_from_ia (sh->buf[sh->source].ia_prot, - sh->buf[sh->source].ia_type); - ia_dev = sh->buf[sh->source].ia_dev; + if ((sh->lookup_flags & AFR_LOOKUP_FAIL_CONFLICTS) && + (afr_conflicting_iattrs (sh->buf, sh->success_children, + priv->child_count, + sh->lookup_loc.path, this->name))) { + op_errno = EIO; + gf_log (this->name, GF_LOG_ERROR, "Conflicting entries " + "for %s", sh->lookup_loc.path); + goto done; + } - gf_log (this->name, GF_LOG_TRACE, - "mknod %s mode 0%o on %d subvolumes", - local->loc.path, st_mode, enoent_count); + if ((sh->lookup_flags & AFR_LOOKUP_FAIL_MISSING_GFIDS) && + (afr_gfid_missing_count (this->name, sh->success_children, + sh->buf, priv->child_count, + sh->lookup_loc.path))) { + op_errno = ENODATA; + gf_log (this->name, GF_LOG_ERROR, "Missing Gfids " + "for %s", sh->lookup_loc.path); + goto done; + } + op_ret = 0; - dict = dict_new (); - if (!dict) - gf_log (this->name, GF_LOG_ERROR, "out of memory"); +done: + sh->lookup_done (frame, this, op_ret, op_errno); +out: + return 0; +} - ret = afr_set_dict_gfid (dict, sh->buf[sh->source].ia_gfid); - if (ret) - gf_log (this->name, GF_LOG_DEBUG, "gfid set failed"); - - for (i = 0; i < priv->child_count; i++) { - if (sh->child_errno[i] == ENOENT) { - STACK_WIND_COOKIE (frame, - sh_missing_entries_newentry_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->mknod, - &local->loc, st_mode, ia_dev, dict); - if (!--call_count) - break; - } - } +int +afr_sh_remove_entry_cbk (call_frame_t *frame, xlator_t *this, int child, + int32_t op_ret, int32_t op_errno) +{ + int call_count = 0; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; - if (dict) - dict_unref (dict); + local = frame->local; + sh = &local->self_heal; - return 0; + GF_ASSERT (sh->post_remove_call); + if ((op_ret == -1) && (op_errno != ENOENT)) { + gf_log (this->name, GF_LOG_ERROR, + "purge entry %s failed, on child %d reason, %s", + local->loc.path, child, strerror (op_errno)); + LOCK (&frame->lock); + { + afr_sh_set_error (sh, EIO); + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + } + UNLOCK (&frame->lock); + } + call_count = afr_frame_return (frame); + if (call_count == 0) + sh->post_remove_call (frame, this); + return 0; } - -static int -sh_missing_entries_mkdir (call_frame_t *frame, xlator_t *this) +void +afr_sh_call_entry_expunge_remove (call_frame_t *frame, xlator_t *this, + int child_index, struct iatt *buf, + struct iatt *parentbuf, + afr_expunge_done_cbk_t expunge_done) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - dict_t *dict = NULL; - int i = 0; - int ret = 0; - int enoent_count = 0; - int call_count = 0; - mode_t st_mode = 0; - + call_frame_t *expunge_frame = NULL; + afr_local_t *local = NULL; + afr_local_t *expunge_local = NULL; + afr_self_heal_t *sh = NULL; + afr_self_heal_t *expunge_sh = NULL; + int32_t op_errno = 0; + int ret = 0; - local = frame->local; - sh = &local->self_heal; - priv = this->private; + expunge_frame = copy_frame (frame); + if (!expunge_frame) { + goto out; + } - for (i = 0; i < priv->child_count; i++) - if (sh->child_errno[i] == ENOENT) - enoent_count++; + AFR_LOCAL_ALLOC_OR_GOTO (expunge_local, out); - call_count = enoent_count; - local->call_count = call_count; + local = frame->local; + sh = &local->self_heal; + expunge_frame->local = expunge_local; + expunge_sh = &expunge_local->self_heal; + expunge_sh->sh_frame = frame; + loc_copy (&expunge_local->loc, &local->loc); + ret = afr_build_parent_loc (&expunge_sh->parent_loc, + &expunge_local->loc, &op_errno); + if (ret) { + ret = -op_errno; + goto out; + } + sh->expunge_done = expunge_done; + afr_sh_entry_expunge_remove (expunge_frame, this, child_index, buf, + parentbuf); + return; +out: + gf_log (this->name, GF_LOG_ERROR, "Expunge of %s failed, reason: %s", + local->loc.path, strerror (op_errno)); + expunge_done (frame, this, child_index, -1, op_errno); +} - st_mode = st_mode_from_ia (sh->buf[sh->source].ia_prot, - sh->buf[sh->source].ia_type); +void +afr_sh_remove_stale_lookup_info (afr_self_heal_t *sh, int32_t *success_children, + int32_t *fresh_children, + unsigned int child_count) +{ + int i = 0; - dict = dict_new (); - if (!dict) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - sh_missing_entries_finish (frame, this); - return 0; + for (i = 0; i < child_count; i++) { + if (afr_is_child_present (success_children, child_count, i) && + !afr_is_child_present (fresh_children, child_count, i)) { + sh->child_errno[i] = ENOENT; + GF_ASSERT (sh->xattr[i]); + dict_unref (sh->xattr[i]); + sh->xattr[i] = NULL; + } } +} - ret = afr_set_dict_gfid (dict, sh->buf[sh->source].ia_gfid); - if (ret) - gf_log (this->name, GF_LOG_DEBUG, - "inode gfid set failed"); +int +afr_sh_purge_stale_entries_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + local = frame->local; + sh = &local->self_heal; + priv = this->private; - gf_log (this->name, GF_LOG_TRACE, - "mkdir %s mode 0%o on %d subvolumes", - local->loc.path, st_mode, enoent_count); + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { + afr_sh_missing_entries_finish (frame, this); + } else { + if (afr_gfid_missing_count (this->name, sh->fresh_children, + sh->buf, priv->child_count, + local->loc.path)) { + afr_sh_common_lookup (frame, this, &local->loc, + afr_sh_missing_entries_lookup_done, + sh->sh_gfid_req, + AFR_LOOKUP_FAIL_CONFLICTS| + AFR_LOOKUP_FAIL_MISSING_GFIDS, + NULL); + } else { + //No need to set gfid so goto missing entries lookup done + //Behave as if you have done the lookup + afr_sh_remove_stale_lookup_info (sh, + sh->success_children, + sh->fresh_children, + priv->child_count); + afr_children_copy (sh->success_children, + sh->fresh_children, + priv->child_count); + afr_sh_missing_entries_lookup_done (frame, this, 0, 0); + } + } + return 0; +} - for (i = 0; i < priv->child_count; i++) { - if (sh->child_errno[i] == ENOENT) { - if (!strcmp (local->loc.path, "/")) { - /* We shouldn't try to create "/" */ +gf_boolean_t +afr_sh_purge_entry_condition (afr_local_t *local, afr_private_t *priv, + int child) +{ + afr_self_heal_t *sh = NULL; - sh_missing_entries_finish (frame, this); + sh = &local->self_heal; - return 0; - } else { - STACK_WIND_COOKIE (frame, - sh_missing_entries_newentry_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->mkdir, - &local->loc, st_mode, dict); - if (!--call_count) - break; - } - } - } - - if (dict) - dict_unref (dict); + if (local->child_up[child] && + (!afr_is_child_present (sh->fresh_parent_dirs, priv->child_count, + child)) + && (sh->child_errno[child] != ENOENT)) + return _gf_true; - return 0; + return _gf_false; } +gf_boolean_t +afr_sh_purge_stale_entry_condition (afr_local_t *local, afr_private_t *priv, + int child) +{ + afr_self_heal_t *sh = NULL; -static int -sh_missing_entries_symlink (call_frame_t *frame, xlator_t *this, - const char *link, struct iatt *buf) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - dict_t *dict = NULL; - int i = 0; - int ret = 0; - int enoent_count = 0; - int call_count = 0; + sh = &local->self_heal; + if (local->child_up[child] && + (!afr_is_child_present (sh->fresh_children, priv->child_count, + child)) + && (sh->child_errno[child] != ENOENT)) + return _gf_true; - local = frame->local; - sh = &local->self_heal; - priv = this->private; + return _gf_false; +} - for (i = 0; i < priv->child_count; i++) - if (sh->child_errno[i] == ENOENT) - enoent_count++; +void +afr_sh_purge_entry_common (call_frame_t *frame, xlator_t *this, + gf_boolean_t purge_condition (afr_local_t *local, + afr_private_t *priv, + int child)) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_self_heal_t *sh = NULL; + int i = 0; + int call_count = 0; - call_count = enoent_count; - local->call_count = call_count; + local = frame->local; + sh = &local->self_heal; + priv = this->private; - dict = dict_new (); - if (!dict) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - sh_missing_entries_finish (frame, this); - return 0; + for (i = 0; i < priv->child_count; i++) { + if (purge_condition (local, priv, i)) + call_count++; } - ret = afr_set_dict_gfid (dict, buf->ia_gfid); - if (ret) - gf_log (this->name, GF_LOG_DEBUG, - "dict gfid set failed"); - - gf_log (this->name, GF_LOG_TRACE, - "symlink %s -> %s on %d subvolumes", - local->loc.path, link, enoent_count); - - for (i = 0; i < priv->child_count; i++) { - if (sh->child_errno[i] == ENOENT) { - STACK_WIND_COOKIE (frame, - sh_missing_entries_newentry_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->symlink, - link, &local->loc, dict); - if (!--call_count) - break; - } - } + if (call_count == 0) { + sh->post_remove_call (frame, this); + goto out; + } - return 0; + local->call_count = call_count; + for (i = 0; i < priv->child_count; i++) { + if (!purge_condition (local, priv, i)) + continue; + gf_log (this->name, GF_LOG_INFO, "purging the stale entry %s " + "on %s", local->loc.path, priv->children[i]->name); + afr_sh_call_entry_expunge_remove (frame, this, + (long) i, &sh->buf[i], + &sh->parentbufs[i], + afr_sh_remove_entry_cbk); + } +out: + return; } - -static int -sh_missing_entries_readlink_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - const char *link, struct iatt *sbuf) +void +afr_sh_purge_entry (call_frame_t *frame, xlator_t *this) { - if (op_ret > 0) - sh_missing_entries_symlink (frame, this, link, sbuf); - else - sh_missing_entries_finish (frame, this); + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; - return 0; -} + local = frame->local; + sh = &local->self_heal; + sh->post_remove_call = afr_sh_missing_entries_finish; + afr_sh_purge_entry_common (frame, this, afr_sh_purge_entry_condition); +} -static int -sh_missing_entries_readlink (call_frame_t *frame, xlator_t *this) +void +afr_sh_purge_stale_entry (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int i = 0; + local = frame->local; + sh = &local->self_heal; + priv = this->private; - local = frame->local; - sh = &local->self_heal; - priv = this->private; + sh->post_remove_call = afr_sh_purge_stale_entries_done; - STACK_WIND (frame, sh_missing_entries_readlink_cbk, - priv->children[sh->source], - priv->children[sh->source]->fops->readlink, - &local->loc, 4096); + for (i = 0; i < priv->child_count; i++) { + if (afr_is_child_present (sh->fresh_children, + priv->child_count, i)) + continue; - return 0; -} + if ((!local->child_up[i]) || sh->child_errno[i] != 0) + continue; + GF_ASSERT (!uuid_is_null (sh->entrybuf.ia_gfid) || + uuid_is_null (sh->buf[i].ia_gfid)); -static int -sh_missing_entries_create (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int type = 0; - int i = 0; - afr_private_t *priv = NULL; - int enoent_count = 0; - int govinda_gOvinda = 0; + if ((sh->entrybuf.ia_type != sh->buf[i].ia_type) || + (uuid_compare (sh->buf[i].ia_gfid, + sh->entrybuf.ia_gfid))) + continue; + afr_children_add_child (sh->fresh_children, i, + priv->child_count); - local = frame->local; - sh = &local->self_heal; - priv = this->private; + } + afr_sh_purge_entry_common (frame, this, + afr_sh_purge_stale_entry_condition); +} - for (i = 0; i < priv->child_count; i++) { - if (!local->child_up[i]) - continue; +void +afr_sh_save_child_iatts_from_policy (int32_t *children, struct iatt *bufs, + struct iatt *save, + unsigned int child_count) +{ + int i = 0; + int child = 0; + gf_boolean_t saved = _gf_false; - if (sh->child_errno[i]) { - if (sh->child_errno[i] == ENOENT) - enoent_count++; - } else { - if (type) { - if (type != sh->buf[i].ia_type) { - gf_log (this->name, GF_LOG_TRACE, - "file %s is govinda!", - local->loc.path); - - govinda_gOvinda = 1; - } - } else { - sh->source = i; - type = sh->buf[i].ia_type; - } - } - } + GF_ASSERT (save); + //if iatt buf with gfid exists sets it + for (i = 0; i < child_count; i++) { + child = children[i]; + if (child == -1) + break; + *save = bufs[child]; + saved = _gf_true; + if (!uuid_is_null (save->ia_gfid)) + break; + } + GF_ASSERT (saved); +} - if (govinda_gOvinda) { - gf_log (this->name, GF_LOG_ERROR, - "conflicting filetypes exist for path %s. returning.", - local->loc.path); +void +afr_get_children_of_fresh_parent_dirs (afr_self_heal_t *sh, + unsigned int child_count) +{ + afr_children_intersection_get (sh->success_children, + sh->fresh_parent_dirs, + sh->sources, child_count); + afr_get_fresh_children (sh->success_children, sh->sources, + sh->fresh_children, child_count); + memset (sh->sources, 0, sizeof (*sh->sources) * child_count); +} - local->govinda_gOvinda = 1; - sh_missing_entries_finish (frame, this); - return 0; - } +void +afr_sh_children_lookup_done (call_frame_t *frame, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int32_t fresh_child_enoents = 0; + int32_t fresh_parent_count = 0; - if (!type) { - gf_log (this->name, GF_LOG_ERROR, - "no source found for %s. all nodes down?. returning.", - local->loc.path); - /* subvolumes down and/or file does not exist */ - sh_missing_entries_finish (frame, this); - return 0; - } + local = frame->local; + sh = &local->self_heal; + priv = this->private; - if (enoent_count == 0) { - gf_log (this->name, GF_LOG_ERROR, - "no missing files - %s. proceeding to metadata check", - local->loc.path); - /* proceed to next step - metadata self-heal */ - sh_missing_entries_finish (frame, this); - return 0; - } + if (op_ret < 0) + goto fail; + afr_get_children_of_fresh_parent_dirs (sh, priv->child_count); + fresh_parent_count = afr_get_children_count (sh->fresh_parent_dirs, + priv->child_count); + //we need the enoent count of the subvols present in fresh_parent_dirs + fresh_child_enoents = afr_errno_count (sh->fresh_parent_dirs, + sh->child_errno, + priv->child_count, ENOENT); + if (fresh_child_enoents == fresh_parent_count) { + afr_sh_set_error (sh, ENOENT); + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + afr_sh_purge_entry (frame, this); + } else if (!afr_conflicting_iattrs (sh->buf, sh->fresh_children, + priv->child_count, local->loc.path, + this->name)) { + afr_sh_save_child_iatts_from_policy (sh->fresh_children, + sh->buf, &sh->entrybuf, + priv->child_count); + afr_update_gfid_from_iatts (sh->sh_gfid_req, sh->buf, + sh->fresh_children, + priv->child_count); + afr_sh_purge_stale_entry (frame, this); + } else { + op_errno = EIO; + afr_set_local_for_unhealable (local); + goto fail; + } - switch (type) { - case IA_IFSOCK: - case IA_IFREG: - case IA_IFBLK: - case IA_IFCHR: - case IA_IFIFO: - sh_missing_entries_mknod (frame, this); - break; - case IA_IFLNK: - sh_missing_entries_readlink (frame, this); - break; - case IA_IFDIR: - sh_missing_entries_mkdir (frame, this); - break; - default: - gf_log (this->name, GF_LOG_ERROR, - "unknown file type: 0%o", type); - local->govinda_gOvinda = 1; - sh_missing_entries_finish (frame, this); - } + return; - return 0; +fail: + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + afr_sh_set_error (sh, op_errno); + afr_sh_missing_entries_finish (frame, this); + return; } +static void +afr_sh_find_fresh_parents (call_frame_t *frame, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int enoent_count = 0; + int nsources = 0; + int source = -1; + int32_t subvol_status = 0; -static int -sh_missing_entries_lookup_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, dict_t *xattr, - struct iatt *postparent) -{ - int child_index = 0; - afr_local_t *local = NULL; - int call_count = 0; - afr_private_t *priv = NULL; - - - local = frame->local; - priv = this->private; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (op_ret == 0) { - gf_log (this->name, GF_LOG_TRACE, - "path %s on subvolume %s is of mode 0%o", - local->loc.path, - priv->children[child_index]->name, - buf->ia_type); - - local->self_heal.buf[child_index] = *buf; - local->self_heal.parentbuf = *postparent; - } else { - gf_log (this->name, GF_LOG_TRACE, - "path %s on subvolume %s => -1 (%s)", - local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - - local->self_heal.child_errno[child_index] = op_errno; - } + local = frame->local; + sh = &local->self_heal; + priv = this->private; - } - UNLOCK (&frame->lock); + if (op_ret < 0) + goto out; + enoent_count = afr_errno_count (NULL, sh->child_errno, + priv->child_count, ENOENT); + if (enoent_count > 0) { + gf_log (this->name, GF_LOG_INFO, "Parent dir missing for %s," + " in missing entry self-heal, aborting missing-entry " + "self-heal", + local->loc.path); + afr_sh_missing_entries_finish (frame, this); + return; + } - call_count = afr_frame_return (frame); + nsources = afr_build_sources (this, sh->xattr, sh->buf, + sh->pending_matrix, sh->sources, + sh->success_children, + AFR_ENTRY_TRANSACTION, &subvol_status, + _gf_true); + if ((subvol_status & ALL_FOOLS) || + (subvol_status & SPLIT_BRAIN)) { + gf_log (this->name, GF_LOG_INFO, "%s: Performing conservative " + "merge", sh->parent_loc.path); + afr_mark_success_children_sources (sh->sources, + sh->success_children, + priv->child_count); + } else if (nsources < 0) { + gf_log (this->name, GF_LOG_ERROR, "No sources for dir " + "of %s, in missing entry self-heal, aborting " + "self-heal", local->loc.path); + op_errno = EIO; + goto out; + } - if (call_count == 0) { - sh_missing_entries_create (frame, this); - } + source = afr_sh_select_source (sh->sources, priv->child_count); + if (source == -1) { + GF_ASSERT (0); + gf_log (this->name, GF_LOG_DEBUG, "No active sources found."); + op_errno = EIO; + goto out; + } + afr_get_fresh_children (sh->success_children, sh->sources, + sh->fresh_parent_dirs, priv->child_count); + afr_sh_common_lookup (frame, this, &local->loc, + afr_sh_children_lookup_done, NULL, 0, + NULL); + return; - return 0; +out: + afr_sh_set_error (sh, op_errno); + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + afr_sh_missing_entries_finish (frame, this); + return; } +void +afr_sh_common_reset (afr_self_heal_t *sh, unsigned int child_count) +{ + int i = 0; -static int -sh_missing_entries_lookup (call_frame_t *frame, xlator_t *this) + for (i = 0; i < child_count; i++) { + memset (&sh->buf[i], 0, sizeof (sh->buf[i])); + memset (&sh->parentbufs[i], 0, sizeof (sh->parentbufs[i])); + sh->child_errno[i] = 0; + } + memset (&sh->parentbuf, 0, sizeof (sh->parentbuf)); + sh->success_count = 0; + afr_reset_children (sh->success_children, child_count); + afr_reset_children (sh->fresh_children, child_count); + afr_reset_xattr (sh->xattr, child_count); + loc_wipe (&sh->lookup_loc); +} + +/* afr self-heal state will be lost if this call is made + * please check the afr_sh_common_reset that is called in this function + */ +int +afr_sh_common_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, + afr_lookup_done_cbk_t lookup_done , uuid_t gfid, + int32_t flags, dict_t *xdata) { - afr_local_t *local = NULL; - int i = 0; - int call_count = 0; - afr_private_t *priv = NULL; - dict_t *xattr_req = NULL; - int ret = -1; + afr_local_t *local = NULL; + int i = 0; + int call_count = 0; + afr_private_t *priv = NULL; + dict_t *xattr_req = NULL; + afr_self_heal_t *sh = NULL; - local = frame->local; - priv = this->private; + local = frame->local; + priv = this->private; + sh = &local->self_heal; - call_count = afr_up_children_count (priv->child_count, - local->child_up); + call_count = afr_up_children_count (local->child_up, priv->child_count); - local->call_count = call_count; + local->call_count = call_count; - xattr_req = dict_new(); + xattr_req = dict_new(); - if (xattr_req) { - for (i = 0; i < priv->child_count; i++) { - ret = dict_set_uint64 (xattr_req, - priv->pending_key[i], - 3 * sizeof(int32_t)); - if (ret < 0) - gf_log (this->name, GF_LOG_WARNING, - "Unable to set dict value."); + if (xattr_req) { + afr_xattr_req_prepare (this, xattr_req, loc->path); + if (gfid) { + gf_log (this->name, GF_LOG_DEBUG, + "looking up %s with gfid: %s", + loc->path, uuid_utoa (gfid)); + GF_ASSERT (!uuid_is_null (gfid)); + afr_set_dict_gfid (xattr_req, gfid); } } - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - gf_log (this->name, GF_LOG_TRACE, - "looking up %s on subvolume %s", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, - sh_missing_entries_lookup_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->lookup, - &local->loc, xattr_req); - - if (!--call_count) - break; - } - } + afr_sh_common_reset (sh, priv->child_count); + sh->lookup_done = lookup_done; + loc_copy (&sh->lookup_loc, loc); + sh->lookup_flags = flags; + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + gf_log (this->name, GF_LOG_DEBUG, + "looking up %s on subvolume %s", + loc->path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, + afr_sh_common_lookup_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->lookup, + loc, xattr_req); + + if (!--call_count) + break; + } + } - if (xattr_req) - dict_unref (xattr_req); + if (xattr_req) + dict_unref (xattr_req); - return 0; + return 0; } int -afr_sh_post_nonblocking_entrylk_cbk (call_frame_t *frame, xlator_t *this) +afr_sh_post_nb_entrylk_missing_entry_sh_cbk (call_frame_t *frame, + xlator_t *this) { afr_internal_lock_t *int_lock = NULL; afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; local = frame->local; int_lock = &local->internal_lock; + sh = &local->self_heal; if (int_lock->lock_op_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, + gf_log (this->name, GF_LOG_INFO, "Non blocking entrylks failed."); + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_missing_entries_done (frame, this); } else { gf_log (this->name, GF_LOG_DEBUG, "Non blocking entrylks done. Proceeding to FOP"); - sh_missing_entries_lookup (frame, this); + afr_sh_common_lookup (frame, this, &sh->parent_loc, + afr_sh_find_fresh_parents, + NULL, AFR_LOOKUP_FAIL_CONFLICTS, + NULL); } return 0; } -static int -afr_sh_entrylk (call_frame_t *frame, xlator_t *this) +int +afr_sh_entrylk (call_frame_t *frame, xlator_t *this, loc_t *loc, + char *base_name, afr_lock_cbk_t lock_cbk) { afr_internal_lock_t *int_lock = NULL; afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + priv = this->private; local = frame->local; int_lock = &local->internal_lock; - sh = &local->self_heal; int_lock->transaction_lk_type = AFR_SELFHEAL_LK; int_lock->selfheal_lk_type = AFR_ENTRY_SELF_HEAL_LK; afr_set_lock_number (frame, this); - int_lock->lk_basename = local->loc.name; - int_lock->lk_loc = &sh->parent_loc; - int_lock->lock_cbk = afr_sh_post_nonblocking_entrylk_cbk; + int_lock->lk_basename = base_name; + int_lock->lk_loc = loc; + int_lock->lock_cbk = lock_cbk; + int_lock->domain = this->name; + int_lock->lockee_count = 0; + afr_init_entry_lockee (&int_lock->lockee[0], local, loc, + base_name, priv->child_count); + int_lock->lockee_count++; afr_nonblocking_entrylk (frame, this); return 0; } static int -afr_self_heal_missing_entries (call_frame_t *frame, xlator_t *this) +afr_self_heal_parent_entrylk (call_frame_t *frame, xlator_t *this, + afr_lock_cbk_t lock_cbk) { + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; + int ret = -1; + int32_t op_errno = 0; - local = frame->local; - int_lock = &local->internal_lock; - sh = &local->self_heal; - priv = this->private; + local = frame->local; + sh = &local->self_heal; - gf_log (this->name, GF_LOG_TRACE, - "attempting to recreate missing entries for path=%s", - local->loc.path); + gf_log (this->name, GF_LOG_TRACE, + "attempting to recreate missing entries for path=%s", + local->loc.path); - afr_build_parent_loc (&sh->parent_loc, &local->loc); + ret = afr_build_parent_loc (&sh->parent_loc, &local->loc, &op_errno); + if (ret) + goto out; - afr_sh_entrylk (frame, this); - return 0; + afr_sh_entrylk (frame, this, &sh->parent_loc, NULL, + lock_cbk); + return 0; +out: + int_lock = &local->internal_lock; + int_lock->lock_op_ret = -1; + lock_cbk (frame, this); + return 0; } -afr_local_t *afr_local_copy (afr_local_t *l, xlator_t *this) +static int +afr_self_heal_missing_entries (call_frame_t *frame, xlator_t *this) { - afr_private_t *priv = NULL; - afr_local_t *lc = NULL; + afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; - afr_self_heal_t *shc = NULL; + local = frame->local; + sh = &local->self_heal; + + sh->sh_type_in_action = AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY; + + afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED); + + afr_self_heal_parent_entrylk (frame, this, + afr_sh_post_nb_entrylk_missing_entry_sh_cbk); + return 0; +} + +afr_local_t* +afr_self_heal_local_init (afr_local_t *l, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_local_t *lc = NULL; + afr_self_heal_t *sh = NULL; + afr_self_heal_t *shc = NULL; + int ret = 0; priv = this->private; sh = &l->self_heal; - lc = GF_CALLOC (1, sizeof (afr_local_t), - gf_afr_mt_afr_local_t); + lc = mem_get0 (this->local_pool); + if (!lc) + goto out; shc = &lc->self_heal; shc->unwind = sh->unwind; - shc->need_data_self_heal = sh->need_data_self_heal; - shc->need_metadata_self_heal = sh->need_metadata_self_heal; - shc->need_entry_self_heal = sh->need_entry_self_heal; + shc->gfid_sh_success_cbk = sh->gfid_sh_success_cbk; + shc->do_missing_entry_self_heal = sh->do_missing_entry_self_heal; + shc->do_gfid_self_heal = sh->do_gfid_self_heal; + shc->do_data_self_heal = sh->do_data_self_heal; + shc->do_metadata_self_heal = sh->do_metadata_self_heal; + shc->do_entry_self_heal = sh->do_entry_self_heal; + shc->force_confirm_spb = sh->force_confirm_spb; shc->forced_merge = sh->forced_merge; - shc->healing_fd_opened = sh->healing_fd_opened; - shc->data_lock_held = sh->data_lock_held; - if (sh->healing_fd && !sh->healing_fd_opened) - shc->healing_fd = fd_ref (sh->healing_fd); - else - shc->healing_fd = sh->healing_fd; shc->background = sh->background; shc->type = sh->type; + shc->data_sh_info = ""; + shc->metadata_sh_info = ""; + + uuid_copy (shc->sh_gfid_req, sh->sh_gfid_req); + if (l->loc.path) { + ret = loc_copy (&lc->loc, &l->loc); + if (ret < 0) + goto out; + } - if (l->loc.path) - loc_copy (&lc->loc, &l->loc); + lc->child_up = memdup (l->child_up, + sizeof (*lc->child_up) * priv->child_count); + if (!lc->child_up) { + ret = -1; + goto out; + } - lc->child_up = memdup (l->child_up, priv->child_count); if (l->xattr_req) lc->xattr_req = dict_ref (l->xattr_req); @@ -1464,69 +2245,67 @@ afr_local_t *afr_local_copy (afr_local_t *l, xlator_t *this) lc->cont.lookup.inode = inode_ref (l->cont.lookup.inode); if (l->cont.lookup.xattr) lc->cont.lookup.xattr = dict_ref (l->cont.lookup.xattr); - if (l->internal_lock.inode_locked_nodes) - lc->internal_lock.inode_locked_nodes = - memdup (l->internal_lock.inode_locked_nodes, - priv->child_count); - else - lc->internal_lock.inode_locked_nodes = - GF_CALLOC (sizeof (*l->internal_lock.inode_locked_nodes), - priv->child_count, - gf_afr_mt_char); - if (l->internal_lock.entry_locked_nodes) - lc->internal_lock.entry_locked_nodes = - memdup (l->internal_lock.entry_locked_nodes, - priv->child_count); - else - lc->internal_lock.entry_locked_nodes = - GF_CALLOC (sizeof (*l->internal_lock.entry_locked_nodes), - priv->child_count, - gf_afr_mt_char); - if (l->internal_lock.locked_nodes) - lc->internal_lock.locked_nodes = - memdup (l->internal_lock.locked_nodes, - priv->child_count); - else - lc->internal_lock.locked_nodes = - GF_CALLOC (sizeof (*l->internal_lock.locked_nodes), - priv->child_count, - gf_afr_mt_char); - lc->internal_lock.inodelk_lock_count = - l->internal_lock.inodelk_lock_count; - lc->internal_lock.entrylk_lock_count = - l->internal_lock.entrylk_lock_count; + lc->internal_lock.locked_nodes = + GF_CALLOC (sizeof (*l->internal_lock.locked_nodes), + priv->child_count, gf_afr_mt_char); + if (!lc->internal_lock.locked_nodes) { + ret = -1; + goto out; + } + + ret = afr_inodelk_init (&lc->internal_lock.inodelk[0], + this->name, priv->child_count); + if (ret) + goto out; +out: + if (ret) { + afr_local_cleanup (lc, this); + lc = NULL; + } return lc; } - int afr_self_heal_completion_cbk (call_frame_t *bgsh_frame, xlator_t *this) { afr_private_t * priv = NULL; - afr_local_t * local = NULL; + afr_local_t * local = NULL; afr_self_heal_t * sh = NULL; + afr_local_t * orig_frame_local = NULL; + afr_self_heal_t * orig_frame_sh = NULL; char sh_type_str[256] = {0,}; + gf_loglevel_t loglevel = 0; priv = this->private; - local = bgsh_frame->local; + local = bgsh_frame->local; sh = &local->self_heal; - if (local->govinda_gOvinda) { - afr_set_split_brain (this, local->cont.lookup.inode, _gf_true); - } else { - afr_set_split_brain (this, local->cont.lookup.inode, _gf_false); - } + if (local->unhealable) { + afr_set_split_brain (this, sh->inode, SPB, SPB); + } - afr_self_heal_type_str_get(sh, sh_type_str, - sizeof(sh_type_str)); - gf_log (this->name, GF_LOG_NORMAL, - "background %s self-heal completed on %s", sh_type_str, - local->loc.path); + afr_self_heal_type_str_get (sh, sh_type_str, + sizeof(sh_type_str)); + if (is_self_heal_failed (sh, AFR_CHECK_ALL) && !priv->shd.iamshd) { + loglevel = GF_LOG_ERROR; + } else if (!is_self_heal_failed (sh, AFR_CHECK_ALL)) { + loglevel = GF_LOG_INFO; + } else { + loglevel = GF_LOG_DEBUG; + } + + afr_log_self_heal_completion_status (local, loglevel); - if (!sh->unwound) { - sh->unwind (sh->orig_frame, this); + FRAME_SU_UNDO (bgsh_frame, afr_local_t); + + if (!sh->unwound && sh->unwind) { + orig_frame_local = sh->orig_frame->local; + orig_frame_sh = &orig_frame_local->self_heal; + orig_frame_sh->actual_sh_started = _gf_true; + sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno, + is_self_heal_failed (sh, AFR_CHECK_ALL)); } if (sh->background) { @@ -1539,119 +2318,495 @@ afr_self_heal_completion_cbk (call_frame_t *bgsh_frame, xlator_t *this) AFR_STACK_DESTROY (bgsh_frame); - return 0; + return 0; } int -afr_self_heal (call_frame_t *frame, xlator_t *this) +afr_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int i = 0; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int32_t op_errno = 0; + int ret = 0; + afr_self_heal_t *orig_sh = NULL; + call_frame_t *sh_frame = NULL; + afr_local_t *sh_local = NULL; + loc_t *loc = NULL; - call_frame_t *sh_frame = NULL; - afr_local_t *sh_local = NULL; + local = frame->local; + orig_sh = &local->self_heal; + priv = this->private; + + GF_ASSERT (local->loc.path); + + gf_log (this->name, GF_LOG_TRACE, + "performing self heal on %s (metadata=%d data=%d entry=%d)", + local->loc.path, + local->self_heal.do_metadata_self_heal, + local->self_heal.do_data_self_heal, + local->self_heal.do_entry_self_heal); + + op_errno = ENOMEM; + sh_frame = copy_frame (frame); + if (!sh_frame) + goto out; + afr_set_lk_owner (sh_frame, this, sh_frame->root); + afr_set_low_priority (sh_frame); + + sh_local = afr_self_heal_local_init (local, this); + if (!sh_local) + goto out; + sh_frame->local = sh_local; + sh = &sh_local->self_heal; + + sh->inode = inode_ref (inode); + sh->orig_frame = frame; + sh->completion_cbk = afr_self_heal_completion_cbk; - local = frame->local; - priv = this->private; + sh->success = GF_CALLOC (priv->child_count, sizeof (*sh->success), + gf_afr_mt_char); + if (!sh->success) + goto out; + sh->sources = GF_CALLOC (sizeof (*sh->sources), priv->child_count, + gf_afr_mt_int); + if (!sh->sources) + goto out; + sh->locked_nodes = GF_CALLOC (sizeof (*sh->locked_nodes), + priv->child_count, + gf_afr_mt_int); + if (!sh->locked_nodes) + goto out; - afr_set_lk_owner (frame, this); + sh->pending_matrix = afr_matrix_create (priv->child_count, + priv->child_count); + if (!sh->pending_matrix) + goto out; + + sh->delta_matrix = afr_matrix_create (priv->child_count, + priv->child_count); + if (!sh->delta_matrix) + goto out; + + sh->fresh_parent_dirs = afr_children_create (priv->child_count); + if (!sh->fresh_parent_dirs) + goto out; + ret = afr_sh_common_create (sh, priv->child_count); + if (ret) { + op_errno = -ret; + goto out; + } if (local->self_heal.background) { LOCK (&priv->lock); { if (priv->background_self_heals_started - > priv->background_self_heal_count) { + < priv->background_self_heal_count) { + priv->background_self_heals_started++; - local->self_heal.background = _gf_false; } else { - priv->background_self_heals_started++; + local->self_heal.background = _gf_false; + sh->background = _gf_false; } } UNLOCK (&priv->lock); } - gf_log (this->name, GF_LOG_TRACE, - "performing self heal on %s (metadata=%d data=%d entry=%d)", - local->loc.path, - local->self_heal.need_metadata_self_heal, - local->self_heal.need_data_self_heal, - local->self_heal.need_entry_self_heal); + if (!local->loc.parent) { + sh->do_missing_entry_self_heal = _gf_false; + sh->do_gfid_self_heal = _gf_false; + } - sh_frame = copy_frame (frame); - sh_local = afr_local_copy (local, this); - sh_frame->local = sh_local; - sh = &sh_local->self_heal; + sh->sh_type_in_action = AFR_SELF_HEAL_INVALID; - sh->orig_frame = frame; + FRAME_SU_DO (sh_frame, afr_local_t); + if (sh->do_missing_entry_self_heal || sh->do_gfid_self_heal) { + afr_self_heal_missing_entries (sh_frame, this); + } else { + loc = &sh_local->loc; + if (uuid_is_null (loc->inode->gfid) && uuid_is_null (loc->gfid)) { + if (!uuid_is_null (inode->gfid)) + GF_ASSERT (!uuid_compare (inode->gfid, + sh->sh_gfid_req)); + uuid_copy (loc->gfid, sh->sh_gfid_req); + } + gf_log (this->name, GF_LOG_TRACE, + "proceeding to metadata check on %s", + local->loc.path); - sh->completion_cbk = afr_self_heal_completion_cbk; + afr_sh_missing_entries_done (sh_frame, this); + } + op_errno = 0; +out: + if (op_errno) { + orig_sh->unwind (frame, this, -1, op_errno, 1); + if (sh_frame) + AFR_STACK_DESTROY (sh_frame); + } + return 0; +} - sh->buf = GF_CALLOC (priv->child_count, sizeof (struct iatt), - gf_afr_mt_iatt); - sh->child_errno = GF_CALLOC (priv->child_count, sizeof (int), - gf_afr_mt_int); - sh->success = GF_CALLOC (priv->child_count, sizeof (int), - gf_afr_mt_int); - sh->xattr = GF_CALLOC (priv->child_count, sizeof (dict_t *), - gf_afr_mt_dict_t); - sh->sources = GF_CALLOC (sizeof (*sh->sources), priv->child_count, - gf_afr_mt_int); - sh->locked_nodes = GF_CALLOC (sizeof (*sh->locked_nodes), - priv->child_count, - gf_afr_mt_int); +void +afr_self_heal_type_str_get (afr_self_heal_t *self_heal_p, char *str, + size_t size) +{ + GF_ASSERT (str && (size > strlen (" missing-entry gfid " + "meta-data data entry"))); - sh->pending_matrix = GF_CALLOC (sizeof (int32_t *), priv->child_count, - gf_afr_mt_int32_t); + if (self_heal_p->do_metadata_self_heal) { + snprintf (str, size, " meta-data"); + } - for (i = 0; i < priv->child_count; i++) { - sh->pending_matrix[i] = GF_CALLOC (sizeof (int32_t), - priv->child_count, - gf_afr_mt_int32_t); - } + if (self_heal_p->do_data_self_heal) { + snprintf (str + strlen(str), size - strlen(str), " data"); + } - sh->delta_matrix = GF_CALLOC (sizeof (int32_t *), priv->child_count, - gf_afr_mt_int32_t); - for (i = 0; i < priv->child_count; i++) { - sh->delta_matrix[i] = GF_CALLOC (sizeof (int32_t), - priv->child_count, - gf_afr_mt_int32_t); - } + if (self_heal_p->do_entry_self_heal) { + snprintf (str + strlen(str), size - strlen(str), " entry"); + } - if (local->success_count && local->enoent_count) { - afr_self_heal_missing_entries (sh_frame, this); - } else { - gf_log (this->name, GF_LOG_TRACE, - "proceeding to metadata check on %s", - local->loc.path); + if (self_heal_p->do_missing_entry_self_heal) { + snprintf (str + strlen(str), size - strlen(str), + " missing-entry"); + } - afr_sh_missing_entries_done (sh_frame, this); - } + if (self_heal_p->do_gfid_self_heal) { + snprintf (str + strlen(str), size - strlen(str), " gfid"); + } +} - return 0; +afr_self_heal_type +afr_self_heal_type_for_transaction (afr_transaction_type type) +{ + afr_self_heal_type sh_type = AFR_SELF_HEAL_INVALID; + + switch (type) { + case AFR_DATA_TRANSACTION: + sh_type = AFR_SELF_HEAL_DATA; + break; + case AFR_METADATA_TRANSACTION: + sh_type = AFR_SELF_HEAL_METADATA; + break; + case AFR_ENTRY_TRANSACTION: + sh_type = AFR_SELF_HEAL_ENTRY; + break; + case AFR_ENTRY_RENAME_TRANSACTION: + GF_ASSERT (0); + break; + } + return sh_type; +} + +int +afr_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name) +{ + int ret = -1; + uuid_t pargfid = {0}; + + if (!child) + goto out; + + if (!uuid_is_null (parent->inode->gfid)) + uuid_copy (pargfid, parent->inode->gfid); + else if (!uuid_is_null (parent->gfid)) + uuid_copy (pargfid, parent->gfid); + + if (uuid_is_null (pargfid)) + goto out; + + if (strcmp (parent->path, "/") == 0) + ret = gf_asprintf ((char **)&child->path, "/%s", name); + else + ret = gf_asprintf ((char **)&child->path, "%s/%s", parent->path, + name); + + if (-1 == ret) { + gf_log (this->name, GF_LOG_ERROR, + "asprintf failed while setting child path"); + } + + child->name = strrchr (child->path, '/'); + if (child->name) + child->name++; + + child->parent = inode_ref (parent->inode); + child->inode = inode_new (parent->inode->table); + uuid_copy (child->pargfid, pargfid); + + if (!child->inode) { + ret = -1; + goto out; + } + + ret = 0; +out: + if ((ret == -1) && child) + loc_wipe (child); + + return ret; +} + +int +afr_sh_erase_pending (call_frame_t *frame, xlator_t *this, + afr_transaction_type type, afr_fxattrop_cbk_t cbk, + int (*finish)(call_frame_t *frame, xlator_t *this)) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int i = 0; + dict_t **erase_xattr = NULL; + int ret = -1; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + afr_sh_pending_to_delta (priv, sh->xattr, sh->delta_matrix, + sh->success, priv->child_count, type); + + erase_xattr = GF_CALLOC (sizeof (*erase_xattr), priv->child_count, + gf_afr_mt_dict_t); + if (!erase_xattr) + goto out; + + for (i = 0; i < priv->child_count; i++) { + if (sh->xattr[i]) { + call_count++; + erase_xattr[i] = dict_new (); + if (!erase_xattr[i]) + goto out; + } + } + + afr_sh_delta_to_xattr (this, sh->delta_matrix, erase_xattr, + priv->child_count, type); + + gf_log (this->name, GF_LOG_DEBUG, "Delta matrix for: %s", + lkowner_utoa (&frame->root->lk_owner)); + afr_sh_print_pending_matrix (sh->delta_matrix, this); + local->call_count = call_count; + if (call_count == 0) { + ret = 0; + finish (frame, this); + goto out; + } + + for (i = 0; i < priv->child_count; i++) { + if (!erase_xattr[i]) + continue; + + if (sh->healing_fd) {//true for ENTRY, reg file DATA transaction + STACK_WIND_COOKIE (frame, cbk, (void *) (long) i, + priv->children[i], + priv->children[i]->fops->fxattrop, + sh->healing_fd, + GF_XATTROP_ADD_ARRAY, erase_xattr[i], + NULL); + } else { + STACK_WIND_COOKIE (frame, cbk, (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->loc, + GF_XATTROP_ADD_ARRAY, erase_xattr[i], + NULL); + } + } + + ret = 0; +out: + if (erase_xattr) { + for (i = 0; i < priv->child_count; i++) { + if (erase_xattr[i]) { + dict_unref (erase_xattr[i]); + } + } + } + + GF_FREE (erase_xattr); + + if (ret < 0) { + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + finish (frame, this); + } + + return 0; } void -afr_self_heal_type_str_get (afr_self_heal_t *self_heal_p, char *str, - size_t size) +afr_set_self_heal_status(afr_self_heal_t *sh, afr_self_heal_status status) +{ + xlator_t *this = NULL; + afr_sh_status_for_all_type *sh_status = &(sh->afr_all_sh_status); + afr_self_heal_type sh_type_in_action = sh->sh_type_in_action; + this = THIS; + + if (!sh) { + gf_log_callingfn (this->name, GF_LOG_ERROR, "Null self heal" + "Structure"); + goto out; + } + + switch (sh_type_in_action) { + case AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY: + sh_status->gfid_or_missing_entry_self_heal = status; + break; + case AFR_SELF_HEAL_METADATA: + sh_status->metadata_self_heal = status; + break; + case AFR_SELF_HEAL_DATA: + sh_status->data_self_heal = status; + break; + case AFR_SELF_HEAL_ENTRY: + sh_status->entry_self_heal = status; + break; + case AFR_SELF_HEAL_INVALID: + gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid" + "self heal type in action"); + break; + } +out: + return; +} + +void +afr_set_local_for_unhealable (afr_local_t *local) +{ + afr_self_heal_t *sh = NULL; + + sh = &local->self_heal; + + local->unhealable = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); +} + +int +is_self_heal_failed (afr_self_heal_t *sh, afr_sh_fail_check_type type) { - assert(str && (size > 0)); + afr_sh_status_for_all_type sh_status = sh->afr_all_sh_status; + afr_self_heal_type sh_type_in_action = AFR_SELF_HEAL_INVALID; + afr_self_heal_status status = AFR_SELF_HEAL_FAILED; + xlator_t *this = NULL; + int sh_failed = 0; + + this = THIS; + + if (!sh) { + gf_log_callingfn (this->name, GF_LOG_ERROR, "Null self heal " + "structure"); + sh_failed = 1; + goto out; + } + + if (type == AFR_CHECK_ALL) { + if ((sh_status.gfid_or_missing_entry_self_heal == AFR_SELF_HEAL_FAILED) + || (sh_status.metadata_self_heal == AFR_SELF_HEAL_FAILED) + || (sh_status.data_self_heal == AFR_SELF_HEAL_FAILED) + || (sh_status.entry_self_heal == AFR_SELF_HEAL_FAILED)) + sh_failed = 1; + } else if (type == AFR_CHECK_SPECIFIC) { + sh_type_in_action = sh->sh_type_in_action; + switch (sh_type_in_action) { + case AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY: + status = sh_status.gfid_or_missing_entry_self_heal; + break; + case AFR_SELF_HEAL_METADATA: + status = sh_status.metadata_self_heal; + break; + case AFR_SELF_HEAL_ENTRY: + status = sh_status.entry_self_heal; + break; + case AFR_SELF_HEAL_DATA: + status = sh_status.data_self_heal; + break; + case AFR_SELF_HEAL_INVALID: + status = AFR_SELF_HEAL_NOT_ATTEMPTED; + break; + } + if (status == AFR_SELF_HEAL_FAILED) + sh_failed = 1; - if (self_heal_p->need_metadata_self_heal) { - snprintf(str, size, " meta-data"); } - if (self_heal_p->need_data_self_heal) { - snprintf(str + strlen(str), size - strlen(str), - " data"); +out: + return sh_failed; +} + +char * +get_sh_completion_status (afr_self_heal_status status) +{ + + char *not_attempted = " is not attempted"; + char *failed = " failed"; + char *started = " is started"; + char *sync_begin = " is successfully completed"; + char *result = " has unknown status"; + + switch (status) + { + case AFR_SELF_HEAL_NOT_ATTEMPTED: + result = not_attempted; + break; + case AFR_SELF_HEAL_FAILED: + result = failed; + break; + case AFR_SELF_HEAL_STARTED: + result = started; + break; + case AFR_SELF_HEAL_SYNC_BEGIN: + result = sync_begin; + break; } - if (self_heal_p->need_entry_self_heal) { - snprintf(str + strlen(str), size - strlen(str), - " entry"); + return result; + +} + +void +afr_log_self_heal_completion_status (afr_local_t *local, gf_loglevel_t loglvl) +{ + + char sh_log[4096] = {0}; + afr_self_heal_t *sh = &local->self_heal; + afr_sh_status_for_all_type all_status = sh->afr_all_sh_status; + xlator_t *this = NULL; + size_t off = 0; + int data_sh = 0; + int metadata_sh = 0; + int print_log = 0; + + this = THIS; + + ADD_FMT_STRING (sh_log, off, "gfid or missing entry", + all_status.gfid_or_missing_entry_self_heal, print_log); + ADD_FMT_STRING_SYNC (sh_log, off, "metadata", + all_status.metadata_self_heal, print_log); + if (sh->background) { + ADD_FMT_STRING_SYNC (sh_log, off, "backgroung data", + all_status.data_self_heal, print_log); + } else { + ADD_FMT_STRING_SYNC (sh_log, off, "foreground data", + all_status.data_self_heal, print_log); } + ADD_FMT_STRING_SYNC (sh_log, off, "entry", all_status.entry_self_heal, + print_log); + + if (AFR_SELF_HEAL_SYNC_BEGIN == all_status.data_self_heal && + strcmp (sh->data_sh_info, "") && sh->data_sh_info ) + data_sh = 1; + if (AFR_SELF_HEAL_SYNC_BEGIN == all_status.metadata_self_heal && + strcmp (sh->metadata_sh_info, "") && sh->metadata_sh_info) + metadata_sh = 1; + + if (!print_log) + return; + + gf_log (this->name, loglvl, "%s %s %s on %s", sh_log, + ((data_sh == 1) ? sh->data_sh_info : ""), + ((metadata_sh == 1) ? sh->metadata_sh_info : ""), + local->loc.path); } diff --git a/xlators/cluster/afr/src/afr-self-heal-common.h b/xlators/cluster/afr/src/afr-self-heal-common.h index ecec8a5ae..473264776 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.h +++ b/xlators/cluster/afr/src/afr-self-heal-common.h @@ -1,73 +1,144 @@ /* - Copyright (c) 2008-2009 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef __AFR_SELF_HEAL_COMMON_H__ #define __AFR_SELF_HEAL_COMMON_H__ #define FILE_HAS_HOLES(buf) (((buf)->ia_size) > ((buf)->ia_blocks * 512)) +#define AFR_SH_MIN_PARTICIPANTS 2 typedef enum { - AFR_SELF_HEAL_ENTRY, - AFR_SELF_HEAL_METADATA, - AFR_SELF_HEAL_DATA, -} afr_self_heal_type; + AFR_LOOKUP_FAIL_CONFLICTS = 1, + AFR_LOOKUP_FAIL_MISSING_GFIDS = 2, +} afr_lookup_flags_t; int afr_sh_select_source (int sources[], int child_count); int -afr_sh_sink_count (int sources[], int child_count); - -int afr_sh_source_count (int sources[], int child_count); -int -afr_sh_supress_errenous_children (int sources[], int child_errno[], - int child_count); - void afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this); void -afr_sh_build_pending_matrix (afr_private_t *priv, - int32_t *pending_matrix[], dict_t *xattr[], - int child_count, afr_transaction_type type); +afr_sh_print_split_brain_log (int32_t *pending_matrix[], xlator_t *this, + const char *loc); + +int +afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix, + unsigned char *ignorant_subvols, + dict_t *xattr[], afr_transaction_type type, + size_t child_count); void afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr, - int32_t *delta_matrix[], int success[], + int32_t *delta_matrix[], unsigned char success[], int child_count, afr_transaction_type type); int -afr_sh_mark_sources (afr_self_heal_t *sh, int child_count, - afr_self_heal_type type); +afr_mark_sources (xlator_t *this, int32_t *sources, int32_t **pending_matrix, + struct iatt *bufs, afr_self_heal_type type, + int32_t *success_children, int32_t *subvol_status); int -afr_sh_delta_to_xattr (afr_private_t *priv, +afr_sh_delta_to_xattr (xlator_t *this, int32_t *delta_matrix[], dict_t *xattr[], int child_count, afr_transaction_type type); -int -afr_sh_is_matrix_zero (int32_t *pending_matrix[], int child_count); - void afr_self_heal_type_str_get (afr_self_heal_t *self_heal_p, char *str, size_t size); +afr_self_heal_type +afr_self_heal_type_for_transaction (afr_transaction_type type); + +int +afr_build_sources (xlator_t *this, dict_t **xattr, struct iatt *bufs, + int32_t **pending_matrix, int32_t *sources, + int32_t *success_children, afr_transaction_type type, + int32_t *subvol_status, gf_boolean_t ignore_ignorant); +void +afr_sh_common_reset (afr_self_heal_t *sh, unsigned int child_count); + +void +afr_sh_common_lookup_resp_handler (call_frame_t *frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct iatt *buf, + dict_t *xattr, struct iatt *postparent, + loc_t *loc); + +int +afr_sh_common_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, + afr_lookup_done_cbk_t lookup_cbk, uuid_t uuid, + int32_t flags, dict_t *xdata); +int +afr_sh_entry_expunge_remove (call_frame_t *expunge_frame, xlator_t *this, + int active_src, struct iatt *buf, + struct iatt *parentbuf); +int +afr_sh_entrylk (call_frame_t *frame, xlator_t *this, loc_t *loc, + char *base_name, afr_lock_cbk_t lock_cbk); +int +afr_sh_entry_impunge_create (call_frame_t *impunge_frame, xlator_t *this, + int child_index); +int +afr_sh_data_unlock (call_frame_t *frame, xlator_t *this, char *dom, + afr_lock_cbk_t lock_cbk); +afr_local_t * +afr_self_heal_local_init (afr_local_t *l, xlator_t *this); +int +afr_sh_data_lock (call_frame_t *frame, xlator_t *this, + off_t start, off_t len, gf_boolean_t block, char *dom, + afr_lock_cbk_t success_handler, + afr_lock_cbk_t failure_handler); +void +afr_sh_set_error (afr_self_heal_t *sh, int32_t op_errno); +void +afr_sh_mark_source_sinks (call_frame_t *frame, xlator_t *this); +typedef int +(*afr_fxattrop_cbk_t) (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *xattr, dict_t *xdata); +int +afr_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name); +int +afr_impunge_frame_create (call_frame_t *frame, xlator_t *this, + int active_source, call_frame_t **impunge_frame); +void +afr_sh_reset (call_frame_t *frame, xlator_t *this); + +void +afr_children_intersection_get (int32_t *set1, int32_t *set2, + int *intersection, unsigned int child_count); +int +afr_get_no_xattr_dir_read_child (xlator_t *this, int32_t *success_children, + struct iatt *bufs); +int +afr_sh_erase_pending (call_frame_t *frame, xlator_t *this, + afr_transaction_type type, afr_fxattrop_cbk_t cbk, + int (*finish)(call_frame_t *frame, xlator_t *this)); + +void +afr_set_local_for_unhealable (afr_local_t *local); + +int +is_self_heal_failed (afr_self_heal_t *sh, afr_sh_fail_check_type type); + +void +afr_set_self_heal_status (afr_self_heal_t *sh, afr_self_heal_status status); + +void +afr_log_self_heal_completion_status (afr_local_t *local, gf_loglevel_t logl); + +char* +afr_get_pending_matrix_str (int32_t *pending_matrix[], xlator_t *this); #endif /* __AFR_SELF_HEAL_COMMON_H__ */ diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index 077738169..9de26ee56 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2009 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #include <libgen.h> @@ -49,421 +40,476 @@ #include "afr-self-heal-common.h" #include "afr-self-heal-algorithm.h" - int -afr_sh_data_done (call_frame_t *frame, xlator_t *this) +afr_sh_data_fail (call_frame_t *frame, xlator_t *this); + +static inline gf_boolean_t +afr_sh_data_proceed (unsigned int success_count) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; + return (success_count >= AFR_SH_MIN_PARTICIPANTS); +} - local = frame->local; - sh = &local->self_heal; - priv = this->private; +extern int +sh_loop_finish (call_frame_t *loop_frame, xlator_t *this); - /* - TODO: cleanup sh->* - */ +int +afr_post_sh_big_lock_success (call_frame_t *frame, xlator_t *this); - if (sh->healing_fd && !sh->healing_fd_opened) { - /* unref only if we created the fd ourselves */ +int +afr_post_sh_big_lock_failure (call_frame_t *frame, xlator_t *this); - fd_unref (sh->healing_fd); - sh->healing_fd = NULL; - } +int +afr_sh_data_finish (call_frame_t *frame, xlator_t *this); -/* for (i = 0; i < priv->child_count; i++) */ -/* sh->locked_nodes[i] = 0; */ +int +afr_sh_data_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; - gf_log (this->name, GF_LOG_TRACE, - "self heal of %s completed", - local->loc.path); + local = frame->local; + sh = &local->self_heal; - sh->completion_cbk (frame, this); + sh->completion_cbk (frame, this); - return 0; + return 0; } int afr_sh_data_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - - int child_index = (long) cookie; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int child_index = (long) cookie; - local = frame->local; - priv = this->private; + local = frame->local; + priv = this->private; - LOCK (&frame->lock); - { + LOCK (&frame->lock); + { if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "flush or setattr failed on %s on subvolume %s: %s", + gf_log (this->name, GF_LOG_ERROR, + "flush failed on %s on subvolume %s: %s", local->loc.path, priv->children[child_index]->name, strerror (op_errno)); } - } - UNLOCK (&frame->lock); + } + UNLOCK (&frame->lock); - call_count = afr_frame_return (frame); + call_count = afr_frame_return (frame); - if (call_count == 0) { - afr_sh_data_done (frame, this); - } + if (call_count == 0) { + afr_sh_data_done (frame, this); + } - return 0; + return 0; } - int -afr_sh_data_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *statpre, struct iatt *statpost) +afr_sh_data_close (call_frame_t *frame, xlator_t *this) { - afr_sh_data_flush_cbk (frame, cookie, this, op_ret, op_errno); + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_self_heal_t *sh = NULL; + int i = 0; + int call_count = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + if (!sh->healing_fd) { + //This happens when file is non-reg + afr_sh_data_done (frame, this); + return 0; + } + call_count = afr_set_elem_count_get (sh->success, + priv->child_count); + local->call_count = call_count; + + if (call_count == 0) { + afr_sh_data_done (frame, this); + return 0; + } + + for (i = 0; i < priv->child_count; i++) { + if (!sh->success[i]) + continue; + gf_log (this->name, GF_LOG_DEBUG, + "closing fd of %s on %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_sh_data_flush_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->flush, + sh->healing_fd, NULL); + + if (!--call_count) + break; + } return 0; } - int -afr_sh_data_close (call_frame_t *frame, xlator_t *this) +afr_sh_dom_unlock (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_self_heal_t *sh = NULL; - - int i = 0; - int call_count = 0; - int source = 0; - int32_t valid = 0; - - struct iatt stbuf = {0,}; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; local = frame->local; sh = &local->self_heal; priv = this->private; - source = sh->source; + if (sh->sh_dom_lock_held) + afr_sh_data_unlock (frame, this, priv->sh_domain, + afr_sh_data_close); + else + afr_sh_data_close (frame, this); + return 0; +} - valid |= (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME); +int +afr_sh_data_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *statpre, + struct iatt *statpost, dict_t *xdata) +{ - stbuf.ia_atime = sh->buf[source].ia_atime; - stbuf.ia_atime_nsec = sh->buf[source].ia_atime_nsec; - stbuf.ia_mtime = sh->buf[source].ia_mtime; - stbuf.ia_mtime_nsec = sh->buf[source].ia_mtime_nsec; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int child_index = (long) cookie; - if (sh->healing_fd_opened) { - /* not our job to close the fd */ + local = frame->local; + priv = this->private; - afr_sh_data_done (frame, this); - return 0; + LOCK (&frame->lock); + { + if (op_ret == -1) { + gf_log (this->name, GF_LOG_INFO, + "setattr failed on %s on subvolume %s: %s", + local->loc.path, priv->children[child_index]->name, + strerror (op_errno)); + } } + UNLOCK (&frame->lock); - if (!sh->healing_fd) { - afr_sh_data_done (frame, this); - return 0; - } - - call_count = (sh->active_sinks + 1) * 2; - local->call_count = call_count; + call_count = afr_frame_return (frame); - /* closed source */ - gf_log (this->name, GF_LOG_TRACE, - "closing fd of %s on %s", - local->loc.path, priv->children[sh->source]->name); - - STACK_WIND_COOKIE (frame, afr_sh_data_flush_cbk, - (void *) (long) sh->source, - priv->children[sh->source], - priv->children[sh->source]->fops->flush, - sh->healing_fd); - call_count--; + if (call_count == 0) { + afr_sh_data_finish (frame, this); + } - STACK_WIND_COOKIE (frame, afr_sh_data_setattr_cbk, - (void *) (long) sh->source, - priv->children[sh->source], - priv->children[sh->source]->fops->setattr, - &local->loc, &stbuf, valid); + return 0; +} - call_count--; +int +afr_sh_data_setattr (call_frame_t *frame, xlator_t *this, struct iatt* stbuf) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_self_heal_t *sh = NULL; + int i = 0; + int call_count = 0; + int32_t valid = 0; - if (call_count == 0) - return 0; + local = frame->local; + sh = &local->self_heal; + priv = this->private; - for (i = 0; i < priv->child_count; i++) { - if (sh->sources[i] || !local->child_up[i]) - continue; + valid = (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME); - gf_log (this->name, GF_LOG_TRACE, - "closing fd of %s on %s", - local->loc.path, priv->children[i]->name); + call_count = afr_set_elem_count_get (sh->success, + priv->child_count); + local->call_count = call_count; - STACK_WIND_COOKIE (frame, afr_sh_data_flush_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->flush, - sh->healing_fd); + if (call_count == 0) { + GF_ASSERT (0); + afr_sh_data_finish (frame, this); + return 0; + } - call_count--; + for (i = 0; i < priv->child_count; i++) { + if (!sh->success[i]) + continue; STACK_WIND_COOKIE (frame, afr_sh_data_setattr_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->setattr, - &local->loc, &stbuf, valid); + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->setattr, + &local->loc, stbuf, valid, NULL); - if (!--call_count) - break; - } + if (!--call_count) + break; + } - return 0; + return 0; } - int -afr_sh_data_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) +afr_sh_data_setattr_fstat_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + struct iatt *buf, dict_t *xdata) { - afr_local_t * local = NULL; - int call_count = 0; - int child_index = (long) cookie; - - - local = frame->local; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "locking inode of %s on child %d failed: %s", - local->loc.path, child_index, - strerror (op_errno)); - } else { - gf_log (this->name, GF_LOG_TRACE, - "inode of %s on child %d locked", - local->loc.path, child_index); - } - } - UNLOCK (&frame->lock); + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int child_index = (long) cookie; - call_count = afr_frame_return (frame); + local = frame->local; + sh = &local->self_heal; - if (call_count == 0) { - afr_sh_data_close (frame, this); - } + GF_ASSERT (sh->source == child_index); + if (op_ret != -1) { + sh->buf[child_index] = *buf; + afr_sh_data_setattr (frame, this, buf); + } else { + gf_log (this->name, GF_LOG_ERROR, "%s: Failed to set " + "time-stamps after self-heal", local->loc.path); + afr_sh_data_fail (frame, this); + } - return 0; + return 0; } +/* + * If there are any writes after the self-heal is triggered then the + * stbuf stored in local->self_heal.buf[] will be invalid so we do one more + * stat on the source and then set the [am]times + */ +int +afr_sh_set_timestamps (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_self_heal_t *sh = NULL; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + STACK_WIND_COOKIE (frame, afr_sh_data_setattr_fstat_cbk, + (void *) (long) sh->source, + priv->children[sh->source], + priv->children[sh->source]->fops->fstat, + sh->healing_fd, NULL); + return 0; +} +//Fun fact, lock_cbk is being used for both lock & unlock int -afr_sh_data_unlock (call_frame_t *frame, xlator_t *this) +afr_sh_data_unlock (call_frame_t *frame, xlator_t *this, char *dom, + afr_lock_cbk_t lock_cbk) { afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; - afr_self_heal_t *sh = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int ret = 0; local = frame->local; int_lock = &local->internal_lock; - sh = &local->self_heal; - - GF_ASSERT (!sh->data_lock_held); + sh = &local->self_heal; + priv = this->private; - int_lock->lock_cbk = afr_sh_data_close; + if (strcmp (dom, this->name) == 0) { + sh->data_lock_held = _gf_false; + } else if (strcmp (dom, priv->sh_domain) == 0) { + sh->sh_dom_lock_held = _gf_false; + } else { + ret = -1; + goto out; + } + int_lock->lock_cbk = lock_cbk; + int_lock->domain = dom; afr_unlock (frame, this); - return 0; +out: + if (ret) { + int_lock->lock_op_ret = -1; + int_lock->lock_cbk (frame, this); + } + return 0; } - int afr_sh_data_finish (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; - local = frame->local; - sh = &local->self_heal; + local = frame->local; + sh = &local->self_heal; - gf_log (this->name, GF_LOG_TRACE, - "finishing data selfheal of %s", local->loc.path); + gf_log (this->name, GF_LOG_DEBUG, + "finishing data selfheal of %s", local->loc.path); - if (!sh->data_lock_held) - afr_sh_data_unlock (frame, this); + if (sh->data_lock_held) + afr_sh_data_unlock (frame, this, this->name, afr_sh_dom_unlock); else - afr_sh_data_close (frame, this); + afr_sh_dom_unlock (frame, this); - return 0; + return 0; } - int -afr_sh_data_erase_pending_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, - int32_t op_errno, dict_t *xattr) +afr_sh_data_fail (call_frame_t *frame, xlator_t *this) { - int call_count = 0; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; - call_count = afr_frame_return (frame); + local = frame->local; + sh = &local->self_heal; - if (call_count == 0) - afr_sh_data_finish (frame, this); + gf_log (this->name, GF_LOG_DEBUG, + "finishing failed data selfheal of %s", local->loc.path); - return 0; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + afr_sh_data_finish (frame, this); + return 0; } - int -afr_sh_data_erase_pending (call_frame_t *frame, xlator_t *this) +afr_sh_data_erase_pending_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xattr, dict_t *xdata) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int i = 0; - dict_t **erase_xattr = NULL; - - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - afr_sh_pending_to_delta (priv, sh->xattr, sh->delta_matrix, sh->success, - priv->child_count, AFR_DATA_TRANSACTION); - - erase_xattr = GF_CALLOC (sizeof (*erase_xattr), priv->child_count, - gf_afr_mt_dict_t); + int call_count = 0; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int32_t child_index = (long) cookie; - for (i = 0; i < priv->child_count; i++) { - if (sh->xattr[i]) { - call_count++; - - erase_xattr[i] = get_new_dict(); - dict_ref (erase_xattr[i]); - } - } - - afr_sh_delta_to_xattr (priv, sh->delta_matrix, erase_xattr, - priv->child_count, AFR_DATA_TRANSACTION); - - local->call_count = call_count; - for (i = 0; i < priv->child_count; i++) { - if (!erase_xattr[i]) - continue; - - gf_log (this->name, GF_LOG_TRACE, - "erasing pending flags from %s on %s", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_sh_data_erase_pending_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fxattrop, - sh->healing_fd, - GF_XATTROP_ADD_ARRAY, erase_xattr[i]); - if (!--call_count) - break; - } + priv = this->private; + local = frame->local; + sh = &local->self_heal; + if (op_ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "Erasing of pending change " + "log failed on %s for subvol %s, reason: %s", + local->loc.path, priv->children[child_index]->name, + strerror (op_errno)); + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + } - for (i = 0; i < priv->child_count; i++) { - if (erase_xattr[i]) { - dict_unref (erase_xattr[i]); - } - } - GF_FREE (erase_xattr); + call_count = afr_frame_return (frame); - return 0; + if (call_count == 0) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { + if (sh->old_loop_frame) + sh_loop_finish (sh->old_loop_frame, this); + sh->old_loop_frame = NULL; + afr_sh_data_fail (frame, this); + goto out; + } + if (!IA_ISREG (sh->type)) { + afr_sh_data_finish (frame, this); + goto out; + } + GF_ASSERT (sh->old_loop_frame); + afr_sh_data_lock (frame, this, 0, 0, _gf_true, this->name, + afr_post_sh_big_lock_success, + afr_post_sh_big_lock_failure); + } +out: + return 0; } - int -afr_sh_data_trim_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) +afr_sh_data_erase_pending (call_frame_t *frame, xlator_t *this) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - int call_count = 0; - int child_index = 0; - - priv = this->private; - local = frame->local; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (op_ret == -1) - gf_log (this->name, GF_LOG_DEBUG, - "ftruncate of %s on subvolume %s failed (%s)", - local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - else - gf_log (this->name, GF_LOG_TRACE, - "ftruncate of %s on subvolume %s completed", - local->loc.path, - priv->children[child_index]->name); - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - afr_sh_data_erase_pending (frame, this); - } - - return 0; + afr_sh_erase_pending (frame, this, AFR_DATA_TRANSACTION, + afr_sh_data_erase_pending_cbk, + afr_sh_data_finish); + return 0; } - int -afr_sh_data_trim_sinks (call_frame_t *frame, xlator_t *this) +afr_sh_data_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_self_heal_t *sh = NULL; - int *sources = NULL; - int call_count = 0; - int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_self_heal_t *sh = NULL; + int call_count = 0; + int child_index = (long) cookie; + local = frame->local; + priv = this->private; + sh = &local->self_heal; + + if (op_ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "%s: Failed to fsync on " + "%s - %s", local->loc.path, + priv->children[child_index]->name, strerror (op_errno)); + LOCK (&frame->lock); + { + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + } + UNLOCK (&frame->lock); + if (sh->old_loop_frame) + sh_loop_finish (sh->old_loop_frame, this); + sh->old_loop_frame = NULL; + } - priv = this->private; - local = frame->local; - sh = &local->self_heal; + call_count = afr_frame_return (frame); + if (call_count == 0) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) + afr_sh_data_fail (frame, this); + else + afr_sh_data_erase_pending (frame, this); + } + return 0; +} - sources = sh->sources; - call_count = sh->active_sinks; +/* + * Before erasing xattrs, make sure the data is written to disk + */ +int +afr_sh_data_fsync (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_self_heal_t *sh = NULL; + int i = 0; + int call_count = 0; - local->call_count = call_count; + local = frame->local; + priv = this->private; + sh = &local->self_heal; - for (i = 0; i < priv->child_count; i++) { - if (sources[i] || !local->child_up[i]) - continue; + call_count = sh->active_sinks; + if (call_count == 0) { + afr_sh_data_erase_pending (frame, this); + return 0; + } - STACK_WIND_COOKIE (frame, afr_sh_data_trim_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->ftruncate, - sh->healing_fd, sh->file_size); + local->call_count = call_count; + for (i = 0; i < priv->child_count; i++) { + if (!sh->success[i] || sh->sources[i]) + continue; - if (!--call_count) - break; - } + STACK_WIND_COOKIE (frame, afr_sh_data_fsync_cbk, + (void *) (long) i, priv->children[i], + priv->children[i]->fops->fsync, + sh->healing_fd, 1, NULL); + } - return 0; + return 0; } - static struct afr_sh_algorithm * sh_algo_from_name (xlator_t *this, char *name) { int i = 0; + if (name == NULL) + goto out; + while (afr_self_heal_algorithms[i].name) { if (!strcmp (name, afr_self_heal_algorithms[i].name)) { return &afr_self_heal_algorithms[i]; @@ -472,17 +518,22 @@ sh_algo_from_name (xlator_t *this, char *name) i++; } +out: return NULL; } static int -sh_zero_byte_files_exist (afr_self_heal_t *sh, int child_count) +sh_zero_byte_files_exist (afr_local_t *local, int child_count) { - int i; - int ret = 0; + int i = 0; + int ret = 0; + afr_self_heal_t *sh = NULL; + sh = &local->self_heal; for (i = 0; i < child_count; i++) { + if (!local->child_up[i] || sh->child_errno[i]) + continue; if (sh->buf[i].ia_size == 0) { ret = 1; break; @@ -509,9 +560,9 @@ afr_sh_data_pick_algo (call_frame_t *frame, xlator_t *this) if (algo == NULL) { /* option not set, so fall back on heuristics */ - if ((local->enoent_count != 0) - || sh_zero_byte_files_exist (sh, priv->child_count) - || (sh->file_size <= (priv->data_self_heal_window_size * this->ctx->page_size))) { + if (sh_zero_byte_files_exist (local, priv->child_count) + || (sh->file_size <= (priv->data_self_heal_window_size * + this->ctx->page_size))) { /* * If the file does not exist on one of the subvolumes, @@ -540,398 +591,860 @@ afr_sh_data_pick_algo (call_frame_t *frame, xlator_t *this) int afr_sh_data_sync_prepare (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int active_sinks = 0; - int source = 0; - int i = 0; - + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; struct afr_sh_algorithm *sh_algo = NULL; - local = frame->local; - sh = &local->self_heal; - priv = this->private; + local = frame->local; + sh = &local->self_heal; - source = sh->source; + sh->algo_completion_cbk = afr_sh_data_fsync; + sh->algo_abort_cbk = afr_sh_data_fail; - for (i = 0; i < priv->child_count; i++) { - if (sh->sources[i] == 0 && local->child_up[i] == 1) { - active_sinks++; - sh->success[i] = 1; - } - } - sh->success[source] = 1; - - if (active_sinks == 0) { - gf_log (this->name, GF_LOG_TRACE, - "no active sinks for performing self-heal on file %s", - local->loc.path); - afr_sh_data_finish (frame, this); - return 0; - } - sh->active_sinks = active_sinks; + sh_algo = afr_sh_data_pick_algo (frame, this); - gf_log (this->name, GF_LOG_DEBUG, - "self-healing file %s from subvolume %s to %d other", - local->loc.path, priv->children[source]->name, active_sinks); + sh->algo = sh_algo; + sh_algo->fn (frame, this); - sh->algo_completion_cbk = afr_sh_data_trim_sinks; - sh->algo_abort_cbk = afr_sh_data_finish; + return 0; +} - sh_algo = afr_sh_data_pick_algo (frame, this); +int +afr_sh_data_trim_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + int call_count = 0; + int child_index = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; - sh_algo->fn (frame, this); + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + child_index = (long) cookie; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "ftruncate of %s on subvolume %s failed (%s)", + local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "ftruncate of %s on subvolume %s completed", + local->loc.path, + priv->children[child_index]->name); + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) + afr_sh_data_fail (frame, this); + else + afr_sh_data_sync_prepare (frame, this); + } - return 0; + return 0; } int -afr_sh_data_fix (call_frame_t *frame, xlator_t *this) +afr_sh_data_trim_sinks (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_local_t * orig_local = NULL; + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + afr_self_heal_t *sh = NULL; + int *sources = NULL; + int call_count = 0; + int i = 0; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int nsources = 0; - int source = 0; - int i = 0; - local = frame->local; - sh = &local->self_heal; - priv = this->private; + priv = this->private; + local = frame->local; + sh = &local->self_heal; - afr_sh_build_pending_matrix (priv, sh->pending_matrix, sh->xattr, - priv->child_count, AFR_DATA_TRANSACTION); + sources = sh->sources; + call_count = sh->active_sinks; - afr_sh_print_pending_matrix (sh->pending_matrix, this); + local->call_count = call_count; - nsources = afr_sh_mark_sources (sh, priv->child_count, - AFR_SELF_HEAL_DATA); + for (i = 0; i < priv->child_count; i++) { + if (sources[i] || !local->child_up[i]) + continue; - afr_sh_supress_errenous_children (sh->sources, sh->child_errno, - priv->child_count); + STACK_WIND_COOKIE (frame, afr_sh_data_trim_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->ftruncate, + sh->healing_fd, sh->file_size, + NULL); - if (nsources == 0) { - gf_log (this->name, GF_LOG_TRACE, - "No self-heal needed for %s", - local->loc.path); + if (!--call_count) + break; + } - afr_sh_data_finish (frame, this); - return 0; + return 0; +} + +int +afr_sh_inode_set_read_ctx (afr_self_heal_t *sh, xlator_t *this) +{ + afr_private_t *priv = NULL; + int ret = 0; + int i = 0; + + priv = this->private; + sh->source = afr_sh_select_source (sh->sources, priv->child_count); + if (sh->source < 0) { + ret = -1; + goto out; } - if ((nsources == -1) - && (priv->favorite_child != -1) - && (sh->child_errno[priv->favorite_child] == 0)) { + /* detect changes not visible through pending flags -- JIC */ + for (i = 0; i < priv->child_count; i++) { + if (i == sh->source || sh->child_errno[i]) + continue; + + if (SIZE_DIFFERS (&sh->buf[i], &sh->buf[sh->source])) + sh->sources[i] = 0; + } - gf_log (this->name, GF_LOG_DEBUG, - "Picking favorite child %s as authentic source to resolve conflicting data of %s", - priv->children[priv->favorite_child]->name, - local->loc.path); + afr_reset_children (sh->fresh_children, priv->child_count); + afr_get_fresh_children (sh->success_children, sh->sources, + sh->fresh_children, priv->child_count); + afr_inode_set_read_ctx (this, sh->inode, sh->source, + sh->fresh_children); +out: + return ret; +} - sh->sources[priv->favorite_child] = 1; +char* +afr_get_sizes_str (afr_local_t *local, struct iatt *bufs, xlator_t *this) +{ + afr_private_t *priv = NULL; + int i = 0; + char num[1024] = {0}; + size_t len = 0; + char *sizes_str = NULL; + size_t off = 0; + char *fmt_str = "%llu bytes on %s, "; + char *child_down = " %s,"; + char *child_unknown = " %s,"; + int down_child_present = 0; + int down_count = 0; + int unknown_count = 0; + int unknown_child_present = 0; + char *down_subvol_1 = " down subvolume is "; + char *unknown_subvol_1 = " unknown subvolume is "; + char *down_subvol_2 = " down subvolumes are "; + char *unknown_subvol_2 = " unknown subvolumes are "; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i] == 1) { + len += snprintf (num, sizeof (num), fmt_str, + (unsigned long long) bufs[i].ia_size, + priv->children[i]->name); + } else if (local->child_up[i] == 0) { + len += snprintf (num, sizeof (num), child_down, + priv->children[i]->name); + if (!down_child_present) + down_child_present = 1; + down_count ++; + } else if (local->child_up[i] == -1) { + len += snprintf (num, sizeof (num), child_unknown, + priv->children[i]->name); + if (!unknown_child_present) + unknown_child_present = 1; + unknown_count++; + } - nsources = afr_sh_source_count (sh->sources, - priv->child_count); - } + } - if (nsources == -1) { - gf_log (this->name, GF_LOG_ERROR, - "Unable to self-heal contents of '%s' (possible split-brain). " - "Please delete the file from all but the preferred " - "subvolume.", local->loc.path); + if (down_child_present) { + if (down_count > 1) + len += snprintf (num, sizeof (num), "%s", + down_subvol_2); + else + len += snprintf (num, sizeof (num), "%s", + down_subvol_1); + } + if (unknown_child_present) { + if (unknown_count > 1) + len += snprintf (num, sizeof (num), "%s", + unknown_subvol_2); + else + len += snprintf (num, sizeof (num), "%s", + unknown_subvol_1); + } - local->govinda_gOvinda = 1; + len++;//for '\0' - afr_sh_data_finish (frame, this); - return 0; - } + sizes_str = GF_CALLOC (len, sizeof (char), gf_common_mt_char); - source = afr_sh_select_source (sh->sources, priv->child_count); - - if (source == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "No active sources found."); + if (!sizes_str) + return NULL; - afr_sh_data_finish (frame, this); - return 0; + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i] == 1) { + off += snprintf (sizes_str + off, len - off, fmt_str, + (unsigned long long) bufs[i].ia_size, + priv->children[i]->name); + } } - sh->source = source; - sh->block_size = 65536; - sh->file_size = sh->buf[source].ia_size; + if (down_child_present) { + if (down_count > 1) { + off += snprintf (sizes_str + off, len - off, "%s", + down_subvol_2); + } else { + off += snprintf (sizes_str + off, len - off, "%s", + down_subvol_1); + } + } - if (FILE_HAS_HOLES (&sh->buf[source])) - sh->file_has_holes = 1; + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i] == 0) { + off += snprintf (sizes_str + off, len - off, child_down, + priv->children[i]->name); + } + } - orig_local = sh->orig_frame->local; - orig_local->cont.lookup.buf.ia_size = sh->buf[source].ia_size; + if (unknown_child_present) { + if (unknown_count > 1) { + off += snprintf (sizes_str + off, len - off, "%s", + unknown_subvol_2); + } else { + off += snprintf (sizes_str + off, len - off, "%s", + unknown_subvol_1); + } + } - /* detect changes not visible through pending flags -- JIC */ - for (i = 0; i < priv->child_count; i++) { - if (i == source || sh->child_errno[i]) - continue; + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i] == -1) { + off += snprintf (sizes_str + off, len - off, + child_unknown, + priv->children[i]->name); - if (SIZE_DIFFERS (&sh->buf[i], &sh->buf[source])) - sh->sources[i] = 0; - } + } + } - afr_set_read_child (this, local->loc.inode, sh->source); + return sizes_str; +} - /* - quick-read might have read the file, so send xattr from - the source subvolume (http://bugs.gluster.com/cgi-bin/bugzilla3/show_bug.cgi?id=815) - */ +char* +afr_get_sinks_str (xlator_t *this, afr_local_t *local, afr_self_heal_t *sh) +{ + afr_private_t *priv = NULL; + int i = 0; + char num[1024] = {0}; + size_t len = 0; + char *sinks_str = NULL; + char *temp_str = " to sinks "; + char *str_format = " %s,"; + char off = 0; + + priv = this->private; + + len += snprintf (num, sizeof (num), "%s", temp_str); + for (i = 0; i < priv->child_count; i++) { + if ((sh->sources[i] == 0) && (local->child_up[i] == 1)) { + len += snprintf (num, sizeof (num), str_format, + priv->children[i]->name); + } + } - dict_unref (orig_local->cont.lookup.xattr); - if (orig_local->cont.lookup.xattrs) - orig_local->cont.lookup.xattr = dict_ref (orig_local->cont.lookup.xattrs[sh->source]); + len ++; - if (sh->background) { - sh->unwind (sh->orig_frame, this); - sh->unwound = _gf_true; + sinks_str = GF_CALLOC (len, sizeof (char), gf_common_mt_char); + + if (!sinks_str) + return NULL; + + off += snprintf (sinks_str + off, len - off, "%s", temp_str); + + for (i = 0; i < priv->child_count; i++) { + if ((sh->sources[i] == 0) && (local->child_up[i] == 1)) { + off += snprintf (sinks_str + off, len - off, + str_format, + priv->children[i]->name); + } } - afr_sh_data_sync_prepare (frame, this); + return sinks_str; - return 0; } +void +afr_set_data_sh_info_str (afr_local_t *local, afr_self_heal_t *sh, xlator_t *this) +{ + char *pending_matrix_str = NULL; + char *sizes_str = NULL; + char *sinks_str = NULL; + afr_private_t *priv = NULL; + + priv = this->private; + + pending_matrix_str = afr_get_pending_matrix_str (sh->pending_matrix, + this); + if (!pending_matrix_str) + pending_matrix_str = ""; + + sizes_str = afr_get_sizes_str (local, sh->buf, this); + if (!sizes_str) + sizes_str = ""; + + sinks_str = afr_get_sinks_str (this, local, sh); + if (!sinks_str) + sinks_str = ""; + + gf_asprintf (&sh->data_sh_info, " data self heal from %s %s with " + "%s data %s", priv->children[sh->source]->name, sinks_str, + sizes_str, pending_matrix_str); + + if (pending_matrix_str && strcmp (pending_matrix_str, "")) + GF_FREE (pending_matrix_str); + + if (sizes_str && strcmp (sizes_str, "")) + GF_FREE (sizes_str); +} + +void +afr_sh_data_fix (call_frame_t *frame, xlator_t *this) +{ + int source = 0; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + source = sh->source; + sh->block_size = this->ctx->page_size; + sh->file_size = sh->buf[source].ia_size; + + if (FILE_HAS_HOLES (&sh->buf[source])) + sh->file_has_holes = 1; + + if (sh->background && sh->unwind && !sh->unwound) { + sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno, + is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)); + sh->unwound = _gf_true; + } + + afr_sh_mark_source_sinks (frame, this); + if (sh->active_sinks == 0) { + gf_log (this->name, GF_LOG_INFO, + "no active sinks for performing self-heal on file %s", + local->loc.path); + afr_sh_data_finish (frame, this); + return; + } + + gf_log (this->name, GF_LOG_DEBUG, + "self-healing file %s from subvolume %s to %d other", + local->loc.path, priv->children[sh->source]->name, + sh->active_sinks); + + sh->actual_sh_started = _gf_true; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_SYNC_BEGIN); + afr_sh_data_trim_sinks (frame, this); +} + int -afr_self_heal_get_source (xlator_t *this, afr_local_t *local, dict_t **xattr) +afr_sh_data_fxattrop_fstat_done (call_frame_t *frame, xlator_t *this) { - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int nsources = 0; + int ret = 0; + int *old_sources = NULL; + int tstamp_source = 0; + int i = 0; - int source = 0; - int i = 0; + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + gf_log (this->name, GF_LOG_DEBUG, "Pending matrix for: %s", + lkowner_utoa (&frame->root->lk_owner)); + if (sh->sync_done) { + //store sources before sync so that mtime can be set using the + //iatt buf from one of them. + old_sources = alloca (priv->child_count*sizeof (*old_sources)); + memcpy (old_sources, sh->sources, + priv->child_count * sizeof (*old_sources)); + } - sh = &local->self_heal; - priv = this->private; + nsources = afr_build_sources (this, sh->xattr, sh->buf, sh->pending_matrix, + sh->sources, sh->success_children, + AFR_DATA_TRANSACTION, NULL, _gf_true); + if ((nsources == -1) + && (priv->favorite_child != -1) + && (sh->child_errno[priv->favorite_child] == 0)) { - sh->pending_matrix = GF_CALLOC (sizeof (int32_t *), priv->child_count, - gf_afr_mt_int32_t); - for (i = 0; i < priv->child_count; i++) { - sh->pending_matrix[i] = GF_CALLOC (sizeof (int32_t), - priv->child_count, - gf_afr_mt_int32_t); - } + gf_log (this->name, GF_LOG_DEBUG, + "Picking favorite child %s as authentic source to " + "resolve conflicting data of %s", + priv->children[priv->favorite_child]->name, + local->loc.path); + + sh->sources[priv->favorite_child] = 1; + + nsources = afr_sh_source_count (sh->sources, + priv->child_count); + } + + if (nsources == -1) { + afr_sh_print_split_brain_log (sh->pending_matrix, this, + local->loc.path); + afr_set_split_brain (this, sh->inode, DONT_KNOW, SPB); - sh->sources = GF_CALLOC (priv->child_count, sizeof (*sh->sources), - gf_afr_mt_int32_t); + afr_sh_data_fail (frame, this); + return 0; + } + + afr_set_split_brain (this, sh->inode, DONT_KNOW, NO_SPB); + + ret = afr_sh_inode_set_read_ctx (sh, this); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, + "No active sources found."); - afr_sh_build_pending_matrix (priv, sh->pending_matrix, xattr, - priv->child_count, AFR_DATA_TRANSACTION); + afr_sh_data_fail (frame, this); + return 0; + } - (void)afr_sh_mark_sources (sh, priv->child_count, AFR_SELF_HEAL_DATA); + if (sh->sync_done) { + /* Perform setattr from one of the old_sources if possible + * Because only they have the correct mtime, the new sources + * (i.e. old sinks) have mtime from last writev in sync. + */ + tstamp_source = sh->source; + for (i = 0; i < priv->child_count; i++) { + if (old_sources[i] && sh->sources[i]) + tstamp_source = i; + } + afr_sh_data_setattr (frame, this, &sh->buf[tstamp_source]); + } else { + afr_set_data_sh_info_str (local, sh, this); + if (nsources == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "No self-heal needed for %s", + local->loc.path); - source = afr_sh_select_source (sh->sources, priv->child_count); + afr_sh_data_finish (frame, this); + return 0; + } - return source; + if (sh->do_data_self_heal && + afr_data_self_heal_enabled (priv->data_self_heal)) + afr_sh_data_fix (frame, this); + else + afr_sh_data_finish (frame, this); + } + return 0; } +int +afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local, + dict_t **xattr, + afr_transaction_type txn_type, + uuid_t gfid) +{ + afr_private_t *priv = NULL; + int read_child = -1; + int32_t **pending_matrix = NULL; + int32_t *sources = NULL; + int32_t *success_children = NULL; + struct iatt *bufs = NULL; + int32_t nsources = 0; + int32_t prev_read_child = -1; + int32_t config_read_child = -1; + int32_t subvol_status = 0; + + priv = this->private; + bufs = local->cont.lookup.bufs; + success_children = local->cont.lookup.success_children; + + pending_matrix = local->cont.lookup.pending_matrix; + sources = local->cont.lookup.sources; + memset (sources, 0, sizeof (*sources) * priv->child_count); + + nsources = afr_build_sources (this, xattr, bufs, pending_matrix, + sources, success_children, txn_type, + &subvol_status, _gf_false); + if (subvol_status & SPLIT_BRAIN) { + gf_log (this->name, GF_LOG_DEBUG, "%s: Possible split-brain", + local->loc.path); + switch (txn_type) { + case AFR_DATA_TRANSACTION: + local->cont.lookup.possible_spb = _gf_true; + nsources = 1; + sources[success_children[0]] = 1; + break; + case AFR_ENTRY_TRANSACTION: + read_child = afr_get_no_xattr_dir_read_child (this, + success_children, + bufs); + sources[read_child] = 1; + nsources = 1; + break; + default: + break; + } + } + if (nsources < 0) + goto out; + + prev_read_child = local->read_child_index; + config_read_child = priv->read_child; + read_child = afr_select_read_child_from_policy (success_children, + priv->child_count, + prev_read_child, + config_read_child, + sources, + priv->hash_mode, gfid); +out: + gf_log (this->name, GF_LOG_DEBUG, "returning read_child: %d", + read_child); + return read_child; +} int afr_sh_data_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iatt *buf) + struct iatt *buf, dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - int call_count = -1; - int child_index = (long) cookie; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int call_count = -1; + int child_index = (long) cookie; - local = frame->local; - sh = &local->self_heal; - priv = this->private; + local = frame->local; + sh = &local->self_heal; + priv = this->private; - LOCK (&frame->lock); - { - if (op_ret != -1) { + LOCK (&frame->lock); + { + if (op_ret != -1) { gf_log (this->name, GF_LOG_TRACE, "fstat of %s on %s succeeded", local->loc.path, priv->children[child_index]->name); - sh->buf[child_index] = *buf; - } - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - afr_sh_data_fix (frame, this); - } - - return 0; + sh->buf[child_index] = *buf; + sh->success_children[sh->success_count] = child_index; + sh->success_count++; + } else { + gf_log (this->name, GF_LOG_ERROR, "%s: fstat failed " + "on %s, reason %s", local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + sh->child_errno[child_index] = op_errno; + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + /* Previous versions of glusterfs might have set + * the pending data xattrs which need to be erased + */ + if (!afr_sh_data_proceed (sh->success_count)) { + gf_log (this->name, GF_LOG_ERROR, "inspecting metadata " + "succeeded on < %d children, aborting " + "self-heal for %s", AFR_SH_MIN_PARTICIPANTS, + local->loc.path); + afr_sh_data_fail (frame, this); + goto out; + } + afr_sh_data_fxattrop_fstat_done (frame, this); + } +out: + return 0; } int afr_sh_data_fstat (call_frame_t *frame, xlator_t *this) { - afr_self_heal_t *sh = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + afr_self_heal_t *sh = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int i = 0; + int child = 0; + int32_t *fstat_children = NULL; - int call_count = 0; - int i = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - call_count = afr_up_children_count (priv->child_count, - local->child_up); - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_sh_data_fstat_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fstat, - sh->healing_fd); - - if (!--call_count) - break; - } - } + priv = this->private; + local = frame->local; + sh = &local->self_heal; - return 0; + fstat_children = memdup (sh->success_children, + sizeof (*fstat_children) * priv->child_count); + if (!fstat_children) { + afr_sh_data_fail (frame, this); + goto out; + } + call_count = sh->success_count; + local->call_count = call_count; + + memset (sh->buf, 0, sizeof (*sh->buf) * priv->child_count); + afr_reset_children (sh->success_children, priv->child_count); + sh->success_count = 0; + for (i = 0; i < priv->child_count; i++) { + child = fstat_children[i]; + if (child == -1) + break; + STACK_WIND_COOKIE (frame, afr_sh_data_fstat_cbk, + (void *) (long) child, + priv->children[child], + priv->children[child]->fops->fstat, + sh->healing_fd, NULL); + --call_count; + } + GF_ASSERT (!call_count); +out: + GF_FREE (fstat_children); + return 0; } - -int -afr_sh_data_fxattrop_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *xattr) +void +afr_sh_common_fxattrop_resp_handler (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xattr) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - int call_count = -1; - int child_index = (long) cookie; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int child_index = (long) cookie; - local = frame->local; - sh = &local->self_heal; - priv = this->private; + local = frame->local; + sh = &local->self_heal; + priv = this->private; - LOCK (&frame->lock); - { - if (op_ret != -1) { + LOCK (&frame->lock); + { + if (op_ret != -1) { gf_log (this->name, GF_LOG_TRACE, "fxattrop of %s on %s succeeded", local->loc.path, priv->children[child_index]->name); - sh->xattr[child_index] = dict_ref (xattr); - } - } - UNLOCK (&frame->lock); + sh->xattr[child_index] = dict_ref (xattr); + sh->success_children[sh->success_count] = child_index; + sh->success_count++; + } else { + gf_log (this->name, GF_LOG_ERROR, "fxattrop of %s " + "failed on %s, reason %s", local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + sh->child_errno[child_index] = op_errno; + } + } + UNLOCK (&frame->lock); +} - call_count = afr_frame_return (frame); +int +afr_sh_data_fxattrop_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *xattr, dict_t *xdata) +{ + int call_count = -1; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; - if (call_count == 0) { - afr_sh_data_fstat (frame, this); - } + local = frame->local; + sh = &local->self_heal; - return 0; + afr_sh_common_fxattrop_resp_handler (frame, cookie, this, op_ret, + op_errno, xattr); + + call_count = afr_frame_return (frame); + if (call_count == 0) { + if (!afr_sh_data_proceed (sh->success_count)) { + gf_log (this->name, GF_LOG_ERROR, "%s, inspecting " + "change log succeeded on < %d children", + local->loc.path, AFR_SH_MIN_PARTICIPANTS); + afr_sh_data_fail (frame, this); + goto out; + } + afr_sh_data_fstat (frame, this); + } +out: + return 0; } int afr_sh_data_fxattrop (call_frame_t *frame, xlator_t *this) { - afr_self_heal_t *sh = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - dict_t *xattr_req = NULL; + afr_self_heal_t *sh = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + dict_t **xattr_req; + int32_t *zero_pending = NULL; + int call_count = 0; + int i = 0; + int ret = 0; + int j; - int32_t zero_pending[3] = {0, 0, 0}; + priv = this->private; + local = frame->local; + sh = &local->self_heal; - int call_count = 0; - int i = 0; - int ret = 0; + call_count = afr_up_children_count (local->child_up, + priv->child_count); - priv = this->private; - local = frame->local; - sh = &local->self_heal; + local->call_count = call_count; - call_count = afr_up_children_count (priv->child_count, - local->child_up); + xattr_req = GF_CALLOC(priv->child_count, sizeof(struct dict_t *), + gf_afr_mt_dict_t); + if (!xattr_req) + goto out; - local->call_count = call_count; + for (i = 0; i < priv->child_count; i++) { + xattr_req[i] = dict_new(); + if (!xattr_req[i]) { + ret = -1; + goto out; + } + } - xattr_req = dict_new(); - if (xattr_req) { - for (i = 0; i < priv->child_count; i++) { - ret = dict_set_static_bin (xattr_req, priv->pending_key[i], - zero_pending, 3 * sizeof(int32_t)); - if (ret < 0) + for (i = 0; i < priv->child_count; i++) { + for (j = 0; j < priv->child_count; j++) { + zero_pending = GF_CALLOC (3, sizeof (*zero_pending), + gf_afr_mt_int32_t); + if (!zero_pending) { + ret = -1; + goto out; + } + ret = dict_set_dynptr (xattr_req[i], priv->pending_key[j], + zero_pending, + 3 * sizeof (*zero_pending)); + if (ret < 0) { gf_log (this->name, GF_LOG_WARNING, "Unable to set dict value"); + goto out; + } else { + zero_pending = NULL; + } + } + } + + afr_reset_xattr (sh->xattr, priv->child_count); + afr_reset_children (sh->success_children, priv->child_count); + memset (sh->child_errno, 0, + sizeof (*sh->child_errno) * priv->child_count); + sh->success_count = 0; + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_sh_data_fxattrop_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->fxattrop, + sh->healing_fd, GF_XATTROP_ADD_ARRAY, + xattr_req[i], NULL); + + if (!--call_count) + break; } } - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_sh_data_fxattrop_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fxattrop, - sh->healing_fd, GF_XATTROP_ADD_ARRAY, - xattr_req); - - if (!--call_count) - break; - } +out: + if (xattr_req) { + for (i = 0; i < priv->child_count; i++) + if (xattr_req[i]) + dict_unref(xattr_req[i]); + GF_FREE(xattr_req); } - if (xattr_req) - dict_unref (xattr_req); + if (ret) { + GF_FREE (zero_pending); + afr_sh_data_fail (frame, this); + } - return 0; + return 0; } +int +afr_sh_data_big_lock_success (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + + local = frame->local; + sh = &local->self_heal; + + sh->data_lock_held = _gf_true; + afr_sh_data_fxattrop (frame, this); + return 0; +} int -afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this); +afr_sh_dom_lock_success (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + + local = frame->local; + sh = &local->self_heal; + + sh->sh_dom_lock_held = _gf_true; + afr_sh_data_lock (frame, this, 0, 0, _gf_true, this->name, + afr_sh_data_big_lock_success, + afr_sh_data_fail); + return 0; +} int -afr_sh_data_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) +afr_sh_data_post_blocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) { afr_internal_lock_t *int_lock = NULL; afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; local = frame->local; int_lock = &local->internal_lock; + sh = &local->self_heal; if (int_lock->lock_op_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "Non Blocking inodelks failed."); - afr_sh_data_done (frame, this); + gf_log (this->name, GF_LOG_ERROR, "Blocking data inodelks " + "failed for %s. by %s", + local->loc.path, lkowner_utoa (&frame->root->lk_owner)); + + sh->data_lock_failure_handler (frame, this); } else { - gf_log (this->name, GF_LOG_DEBUG, - "Non Blocking inodelks done. Proceeding to FOP"); - afr_sh_data_fxattrop (frame, this); + gf_log (this->name, GF_LOG_DEBUG, "Blocking data inodelks " + "done for %s by %s. Proceding to self-heal", + local->loc.path, lkowner_utoa (&frame->root->lk_owner)); + + sh->data_lock_success_handler (frame, this); } return 0; } int -afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this) +afr_sh_data_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) { afr_internal_lock_t *int_lock = NULL; afr_local_t *local = NULL; @@ -941,173 +1454,301 @@ afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this) int_lock = &local->internal_lock; sh = &local->self_heal; + if (int_lock->lock_op_ret < 0) { + gf_log (this->name, GF_LOG_DEBUG, "Non Blocking data inodelks " + "failed for %s. by %s", + local->loc.path, lkowner_utoa (&frame->root->lk_owner)); + + if (!sh->data_lock_block) { + sh->data_lock_failure_handler(frame, this); + } else { + int_lock->lock_cbk = + afr_sh_data_post_blocking_inodelk_cbk; + afr_blocking_lock (frame, this); + } + } else { + + gf_log (this->name, GF_LOG_DEBUG, "Non Blocking data inodelks " + "done for %s by %s. Proceeding to self-heal", + local->loc.path, lkowner_utoa (&frame->root->lk_owner)); + sh->data_lock_success_handler (frame, this); + } + + return 0; +} + +int +afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this, char *dom, + off_t start, off_t len) +{ + afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; + afr_local_t *local = NULL; + + local = frame->local; + int_lock = &local->internal_lock; + int_lock->transaction_lk_type = AFR_SELFHEAL_LK; int_lock->selfheal_lk_type = AFR_DATA_SELF_HEAL_LK; afr_set_lock_number (frame, this); - int_lock->lk_flock.l_start = 0; - int_lock->lk_flock.l_len = 0; - int_lock->lk_flock.l_type = F_WRLCK; int_lock->lock_cbk = afr_sh_data_post_nonblocking_inodelk_cbk; - afr_nonblocking_inodelk (frame, this); + int_lock->domain = dom; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + inodelk->flock.l_start = start; + inodelk->flock.l_len = len; + inodelk->flock.l_type = F_WRLCK; + afr_nonblocking_inodelk (frame, this); - return 0; + return 0; } +int +afr_post_sh_big_lock_success (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + + local = frame->local; + sh = &local->self_heal; + + GF_ASSERT (sh->old_loop_frame); + sh_loop_finish (sh->old_loop_frame, this); + sh->old_loop_frame = NULL; + sh->data_lock_held = _gf_true; + sh->sync_done = _gf_true; + afr_sh_data_fxattrop (frame, this); + return 0; +} int -afr_sh_data_lock (call_frame_t *frame, xlator_t *this) +afr_post_sh_big_lock_failure (call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - afr_self_heal_t * sh = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + local = frame->local; + sh = &local->self_heal; - local = frame->local; - sh = &local->self_heal; - priv = this->private; + GF_ASSERT (sh->old_loop_frame); + sh_loop_finish (sh->old_loop_frame, this); + sh->old_loop_frame = NULL; + afr_sh_set_timestamps (frame, this); + return 0; +} - if (sh->data_lock_held) { - /* caller has held the lock already, - so skip locking */ - afr_sh_data_fxattrop (frame, this); - return 0; - } +int +afr_sh_data_lock (call_frame_t *frame, xlator_t *this, + off_t start, off_t len, gf_boolean_t block, + char *dom, afr_lock_cbk_t success_handler, + afr_lock_cbk_t failure_handler) +{ + afr_local_t * local = NULL; + afr_self_heal_t * sh = NULL; - return afr_sh_data_lock_rec (frame, this); -} + local = frame->local; + sh = &local->self_heal; + sh->data_lock_success_handler = success_handler; + sh->data_lock_failure_handler = failure_handler; + sh->data_lock_block = block; + return afr_sh_data_lock_rec (frame, this, dom, start, len); +} int afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd) + int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int child_index = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - child_index = (long) cookie; - - /* TODO: some of the open's might fail. - In that case, modify cleanup fn to send flush on those - fd's which are already open */ - - LOCK (&frame->lock); - { - if (op_ret == -1) { - gf_log (this->name, GF_LOG_TRACE, - "open of %s failed on child %s (%s)", - local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - sh->op_failed = 1; - } + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int child_index = 0; - gf_log (this->name, GF_LOG_TRACE, - "open of %s succeeded on child %s", - local->loc.path, - priv->children[child_index]->name); - } - UNLOCK (&frame->lock); + local = frame->local; + sh = &local->self_heal; + priv = this->private; - call_count = afr_frame_return (frame); + child_index = (long) cookie; - if (call_count == 0) { - if (sh->op_failed) { - afr_sh_data_finish (frame, this); - return 0; - } + /* TODO: some of the open's might fail. + In that case, modify cleanup fn to send flush on those + fd's which are already open */ + + LOCK (&frame->lock); + { + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "open of %s failed on child %s (%s)", + local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + } else { + gf_log (this->name, GF_LOG_TRACE, + "open of %s succeeded on child %s", + local->loc.path, + priv->children[child_index]->name); + } + } + UNLOCK (&frame->lock); - gf_log (this->name, GF_LOG_TRACE, - "fd for %s opened, commencing sync", - local->loc.path); + call_count = afr_frame_return (frame); - afr_sh_data_lock (frame, this); - } + if (call_count == 0) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { + afr_sh_data_fail (frame, this); + return 0; + } + + gf_log (this->name, GF_LOG_TRACE, + "fd for %s opened, commencing sync", + local->loc.path); + + afr_sh_data_lock (frame, this, 0, 0, _gf_true, priv->sh_domain, + afr_sh_dom_lock_success, afr_sh_data_fail); + } - return 0; + return 0; } int afr_sh_data_open (call_frame_t *frame, xlator_t *this) { - int i = 0; - int call_count = 0; + int i = 0; + int call_count = 0; + fd_t *fd = NULL; + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + afr_self_heal_t *sh = NULL; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; - fd_t *fd = NULL; + call_count = afr_up_children_count (local->child_up, priv->child_count); + local->call_count = call_count; - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - afr_self_heal_t *sh = NULL; + fd = fd_create (local->loc.inode, frame->root->pid); + sh->healing_fd = fd; - local = frame->local; - sh = &local->self_heal; - priv = this->private; + /* open sinks */ + for (i = 0; i < priv->child_count; i++) { + if(!local->child_up[i]) + continue; - if (sh->healing_fd_opened) { - /* caller has opened the fd for us already, so skip open */ + STACK_WIND_COOKIE (frame, afr_sh_data_open_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->open, + &local->loc, + O_RDWR|O_LARGEFILE, fd, NULL); - afr_sh_data_lock (frame, this); - return 0; + if (!--call_count) + break; } - call_count = afr_up_children_count (priv->child_count, local->child_up); - local->call_count = call_count; + return 0; +} - fd = fd_create (local->loc.inode, frame->root->pid); - sh->healing_fd = fd; +void +afr_sh_non_reg_fix (call_frame_t *frame, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_private_t *priv = NULL; + afr_self_heal_t *sh = NULL; + afr_local_t *local = NULL; + int i = 0; + + if (op_ret < 0) { + afr_sh_data_fail (frame, this); + return; + } - /* open sinks */ - for (i = 0; i < priv->child_count; i++) { - if(!local->child_up[i]) - continue; - - STACK_WIND_COOKIE (frame, afr_sh_data_open_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->open, - &local->loc, - O_RDWR|O_LARGEFILE, fd, 0); - - if (!--call_count) - break; - } + local = frame->local; + sh = &local->self_heal; + priv = this->private; - return 0; + for (i = 0; i < priv->child_count ; i++) { + if (1 == local->child_up[i]) + sh->success[i] = 1; + } + + afr_sh_erase_pending (frame, this, AFR_DATA_TRANSACTION, + afr_sh_data_erase_pending_cbk, + afr_sh_data_finish); +} + +int +afr_sh_non_reg_lock_success (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + + local = frame->local; + sh = &local->self_heal; + sh->data_lock_held = _gf_true; + afr_sh_common_lookup (frame, this, &local->loc, + afr_sh_non_reg_fix, NULL, + AFR_LOOKUP_FAIL_CONFLICTS | + AFR_LOOKUP_FAIL_MISSING_GFIDS, + NULL); + return 0; } +gf_boolean_t +afr_can_start_data_self_heal (afr_self_heal_t *sh, afr_private_t *priv) +{ + if (sh->force_confirm_spb) + return _gf_true; + if (sh->do_data_self_heal && + afr_data_self_heal_enabled (priv->data_self_heal)) + return _gf_true; + return _gf_false; +} int afr_self_heal_data (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = this->private; - + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = this->private; + int ret = -1; - local = frame->local; - sh = &local->self_heal; + local = frame->local; + sh = &local->self_heal; + + sh->sh_type_in_action = AFR_SELF_HEAL_DATA; + + if (afr_can_start_data_self_heal (sh, priv)) { + afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED); + ret = afr_inodelk_init (&local->internal_lock.inodelk[1], + priv->sh_domain, priv->child_count); + if (ret < 0) { + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + afr_sh_data_done (frame, this); + return 0; + } - if (sh->need_data_self_heal && priv->data_self_heal) { - afr_sh_data_open (frame, this); - } else { - gf_log (this->name, GF_LOG_TRACE, - "not doing data self heal on %s", - local->loc.path); - afr_sh_data_done (frame, this); - } + if (IA_ISREG (sh->type)) { + afr_sh_data_open (frame, this); + } else { + afr_sh_data_lock (frame, this, 0, 0, _gf_true, + this->name, + afr_sh_non_reg_lock_success, + afr_sh_data_fail); + } + } else { + gf_log (this->name, GF_LOG_TRACE, + "not doing data self heal on %s", + local->loc.path); + afr_sh_data_done (frame, this); + } - return 0; + return 0; } - diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c index db12c3b83..53491a1d7 100644 --- a/xlators/cluster/afr/src/afr-self-heal-entry.c +++ b/xlators/cluster/afr/src/afr-self-heal-entry.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2009 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #include <libgen.h> @@ -49,39 +40,30 @@ #include "afr-self-heal.h" #include "afr-self-heal-common.h" -int -afr_sh_post_nonblocking_entrylk_cbk (call_frame_t *frame, xlator_t *this); +#define AFR_INIT_SH_FRAME_VALS(_frame, _local, _sh, _sh_frame, _sh_local, _sh_sh)\ + do {\ + _local = _frame->local;\ + _sh = &_local->self_heal;\ + _sh_frame = _sh->sh_frame;\ + _sh_local = _sh_frame->local;\ + _sh_sh = &_sh_local->self_heal;\ + } while (0); int +afr_sh_entry_impunge_create_file (call_frame_t *impunge_frame, xlator_t *this, + int child_index); +int afr_sh_entry_done (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - /* - TODO: cleanup sh->* - */ - - if (sh->healing_fd) - fd_unref (sh->healing_fd); - sh->healing_fd = NULL; - -/* for (i = 0; i < priv->child_count; i++) { */ -/* sh->locked_nodes[i] = 0; */ -/* } */ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; - gf_log (this->name, GF_LOG_TRACE, - "self heal of %s completed", - local->loc.path); + local = frame->local; + sh = &local->self_heal; - sh->completion_cbk (frame, this); + sh->completion_cbk (frame, this); - return 0; + return 0; } @@ -104,278 +86,230 @@ afr_sh_entry_unlock (call_frame_t *frame, xlator_t *this) int afr_sh_entry_finish (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; + afr_local_t *local = NULL; - local = frame->local; + local = frame->local; - gf_log (this->name, GF_LOG_TRACE, - "finishing entry selfheal of %s", local->loc.path); + gf_log (this->name, GF_LOG_TRACE, + "finishing entry selfheal of %s", local->loc.path); - afr_sh_entry_unlock (frame, this); + afr_sh_entry_unlock (frame, this); - return 0; + return 0; } int afr_sh_entry_erase_pending_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, - int32_t op_errno, dict_t *xattr) + xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xattr, dict_t *xdata) { - int call_count = 0; + long i = 0; + int call_count = 0; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_local_t *orig_local = NULL; + call_frame_t *orig_frame = NULL; + afr_private_t *priv = NULL; + int32_t read_child = -1; + + local = frame->local; + priv = this->private; + sh = &local->self_heal; + i = (long)cookie; + + + afr_children_add_child (sh->fresh_children, i, priv->child_count); + if (op_ret == -1) { + gf_log (this->name, GF_LOG_INFO, + "%s: failed to erase pending xattrs on %s (%s)", + local->loc.path, priv->children[i]->name, + strerror (op_errno)); + } - call_count = afr_frame_return (frame); + call_count = afr_frame_return (frame); - if (call_count == 0) - afr_sh_entry_finish (frame, this); + if (call_count == 0) { + if (sh->source == -1) { + //this happens if the forced merge option is set + read_child = sh->fresh_children[0]; + } else { + read_child = sh->source; + } + afr_inode_set_read_ctx (this, sh->inode, read_child, + sh->fresh_children); + orig_frame = sh->orig_frame; + orig_local = orig_frame->local; + + if (sh->source != -1) { + orig_local->cont.lookup.buf.ia_nlink = sh->buf[sh->source].ia_nlink; + } + + afr_sh_entry_finish (frame, this); + } - return 0; + return 0; } int afr_sh_entry_erase_pending (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int i = 0; - dict_t **erase_xattr = NULL; - int need_unwind = 0; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + local = frame->local; + sh = &local->self_heal; - local = frame->local; - sh = &local->self_heal; - priv = this->private; + if (sh->entries_skipped) { + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + goto out; + } + afr_sh_erase_pending (frame, this, AFR_ENTRY_TRANSACTION, + afr_sh_entry_erase_pending_cbk, + afr_sh_entry_finish); + return 0; +out: + afr_sh_entry_finish (frame, this); + return 0; +} - afr_sh_pending_to_delta (priv, sh->xattr, sh->delta_matrix, sh->success, - priv->child_count, AFR_ENTRY_TRANSACTION); - erase_xattr = GF_CALLOC (sizeof (*erase_xattr), priv->child_count, - gf_afr_mt_dict_t); - for (i = 0; i < priv->child_count; i++) { - if (sh->xattr[i]) { - call_count++; +static int +next_active_source (call_frame_t *frame, xlator_t *this, + int current_active_source) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int source = -1; + int next_active_source = -1; + int i = 0; - erase_xattr[i] = get_new_dict(); - dict_ref (erase_xattr[i]); - } - } + priv = this->private; + local = frame->local; + sh = &local->self_heal; - if (call_count == 0) - need_unwind = 1; - - afr_sh_delta_to_xattr (priv, sh->delta_matrix, erase_xattr, - priv->child_count, AFR_ENTRY_TRANSACTION); - - local->call_count = call_count; - for (i = 0; i < priv->child_count; i++) { - if (!erase_xattr[i]) - continue; - - gf_log (this->name, GF_LOG_TRACE, - "erasing pending flags from %s on %s", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_sh_entry_erase_pending_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->loc, - GF_XATTROP_ADD_ARRAY, erase_xattr[i]); - if (!--call_count) - break; - } - - for (i = 0; i < priv->child_count; i++) { - if (erase_xattr[i]) { - dict_unref (erase_xattr[i]); - } - } - GF_FREE (erase_xattr); - - if (need_unwind) - afr_sh_entry_finish (frame, this); + source = sh->source; - return 0; -} + if (source != -1) { + if (current_active_source != source) + next_active_source = source; + goto out; + } + /* + the next active sink becomes the source for the + 'conservative decision' of merging all entries + */ + for (i = 0; i < priv->child_count; i++) { + if ((sh->sources[i] == 0) + && (local->child_up[i] == 1) + && (i > current_active_source)) { -static int -next_active_source (call_frame_t *frame, xlator_t *this, - int current_active_source) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int source = -1; - int next_active_source = -1; - int i = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - source = sh->source; - - if (source != -1) { - if (current_active_source != source) - next_active_source = source; - goto out; - } - - /* - the next active sink becomes the source for the - 'conservative decision' of merging all entries - */ - - for (i = 0; i < priv->child_count; i++) { - if ((sh->sources[i] == 0) - && (local->child_up[i] == 1) - && (i > current_active_source)) { - - next_active_source = i; - break; - } - } + next_active_source = i; + break; + } + } out: - return next_active_source; + return next_active_source; } static int next_active_sink (call_frame_t *frame, xlator_t *this, - int current_active_sink) + int current_active_sink) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int next_active_sink = -1; - int i = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - /* - the next active sink becomes the source for the - 'conservative decision' of merging all entries - */ - - for (i = 0; i < priv->child_count; i++) { - if ((sh->sources[i] == 0) - && (local->child_up[i] == 1) - && (i > current_active_sink)) { - - next_active_sink = i; - break; - } - } - - return next_active_sink; -} - + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int next_active_sink = -1; + int i = 0; -int -build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name) -{ - int ret = -1; + priv = this->private; + local = frame->local; + sh = &local->self_heal; - if (!child) { - goto out; - } + /* + the next active sink becomes the source for the + 'conservative decision' of merging all entries + */ - if (strcmp (parent->path, "/") == 0) - ret = gf_asprintf ((char **)&child->path, "/%s", name); - else - ret = gf_asprintf ((char **)&child->path, "%s/%s", parent->path, - name); + for (i = 0; i < priv->child_count; i++) { + if ((sh->sources[i] == 0) + && (local->child_up[i] == 1) + && (i > current_active_sink)) { - if (-1 == ret) { - gf_log (this->name, GF_LOG_ERROR, - "asprintf failed while setting child path"); + next_active_sink = i; + break; + } } - if (!child->path) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } - - child->name = strrchr (child->path, '/'); - if (child->name) - child->name++; - - child->parent = inode_ref (parent->inode); - child->inode = inode_new (parent->inode->table); - - if (!child->inode) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } - - ret = 0; -out: - if (ret == -1) - loc_wipe (child); - - return ret; + return next_active_sink; } +int +afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this); + +int +afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this); int afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this); int afr_sh_entry_expunge_subvol (call_frame_t *frame, xlator_t *this, - int active_src); + int active_src); int afr_sh_entry_expunge_entry_done (call_frame_t *frame, xlator_t *this, - int active_src) + int active_src, int32_t op_ret, + int32_t op_errno) { - int call_count = 0; + int call_count = 0; - call_count = afr_frame_return (frame); + call_count = afr_frame_return (frame); - if (call_count == 0) - afr_sh_entry_expunge_subvol (frame, this, active_src); + if (call_count == 0) + afr_sh_entry_expunge_subvol (frame, this, active_src); - return 0; + return 0; } int afr_sh_entry_expunge_parent_setattr_cbk (call_frame_t *expunge_frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop) + struct iatt *preop, struct iatt *postop, + dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; - afr_self_heal_t *expunge_sh = NULL; + afr_private_t *priv = NULL; + afr_local_t *expunge_local = NULL; + afr_self_heal_t *expunge_sh = NULL; call_frame_t *frame = NULL; - - int active_src = (long) cookie; + int active_src = (long) cookie; + afr_self_heal_t *sh = NULL; + afr_local_t *local = NULL; priv = this->private; expunge_local = expunge_frame->local; - expunge_sh = &expunge_local->self_heal; - frame = expunge_sh->sh_frame; + expunge_sh = &expunge_local->self_heal; + frame = expunge_sh->sh_frame; + local = frame->local; + sh = &local->self_heal; if (op_ret != 0) { - gf_log (this->name, GF_LOG_DEBUG, + gf_log (this->name, GF_LOG_ERROR, "setattr on parent directory of %s on subvolume %s failed: %s", expunge_local->loc.path, priv->children[active_src]->name, strerror (op_errno)); } - AFR_STACK_DESTROY (expunge_frame); - afr_sh_entry_expunge_entry_done (frame, this, active_src); + AFR_STACK_DESTROY (expunge_frame); + sh->expunge_done (frame, this, active_src, op_ret, op_errno); return 0; } @@ -383,41 +317,37 @@ afr_sh_entry_expunge_parent_setattr_cbk (call_frame_t *expunge_frame, int afr_sh_entry_expunge_remove_cbk (call_frame_t *expunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, + xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; - afr_self_heal_t *expunge_sh = NULL; - int active_src = 0; - call_frame_t *frame = NULL; - - int32_t valid = 0; - - priv = this->private; - expunge_local = expunge_frame->local; - expunge_sh = &expunge_local->self_heal; - frame = expunge_sh->sh_frame; - - active_src = (long) cookie; - - if (op_ret == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "removed %s on %s", - expunge_local->loc.path, - priv->children[active_src]->name); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "removing %s on %s failed (%s)", - expunge_local->loc.path, - priv->children[active_src]->name, - strerror (op_errno)); - } + afr_private_t *priv = NULL; + afr_local_t *expunge_local = NULL; + afr_self_heal_t *expunge_sh = NULL; + int active_src = 0; + int32_t valid = 0; + + priv = this->private; + expunge_local = expunge_frame->local; + expunge_sh = &expunge_local->self_heal; + + active_src = (long) cookie; + + if (op_ret == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "removed %s on %s", + expunge_local->loc.path, + priv->children[active_src]->name); + } else { + gf_log (this->name, GF_LOG_INFO, + "removing %s on %s failed (%s)", + expunge_local->loc.path, + priv->children[active_src]->name, + strerror (op_errno)); + } valid = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; - afr_build_parent_loc (&expunge_sh->parent_loc, &expunge_local->loc); STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_parent_setattr_cbk, (void *) (long) active_src, @@ -425,7 +355,7 @@ afr_sh_entry_expunge_remove_cbk (call_frame_t *expunge_frame, void *cookie, priv->children[active_src]->fops->setattr, &expunge_sh->parent_loc, &expunge_sh->parentbuf, - valid); + valid, NULL); return 0; } @@ -435,23 +365,23 @@ int afr_sh_entry_expunge_unlink (call_frame_t *expunge_frame, xlator_t *this, int active_src) { - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; + afr_private_t *priv = NULL; + afr_local_t *expunge_local = NULL; - priv = this->private; - expunge_local = expunge_frame->local; + priv = this->private; + expunge_local = expunge_frame->local; - gf_log (this->name, GF_LOG_TRACE, - "expunging file %s on %s", - expunge_local->loc.path, priv->children[active_src]->name); + gf_log (this->name, GF_LOG_TRACE, + "expunging file %s on %s", + expunge_local->loc.path, priv->children[active_src]->name); - STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_remove_cbk, - (void *) (long) active_src, - priv->children[active_src], - priv->children[active_src]->fops->unlink, - &expunge_local->loc); + STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_remove_cbk, + (void *) (long) active_src, + priv->children[active_src], + priv->children[active_src]->fops->unlink, + &expunge_local->loc, 0, NULL); - return 0; + return 0; } @@ -460,700 +390,937 @@ int afr_sh_entry_expunge_rmdir (call_frame_t *expunge_frame, xlator_t *this, int active_src) { - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; + afr_private_t *priv = NULL; + afr_local_t *expunge_local = NULL; - priv = this->private; - expunge_local = expunge_frame->local; + priv = this->private; + expunge_local = expunge_frame->local; - gf_log (this->name, GF_LOG_DEBUG, - "expunging directory %s on %s", - expunge_local->loc.path, priv->children[active_src]->name); + gf_log (this->name, GF_LOG_DEBUG, + "expunging directory %s on %s", + expunge_local->loc.path, priv->children[active_src]->name); - STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_remove_cbk, - (void *) (long) active_src, - priv->children[active_src], - priv->children[active_src]->fops->rmdir, - &expunge_local->loc, 1); + STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_remove_cbk, + (void *) (long) active_src, + priv->children[active_src], + priv->children[active_src]->fops->rmdir, + &expunge_local->loc, 1, NULL); - return 0; + return 0; } int afr_sh_entry_expunge_remove (call_frame_t *expunge_frame, xlator_t *this, - int active_src, struct iatt *buf) + int active_src, struct iatt *buf, + struct iatt *parentbuf) { - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; - afr_self_heal_t *expunge_sh = NULL; - int source = 0; - call_frame_t *frame = NULL; - int type = 0; - - priv = this->private; - expunge_local = expunge_frame->local; - expunge_sh = &expunge_local->self_heal; - frame = expunge_sh->sh_frame; - source = expunge_sh->source; - - type = buf->ia_type; - - switch (type) { - case IA_IFSOCK: - case IA_IFREG: - case IA_IFBLK: - case IA_IFCHR: - case IA_IFIFO: - case IA_IFLNK: - afr_sh_entry_expunge_unlink (expunge_frame, this, active_src); + afr_private_t *priv = NULL; + afr_local_t *expunge_local = NULL; + afr_self_heal_t *expunge_sh = NULL; + call_frame_t *frame = NULL; + int type = 0; + afr_self_heal_t *sh = NULL; + afr_local_t *local = NULL; + loc_t *loc = NULL; + + priv = this->private; + expunge_local = expunge_frame->local; + expunge_sh = &expunge_local->self_heal; + frame = expunge_sh->sh_frame; + local = frame->local; + sh = &local->self_heal; + loc = &expunge_local->loc; + + type = buf->ia_type; + if (loc->parent && uuid_is_null (loc->parent->gfid)) + uuid_copy (loc->pargfid, parentbuf->ia_gfid); + + switch (type) { + case IA_IFSOCK: + case IA_IFREG: + case IA_IFBLK: + case IA_IFCHR: + case IA_IFIFO: + case IA_IFLNK: + afr_sh_entry_expunge_unlink (expunge_frame, this, active_src); break; - case IA_IFDIR: - afr_sh_entry_expunge_rmdir (expunge_frame, this, active_src); - break; - default: - gf_log (this->name, GF_LOG_ERROR, - "%s has unknown file type on %s: 0%o", - expunge_local->loc.path, - priv->children[source]->name, type); - goto out; - break; - } - - return 0; + case IA_IFDIR: + afr_sh_entry_expunge_rmdir (expunge_frame, this, active_src); + break; + default: + gf_log (this->name, GF_LOG_ERROR, + "%s has unknown file type on %s: 0%o", + expunge_local->loc.path, + priv->children[active_src]->name, type); + goto out; + break; + } + + return 0; out: - AFR_STACK_DESTROY (expunge_frame); - afr_sh_entry_expunge_entry_done (frame, this, active_src); + AFR_STACK_DESTROY (expunge_frame); + sh->expunge_done (frame, this, active_src, -1, EINVAL); - return 0; + return 0; } int afr_sh_entry_expunge_lookup_cbk (call_frame_t *expunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, dict_t *x, - struct iatt *postparent) + xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct iatt *buf, dict_t *x, + struct iatt *postparent) { - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; - afr_self_heal_t *expunge_sh = NULL; - call_frame_t *frame = NULL; - int active_src = 0; - - priv = this->private; - expunge_local = expunge_frame->local; - expunge_sh = &expunge_local->self_heal; - frame = expunge_sh->sh_frame; - active_src = (long) cookie; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_TRACE, - "lookup of %s on %s failed (%s)", - expunge_local->loc.path, - priv->children[active_src]->name, - strerror (op_errno)); - goto out; - } - - afr_sh_entry_expunge_remove (expunge_frame, this, active_src, buf); - - return 0; + afr_private_t *priv = NULL; + afr_local_t *expunge_local = NULL; + afr_self_heal_t *expunge_sh = NULL; + call_frame_t *frame = NULL; + int active_src = 0; + afr_self_heal_t *sh = NULL; + afr_local_t *local = NULL; + + priv = this->private; + expunge_local = expunge_frame->local; + expunge_sh = &expunge_local->self_heal; + frame = expunge_sh->sh_frame; + active_src = (long) cookie; + local = frame->local; + sh = &local->self_heal; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "lookup of %s on %s failed (%s)", + expunge_local->loc.path, + priv->children[active_src]->name, + strerror (op_errno)); + goto out; + } + + afr_sh_entry_expunge_remove (expunge_frame, this, active_src, buf, + postparent); + + return 0; out: - AFR_STACK_DESTROY (expunge_frame); - afr_sh_entry_expunge_entry_done (frame, this, active_src); + AFR_STACK_DESTROY (expunge_frame); + sh->expunge_done (frame, this, active_src, op_ret, op_errno); - return 0; + return 0; } int afr_sh_entry_expunge_purge (call_frame_t *expunge_frame, xlator_t *this, - int active_src) + int active_src) { - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; - - priv = this->private; - expunge_local = expunge_frame->local; - - gf_log (this->name, GF_LOG_TRACE, - "looking up %s on %s", - expunge_local->loc.path, priv->children[active_src]->name); - - STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_lookup_cbk, - (void *) (long) active_src, - priv->children[active_src], - priv->children[active_src]->fops->lookup, - &expunge_local->loc, 0); - - return 0; -} + afr_private_t *priv = NULL; + afr_local_t *expunge_local = NULL; + + priv = this->private; + expunge_local = expunge_frame->local; + + gf_log (this->name, GF_LOG_TRACE, + "looking up %s on %s", + expunge_local->loc.path, priv->children[active_src]->name); + STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_lookup_cbk, + (void *) (long) active_src, + priv->children[active_src], + priv->children[active_src]->fops->lookup, + &expunge_local->loc, NULL); + + return 0; +} int afr_sh_entry_expunge_entry_cbk (call_frame_t *expunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, + xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *buf, dict_t *x, struct iatt *postparent) { - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; - afr_self_heal_t *expunge_sh = NULL; - int source = 0; - call_frame_t *frame = NULL; - int active_src = 0; - - - priv = this->private; - expunge_local = expunge_frame->local; - expunge_sh = &expunge_local->self_heal; - frame = expunge_sh->sh_frame; - active_src = expunge_sh->active_source; - source = (long) cookie; - - if (op_ret == -1 && op_errno == ENOENT && postparent) { - - gf_log (this->name, GF_LOG_TRACE, - "missing entry %s on %s", - expunge_local->loc.path, - priv->children[source]->name); - - expunge_sh->parentbuf = *postparent; - - afr_sh_entry_expunge_purge (expunge_frame, this, active_src); - - return 0; - } - - if (op_ret == 0) { - gf_log (this->name, GF_LOG_TRACE, - "%s exists under %s", - expunge_local->loc.path, - priv->children[source]->name); - } else { - gf_log (this->name, GF_LOG_TRACE, - "looking up %s under %s failed (%s)", - expunge_local->loc.path, - priv->children[source]->name, - strerror (op_errno)); - } - - AFR_STACK_DESTROY (expunge_frame); - afr_sh_entry_expunge_entry_done (frame, this, active_src); - - return 0; + afr_private_t *priv = NULL; + afr_local_t *expunge_local = NULL; + afr_self_heal_t *expunge_sh = NULL; + int source = 0; + call_frame_t *frame = NULL; + int active_src = 0; + int need_expunge = 0; + afr_self_heal_t *sh = NULL; + afr_local_t *local = NULL; + + priv = this->private; + expunge_local = expunge_frame->local; + expunge_sh = &expunge_local->self_heal; + frame = expunge_sh->sh_frame; + active_src = expunge_sh->active_source; + source = (long) cookie; + local = frame->local; + sh = &local->self_heal; + + if (op_ret == -1 && op_errno == ENOENT) + need_expunge = 1; + else if (op_ret == -1) + goto out; + + if (!uuid_is_null (expunge_sh->entrybuf.ia_gfid) && + !uuid_is_null (buf->ia_gfid) && + (uuid_compare (expunge_sh->entrybuf.ia_gfid, buf->ia_gfid) != 0)) { + char uuidbuf1[64]; + char uuidbuf2[64]; + gf_log (this->name, GF_LOG_DEBUG, + "entry %s found on %s with mismatching gfid (%s/%s)", + expunge_local->loc.path, + priv->children[source]->name, + uuid_utoa_r (expunge_sh->entrybuf.ia_gfid, uuidbuf1), + uuid_utoa_r (buf->ia_gfid, uuidbuf2)); + need_expunge = 1; + } + + if (need_expunge) { + gf_log (this->name, GF_LOG_INFO, + "Entry %s is missing on %s and deleting from " + "replica's other bricks", + expunge_local->loc.path, + priv->children[source]->name); + + if (postparent) + expunge_sh->parentbuf = *postparent; + + afr_sh_entry_expunge_purge (expunge_frame, this, active_src); + + return 0; + } + +out: + if (op_ret == 0) { + gf_log (this->name, GF_LOG_TRACE, + "%s exists under %s", + expunge_local->loc.path, + priv->children[source]->name); + } else { + gf_log (this->name, GF_LOG_INFO, + "looking up %s under %s failed (%s)", + expunge_local->loc.path, + priv->children[source]->name, + strerror (op_errno)); + } + + AFR_STACK_DESTROY (expunge_frame); + sh->expunge_done (frame, this, active_src, op_ret, op_errno); + + return 0; } +static gf_boolean_t +can_skip_entry_self_heal (char *name, loc_t *parent_loc) +{ + if (strcmp (name, ".") == 0) { + return _gf_true; + } else if (strcmp (name, "..") == 0) { + return _gf_true; + } else if (loc_is_root (parent_loc) && + (strcmp (name, GF_REPLICATE_TRASH_DIR) == 0)) { + return _gf_true; + } + return _gf_false; +} int afr_sh_entry_expunge_entry (call_frame_t *frame, xlator_t *this, - char *name) + gf_dirent_t *entry) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int ret = -1; - call_frame_t *expunge_frame = NULL; - afr_local_t *expunge_local = NULL; - afr_self_heal_t *expunge_sh = NULL; - int active_src = 0; - int source = 0; - int op_errno = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - active_src = sh->active_source; - source = sh->source; - - if ((strcmp (name, ".") == 0) - || (strcmp (name, "..") == 0) - || ((strcmp (local->loc.path, "/") == 0) - && (strcmp (name, GF_REPLICATE_TRASH_DIR) == 0))) { + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int ret = -1; + call_frame_t *expunge_frame = NULL; + afr_local_t *expunge_local = NULL; + afr_self_heal_t *expunge_sh = NULL; + int active_src = 0; + int source = 0; + int op_errno = 0; + char *name = NULL; + int op_ret = -1; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; - gf_log (this->name, GF_LOG_TRACE, - "skipping inspection of %s under %s", - name, local->loc.path); - goto out; - } - - gf_log (this->name, GF_LOG_TRACE, - "inspecting existance of %s under %s", - name, local->loc.path); - - expunge_frame = copy_frame (frame); - if (!expunge_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } - - ALLOC_OR_GOTO (expunge_local, afr_local_t, out); - - expunge_frame->local = expunge_local; - expunge_sh = &expunge_local->self_heal; - expunge_sh->sh_frame = frame; - expunge_sh->active_source = active_src; - - ret = build_child_loc (this, &expunge_local->loc, &local->loc, name); - if (ret != 0) { - goto out; - } - - gf_log (this->name, GF_LOG_TRACE, - "looking up %s on %s", expunge_local->loc.path, - priv->children[source]->name); - - STACK_WIND_COOKIE (expunge_frame, - afr_sh_entry_expunge_entry_cbk, - (void *) (long) source, - priv->children[source], - priv->children[source]->fops->lookup, - &expunge_local->loc, 0); - - ret = 0; + active_src = sh->active_source; + source = sh->source; + sh->expunge_done = afr_sh_entry_expunge_entry_done; + + name = entry->d_name; + if (can_skip_entry_self_heal (name, &local->loc)) { + op_ret = 0; + goto out; + } + + gf_log (this->name, GF_LOG_TRACE, + "inspecting existence of %s under %s", + name, local->loc.path); + + expunge_frame = copy_frame (frame); + if (!expunge_frame) { + op_errno = ENOMEM; + goto out; + } + + AFR_LOCAL_ALLOC_OR_GOTO (expunge_local, out); + + expunge_frame->local = expunge_local; + expunge_sh = &expunge_local->self_heal; + expunge_sh->sh_frame = frame; + expunge_sh->active_source = active_src; + expunge_sh->entrybuf = entry->d_stat; + loc_copy (&expunge_sh->parent_loc, &local->loc); + + ret = afr_build_child_loc (this, &expunge_local->loc, &local->loc, + name); + if (ret != 0) { + op_errno = EINVAL; + goto out; + } + + gf_log (this->name, GF_LOG_TRACE, + "looking up %s on %s", expunge_local->loc.path, + priv->children[source]->name); + + STACK_WIND_COOKIE (expunge_frame, + afr_sh_entry_expunge_entry_cbk, + (void *) (long) source, + priv->children[source], + priv->children[source]->fops->lookup, + &expunge_local->loc, NULL); + + ret = 0; out: - if (ret == -1) - afr_sh_entry_expunge_entry_done (frame, this, active_src); + if (ret == -1) + sh->expunge_done (frame, this, active_src, op_ret, op_errno); - return 0; + return 0; } int afr_sh_entry_expunge_readdir_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - gf_dirent_t *entries) + xlator_t *this, + int32_t op_ret, int32_t op_errno, + gf_dirent_t *entries, dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - gf_dirent_t *entry = NULL; - off_t last_offset = 0; - int active_src = 0; - int entry_count = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - active_src = sh->active_source; - - if (op_ret <= 0) { - if (op_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "readdir of %s on subvolume %s failed (%s)", - local->loc.path, - priv->children[active_src]->name, - strerror (op_errno)); - } else { - gf_log (this->name, GF_LOG_TRACE, - "readdir of %s on subvolume %s complete", - local->loc.path, - priv->children[active_src]->name); - } - - afr_sh_entry_expunge_all (frame, this); - return 0; - } - - list_for_each_entry (entry, &entries->list, list) { - last_offset = entry->d_off; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + gf_dirent_t *entry = NULL; + off_t last_offset = 0; + int active_src = 0; + int entry_count = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + active_src = sh->active_source; + + if (op_ret <= 0) { + if (op_ret < 0) { + gf_log (this->name, GF_LOG_INFO, + "readdir of %s on subvolume %s failed (%s)", + local->loc.path, + priv->children[active_src]->name, + strerror (op_errno)); + } else { + gf_log (this->name, GF_LOG_TRACE, + "readdir of %s on subvolume %s complete", + local->loc.path, + priv->children[active_src]->name); + } + + afr_sh_entry_expunge_all (frame, this); + return 0; + } + + list_for_each_entry (entry, &entries->list, list) { + last_offset = entry->d_off; entry_count++; - } + } - gf_log (this->name, GF_LOG_TRACE, - "readdir'ed %d entries from %s", - entry_count, priv->children[active_src]->name); + gf_log (this->name, GF_LOG_TRACE, + "readdir'ed %d entries from %s", + entry_count, priv->children[active_src]->name); - sh->offset = last_offset; - local->call_count = entry_count; + sh->offset = last_offset; + local->call_count = entry_count; - list_for_each_entry (entry, &entries->list, list) { - afr_sh_entry_expunge_entry (frame, this, entry->d_name); - } + list_for_each_entry (entry, &entries->list, list) { + afr_sh_entry_expunge_entry (frame, this, entry); + } - return 0; + return 0; } int afr_sh_entry_expunge_subvol (call_frame_t *frame, xlator_t *this, - int active_src) + int active_src) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; - priv = this->private; - local = frame->local; - sh = &local->self_heal; + priv = this->private; + local = frame->local; + sh = &local->self_heal; - STACK_WIND (frame, afr_sh_entry_expunge_readdir_cbk, - priv->children[active_src], - priv->children[active_src]->fops->readdir, - sh->healing_fd, sh->block_size, sh->offset); + STACK_WIND (frame, afr_sh_entry_expunge_readdir_cbk, + priv->children[active_src], + priv->children[active_src]->fops->readdirp, + sh->healing_fd, sh->block_size, sh->offset, NULL); - return 0; + return 0; } int afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int active_src = -1; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int active_src = -1; - priv = this->private; - local = frame->local; - sh = &local->self_heal; + priv = this->private; + local = frame->local; + sh = &local->self_heal; - sh->offset = 0; + sh->offset = 0; - if (sh->source == -1) { - gf_log (this->name, GF_LOG_TRACE, - "no active sources for %s to expunge entries", - local->loc.path); - goto out; - } + if (sh->source == -1) { + gf_log (this->name, GF_LOG_DEBUG, + "no active sources for %s to expunge entries", + local->loc.path); + goto out; + } - active_src = next_active_sink (frame, this, sh->active_source); - sh->active_source = active_src; + active_src = next_active_sink (frame, this, sh->active_source); + sh->active_source = active_src; - if (sh->op_failed) { - goto out; - } + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { + goto out; + } - if (active_src == -1) { - /* completed creating missing files on all subvolumes */ - goto out; - } + if (active_src == -1) { + /* completed creating missing files on all subvolumes */ + goto out; + } - gf_log (this->name, GF_LOG_TRACE, - "expunging entries of %s on %s to other sinks", - local->loc.path, priv->children[active_src]->name); + gf_log (this->name, GF_LOG_TRACE, + "expunging entries of %s on %s to other sinks", + local->loc.path, priv->children[active_src]->name); - afr_sh_entry_expunge_subvol (frame, this, active_src); + afr_sh_entry_expunge_subvol (frame, this, active_src); - return 0; + return 0; out: - afr_sh_entry_erase_pending (frame, this); - return 0; + afr_sh_entry_impunge_all (frame, this); + return 0; } int -afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this); - -int -afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this, - int active_src); - -int afr_sh_entry_impunge_entry_done (call_frame_t *frame, xlator_t *this, - int active_src) + int32_t op_ret, int32_t op_errno) { - int call_count = 0; - - call_count = afr_frame_return (frame); + int call_count = 0; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; - if (call_count == 0) - afr_sh_entry_impunge_subvol (frame, this, active_src); + local = frame->local; + sh = &local->self_heal; + if (op_ret < 0) + sh->entries_skipped = _gf_true; + call_count = afr_frame_return (frame); + if (call_count == 0) + afr_sh_entry_impunge_subvol (frame, this); - return 0; + return 0; } - -int -afr_sh_entry_impunge_setattr_cbk (call_frame_t *impunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop) +void +afr_sh_entry_call_impunge_done (call_frame_t *impunge_frame, xlator_t *this, + int32_t op_ret, int32_t op_errno) { - int call_count = 0; - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_self_heal_t *impunge_sh = NULL; - call_frame_t *frame = NULL; - int active_src = 0; - int child_index = 0; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - frame = impunge_sh->sh_frame; - local = frame->local; - sh = &local->self_heal; - active_src = sh->active_source; - child_index = (long) cookie; - - if (op_ret == 0) { - gf_log (this->name, GF_LOG_TRACE, - "setattr done for %s on %s", - impunge_local->loc.path, - priv->children[child_index]->name); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "setattr (%s) on %s failed (%s)", - impunge_local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - } - - LOCK (&impunge_frame->lock); - { - call_count = --impunge_local->call_count; - } - UNLOCK (&impunge_frame->lock); - - if (call_count == 0) { - AFR_STACK_DESTROY (impunge_frame); - afr_sh_entry_impunge_entry_done (frame, this, active_src); - } - - return 0; -} + afr_local_t *impunge_local = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_self_heal_t *impunge_sh = NULL; + call_frame_t *frame = NULL; + AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh, + frame, local, sh); + + AFR_STACK_DESTROY (impunge_frame); + sh->impunge_done (frame, this, op_ret, op_errno); +} int -afr_sh_entry_impunge_xattrop_cbk (call_frame_t *impunge_frame, void *cookie, +afr_sh_entry_impunge_setattr_cbk (call_frame_t *impunge_frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *xattr) + struct iatt *preop, struct iatt *postop, + dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int child_index = 0; - - struct iatt stbuf; - int32_t valid = 0; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - - child_index = (long) cookie; - - gf_log (this->name, GF_LOG_TRACE, - "setting ownership of %s on %s to %d/%d", - impunge_local->loc.path, - priv->children[child_index]->name, - impunge_local->cont.lookup.buf.ia_uid, - impunge_local->cont.lookup.buf.ia_gid); - - stbuf.ia_atime = impunge_local->cont.lookup.buf.ia_atime; - stbuf.ia_atime_nsec = impunge_local->cont.lookup.buf.ia_atime_nsec; - stbuf.ia_mtime = impunge_local->cont.lookup.buf.ia_mtime; - stbuf.ia_mtime_nsec = impunge_local->cont.lookup.buf.ia_mtime_nsec; - - stbuf.ia_uid = impunge_local->cont.lookup.buf.ia_uid; - stbuf.ia_gid = impunge_local->cont.lookup.buf.ia_gid; - - valid = GF_SET_ATTR_UID | GF_SET_ATTR_GID | - GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; - - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_setattr_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->setattr, - &impunge_local->loc, - &stbuf, valid); + int call_count = 0; + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + int child_index = 0; + + priv = this->private; + impunge_local = impunge_frame->local; + child_index = (long) cookie; + + if (op_ret == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "setattr done for %s on %s", + impunge_local->loc.path, + priv->children[child_index]->name); + } else { + gf_log (this->name, GF_LOG_INFO, + "setattr (%s) on %s failed (%s)", + impunge_local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + } + + call_count = afr_frame_return (impunge_frame); + if (call_count == 0) { + afr_sh_entry_call_impunge_done (impunge_frame, this, + 0, op_errno); + } + return 0; } - int afr_sh_entry_impunge_parent_setattr_cbk (call_frame_t *setattr_frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop) + struct iatt *preop, struct iatt *postop, + dict_t *xdata) { - loc_t *parent_loc = cookie; + int call_count = 0; + afr_local_t *setattr_local = NULL; + setattr_local = setattr_frame->local; if (op_ret != 0) { - gf_log (this->name, GF_LOG_DEBUG, - "setattr on parent directory failed: %s", - strerror (op_errno)); + gf_log (this->name, GF_LOG_INFO, + "setattr on parent directory (%s) failed: %s", + setattr_local->loc.path, strerror (op_errno)); } - loc_wipe (parent_loc); + call_count = afr_frame_return (setattr_frame); + if (call_count == 0) + AFR_STACK_DESTROY (setattr_frame); + return 0; +} + +int +afr_sh_entry_impunge_setattr (call_frame_t *impunge_frame, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_local_t *setattr_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + call_frame_t *setattr_frame = NULL; + int32_t valid = 0; + int32_t op_errno = 0; + int child_index = 0; + int call_count = 0; + int i = 0; + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; - GF_FREE (parent_loc); + gf_log (this->name, GF_LOG_DEBUG, + "setting ownership of %s on %s to %d/%d", + impunge_local->loc.path, + priv->children[child_index]->name, + impunge_sh->entrybuf.ia_uid, + impunge_sh->entrybuf.ia_gid); - AFR_STACK_DESTROY (setattr_frame); + setattr_frame = copy_frame (impunge_frame); + if (!setattr_frame) { + op_errno = ENOMEM; + goto out; + } + AFR_LOCAL_ALLOC_OR_GOTO (setattr_frame->local, out); + setattr_local = setattr_frame->local; + call_count = afr_errno_count (NULL, impunge_sh->child_errno, + priv->child_count, 0); + loc_copy (&setattr_local->loc, &impunge_sh->parent_loc); + impunge_local->call_count = call_count; + setattr_local->call_count = call_count; + for (i = 0; i < priv->child_count; i++) { + if (impunge_sh->child_errno[i]) + continue; + valid = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; + STACK_WIND_COOKIE (setattr_frame, + afr_sh_entry_impunge_parent_setattr_cbk, + (void *) (long) i, priv->children[i], + priv->children[i]->fops->setattr, + &setattr_local->loc, + &impunge_sh->parentbuf, valid, NULL); + + valid = GF_SET_ATTR_UID | GF_SET_ATTR_GID | + GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; + STACK_WIND_COOKIE (impunge_frame, + afr_sh_entry_impunge_setattr_cbk, + (void *) (long) i, priv->children[i], + priv->children[i]->fops->setattr, + &impunge_local->loc, + &impunge_sh->entrybuf, valid, NULL); + call_count--; + } + GF_ASSERT (!call_count); + return 0; +out: + if (setattr_frame) + AFR_STACK_DESTROY (setattr_frame); + afr_sh_entry_call_impunge_done (impunge_frame, this, 0, op_errno); return 0; } +int +afr_sh_entry_impunge_xattrop_cbk (call_frame_t *impunge_frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + dict_t *xattr, dict_t *xdata) +{ + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + int child_index = 0; + + priv = this->private; + impunge_local = impunge_frame->local; + + child_index = (long) cookie; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_INFO, + "%s: failed to perform xattrop on %s (%s)", + impunge_local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + goto out; + } + + afr_sh_entry_impunge_setattr (impunge_frame, this); + return 0; +out: + afr_sh_entry_call_impunge_done (impunge_frame, this, + -1, op_errno); + return 0; +} + +int +afr_sh_entry_impunge_perform_xattrop (call_frame_t *impunge_frame, + xlator_t *this) +{ + int active_src = 0; + dict_t *xattr = NULL; + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + int32_t op_errno = 0; + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + active_src = impunge_sh->active_source; + + afr_prepare_new_entry_pending_matrix (impunge_local->pending, + afr_is_errno_unset, + impunge_sh->child_errno, + &impunge_sh->entrybuf, + priv->child_count); + xattr = dict_new (); + if (!xattr) { + op_errno = ENOMEM; + goto out; + } + + afr_set_pending_dict (priv, xattr, impunge_local->pending, active_src, + LOCAL_LAST); + + STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_xattrop_cbk, + (void *) (long) active_src, + priv->children[active_src], + priv->children[active_src]->fops->xattrop, + &impunge_local->loc, GF_XATTROP_ADD_ARRAY, xattr, NULL); + + if (xattr) + dict_unref (xattr); + return 0; +out: + afr_sh_entry_call_impunge_done (impunge_frame, this, + -1, op_errno); + return 0; +} int afr_sh_entry_impunge_newfile_cbk (call_frame_t *impunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, + xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *stbuf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { - int call_count = 0; - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - call_frame_t *frame = NULL; - int active_src = 0; - int child_index = 0; - int pending_array[3] = {0, }; - dict_t *xattr = NULL; - int ret = 0; - int idx = 0; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; + int call_count = 0; + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + int child_index = 0; - call_frame_t *setattr_frame = NULL; - int32_t valid = 0; - loc_t *parent_loc = NULL; - struct iatt parentbuf; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - frame = impunge_sh->sh_frame; - local = frame->local; - sh = &local->self_heal; - active_src = sh->active_source; + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; - child_index = (long) cookie; + child_index = (long) cookie; - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "creation of %s on %s failed (%s)", - impunge_local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - goto out; - } + if (op_ret == -1) { + impunge_sh->child_errno[child_index] = op_errno; + gf_log (this->name, GF_LOG_ERROR, + "creation of %s on %s failed (%s)", + impunge_local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + } else { + impunge_sh->child_errno[child_index] = 0; + } - inode->ia_type = stbuf->ia_type; + call_count = afr_frame_return (impunge_frame); + if (call_count == 0) { + if (!afr_errno_count (NULL, impunge_sh->child_errno, + priv->child_count, 0)) { + // new_file creation failed every where + afr_sh_entry_call_impunge_done (impunge_frame, this, + -1, op_errno); + goto out; + } + afr_sh_entry_impunge_perform_xattrop (impunge_frame, this); + } +out: + return 0; +} - xattr = get_new_dict (); - dict_ref (xattr); +int +afr_sh_entry_impunge_hardlink_cbk (call_frame_t *impunge_frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + int call_count = 0; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + + if (IA_IFLNK == impunge_sh->entrybuf.ia_type) { + //For symlinks impunge is attempted un-conditionally + //So the file can already exist. + if ((op_ret < 0) && (op_errno == EEXIST)) + op_ret = 0; + } - idx = afr_index_for_transaction_type (AFR_METADATA_TRANSACTION); - pending_array[idx] = hton32 (1); - if (IA_ISDIR (stbuf->ia_type)) - idx = afr_index_for_transaction_type (AFR_ENTRY_TRANSACTION); - else - idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION); - pending_array[idx] = hton32 (1); + call_count = afr_frame_return (impunge_frame); + if (call_count == 0) + afr_sh_entry_call_impunge_done (impunge_frame, this, + op_ret, op_errno); - ret = dict_set_static_bin (xattr, priv->pending_key[child_index], - pending_array, sizeof (pending_array)); - if (ret < 0) - gf_log (this->name, GF_LOG_WARNING, - "Unable to set dict value."); + return 0; +} - valid = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; - parentbuf = impunge_sh->parentbuf; - setattr_frame = copy_frame (impunge_frame); +int +afr_sh_entry_impunge_hardlink (call_frame_t *impunge_frame, xlator_t *this, + int child_index) +{ + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + loc_t *loc = NULL; + struct iatt *buf = NULL; + loc_t oldloc = {0}; - parent_loc = GF_CALLOC (1, sizeof (*parent_loc), - gf_afr_mt_loc_t); - afr_build_parent_loc (parent_loc, &impunge_local->loc); + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + loc = &impunge_local->loc; + buf = &impunge_sh->entrybuf; - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_xattrop_cbk, - (void *) (long) child_index, - priv->children[active_src], - priv->children[active_src]->fops->xattrop, - &impunge_local->loc, GF_XATTROP_ADD_ARRAY, xattr); + oldloc.inode = inode_ref (loc->inode); + uuid_copy (oldloc.gfid, buf->ia_gfid); + gf_log (this->name, GF_LOG_DEBUG, "linking missing file %s on %s", + loc->path, priv->children[child_index]->name); - STACK_WIND_COOKIE (setattr_frame, afr_sh_entry_impunge_parent_setattr_cbk, - (void *) (long) parent_loc, + STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_hardlink_cbk, + (void *) (long) child_index, priv->children[child_index], - priv->children[child_index]->fops->setattr, - parent_loc, &parentbuf, valid); + priv->children[child_index]->fops->link, + &oldloc, loc, NULL); + loc_wipe (&oldloc); + + return 0; +} - dict_unref (xattr); +int +afr_sh_nameless_lookup_cbk (call_frame_t *impunge_frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xattr, + struct iatt *postparent) +{ + if (op_ret < 0) { + afr_sh_entry_impunge_create_file (impunge_frame, this, + (long)cookie); + } else { + afr_sh_entry_impunge_hardlink (impunge_frame, this, + (long)cookie); + } + return 0; +} - return 0; +int +afr_sh_entry_impunge_check_hardlink (call_frame_t *impunge_frame, + xlator_t *this, + int child_index, struct iatt *stbuf) +{ + afr_private_t *priv = NULL; + call_frame_t *frame = NULL; + afr_local_t *impunge_local = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *impunge_sh = NULL; + afr_self_heal_t *sh = NULL; + loc_t *loc = NULL; + dict_t *xattr_req = NULL; + loc_t oldloc = {0}; + int ret = -1; + + priv = this->private; + AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh, + frame, local, sh); + loc = &impunge_local->loc; + + xattr_req = dict_new (); + if (!xattr_req) + goto out; + oldloc.inode = inode_ref (loc->inode); + uuid_copy (oldloc.gfid, stbuf->ia_gfid); + STACK_WIND_COOKIE (impunge_frame, afr_sh_nameless_lookup_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->lookup, + &oldloc, xattr_req); + ret = 0; out: - LOCK (&impunge_frame->lock); - { - call_count = --impunge_local->call_count; - } - UNLOCK (&impunge_frame->lock); - - if (call_count == 0) { - AFR_STACK_DESTROY (impunge_frame); - afr_sh_entry_impunge_entry_done (frame, this, active_src); - } - - return 0; + if (xattr_req) + dict_unref (xattr_req); + loc_wipe (&oldloc); + if (ret) + sh->impunge_done (frame, this, -1, ENOMEM); + return 0; } - int afr_sh_entry_impunge_mknod (call_frame_t *impunge_frame, xlator_t *this, - int child_index, struct iatt *stbuf) + int child_index, struct iatt *stbuf) { - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - dict_t *dict = NULL; - - int ret = 0; + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + dict_t *dict = NULL; + int ret = 0; - priv = this->private; - impunge_local = impunge_frame->local; + priv = this->private; + impunge_local = impunge_frame->local; - gf_log (this->name, GF_LOG_DEBUG, - "creating missing file %s on %s", - impunge_local->loc.path, - priv->children[child_index]->name); + gf_log (this->name, GF_LOG_DEBUG, + "creating missing file %s on %s", + impunge_local->loc.path, + priv->children[child_index]->name); dict = dict_new (); if (!dict) gf_log (this->name, GF_LOG_ERROR, "Out of memory"); + GF_ASSERT (!uuid_is_null (stbuf->ia_gfid)); ret = afr_set_dict_gfid (dict, stbuf->ia_gfid); if (ret) - gf_log (this->name, GF_LOG_DEBUG, "gfid set failed"); + gf_log (this->name, GF_LOG_INFO, "%s: gfid set failed", + impunge_local->loc.path); + + /* + * Reason for adding GLUSTERFS_INTERNAL_FOP_KEY : + * + * Problem: + * While a brick is down in a replica pair, lets say the user creates + * one file(file-A) and a hard link to that file(h-file-A). After the + * brick comes back up, entry self-heal is attempted on parent dir of + * these two files. As part of readdir in self-heal it reads both the + * entries file-A and h-file-A for both of them it does name less lookup + * to check if there are any hardlinks already present in the + * destination brick. It finds that there are no hard links already + * present for files file-A, h-file-A. Self-heal does mknods for both + * file-A and h-file-A. This leads to file-A and h-file-A not being + * hardlinks anymore. + * + * Fix: (More like shrinking of race-window, the race itself is still + * present in posix-mknod). + * If mknod comes with the presence of GLUSTERFS_INTERNAL_FOP_KEY then + * posix_mknod checks if there are already any gfid-links and does + * link() instead of mknod. There still can be a race where two + * posix_mknods same gfid see that + * gfid-link file is not present and proceeds with mknods and result in + * two different files with same gfid. + */ + ret = dict_set_str (dict, GLUSTERFS_INTERNAL_FOP_KEY, "yes"); + if (ret) + gf_log (this->name, GF_LOG_INFO, "%s: %s set failed", + impunge_local->loc.path, GLUSTERFS_INTERNAL_FOP_KEY); - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->mknod, - &impunge_local->loc, - st_mode_from_ia (stbuf->ia_prot, stbuf->ia_type), - stbuf->ia_rdev, dict); + STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->mknod, + &impunge_local->loc, + st_mode_from_ia (stbuf->ia_prot, stbuf->ia_type), + makedev (ia_major (stbuf->ia_rdev), + ia_minor (stbuf->ia_rdev)), 0, dict); if (dict) dict_unref (dict); - return 0; + return 0; } int afr_sh_entry_impunge_mkdir (call_frame_t *impunge_frame, xlator_t *this, - int child_index, struct iatt *stbuf) + int child_index, struct iatt *stbuf) { - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; dict_t *dict = NULL; int ret = 0; - priv = this->private; - impunge_local = impunge_frame->local; + priv = this->private; + impunge_local = impunge_frame->local; dict = dict_new (); if (!dict) { @@ -1162,73 +1329,76 @@ afr_sh_entry_impunge_mkdir (call_frame_t *impunge_frame, xlator_t *this, return 0; } + GF_ASSERT (!uuid_is_null (stbuf->ia_gfid)); ret = afr_set_dict_gfid (dict, stbuf->ia_gfid); if (ret) - gf_log (this->name, GF_LOG_DEBUG, "gfid set failed"); - - gf_log (this->name, GF_LOG_DEBUG, - "creating missing directory %s on %s", - impunge_local->loc.path, - priv->children[child_index]->name); - - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->mkdir, - &impunge_local->loc, + gf_log (this->name, GF_LOG_INFO, "%s: gfid set failed", + impunge_local->loc.path); + + gf_log (this->name, GF_LOG_DEBUG, + "creating missing directory %s on %s", + impunge_local->loc.path, + priv->children[child_index]->name); + + STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->mkdir, + &impunge_local->loc, st_mode_from_ia (stbuf->ia_prot, stbuf->ia_type), - dict); + 0, dict); if (dict) dict_unref (dict); - return 0; + return 0; } int afr_sh_entry_impunge_symlink (call_frame_t *impunge_frame, xlator_t *this, - int child_index, const char *linkname) + int child_index, const char *linkname) { - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; dict_t *dict = NULL; struct iatt *buf = NULL; + int ret = 0; - int ret = 0; - - priv = this->private; - impunge_local = impunge_frame->local; + priv = this->private; + impunge_local = impunge_frame->local; - buf = &impunge_local->cont.symlink.buf; + buf = &impunge_local->cont.dir_fop.buf; dict = dict_new (); if (!dict) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - afr_sh_entry_impunge_entry_done (impunge_frame, this, 0); + afr_sh_entry_call_impunge_done (impunge_frame, this, + -1, ENOMEM); + goto out; } + GF_ASSERT (!uuid_is_null (buf->ia_gfid)); ret = afr_set_dict_gfid (dict, buf->ia_gfid); if (ret) - gf_log (this->name, GF_LOG_DEBUG, - "dict set gfid failed"); + gf_log (this->name, GF_LOG_INFO, + "%s: dict set gfid failed", + impunge_local->loc.path); - gf_log (this->name, GF_LOG_DEBUG, - "creating missing symlink %s -> %s on %s", - impunge_local->loc.path, linkname, - priv->children[child_index]->name); + gf_log (this->name, GF_LOG_DEBUG, + "creating missing symlink %s -> %s on %s", + impunge_local->loc.path, linkname, + priv->children[child_index]->name); - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->symlink, - linkname, &impunge_local->loc, dict); + STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->symlink, + linkname, &impunge_local->loc, 0, dict); if (dict) dict_unref (dict); - - return 0; +out: + return 0; } @@ -1237,50 +1407,45 @@ afr_sh_entry_impunge_symlink_unlink_cbk (call_frame_t *impunge_frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int child_index = -1; - call_frame_t *frame = NULL; - int call_count = -1; - int active_src = -1; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - frame = impunge_sh->sh_frame; - active_src = impunge_sh->active_source; - - child_index = (long) cookie; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "unlink of %s on %s failed (%s)", - impunge_local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - goto out; - } + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + int child_index = -1; + int call_count = -1; + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + + child_index = (long) cookie; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_INFO, + "unlink of %s on %s failed (%s)", + impunge_local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + goto out; + } afr_sh_entry_impunge_symlink (impunge_frame, this, child_index, impunge_sh->linkname); return 0; out: - LOCK (&impunge_frame->lock); - { - call_count = --impunge_local->call_count; - } - UNLOCK (&impunge_frame->lock); - - if (call_count == 0) { - AFR_STACK_DESTROY (impunge_frame); - afr_sh_entry_impunge_entry_done (frame, this, active_src); - } - - return 0; + LOCK (&impunge_frame->lock); + { + call_count = --impunge_local->call_count; + } + UNLOCK (&impunge_frame->lock); + + if (call_count == 0) + afr_sh_entry_call_impunge_done (impunge_frame, this, + op_ret, op_errno); + + return 0; } @@ -1288,22 +1453,22 @@ int afr_sh_entry_impunge_symlink_unlink (call_frame_t *impunge_frame, xlator_t *this, int child_index) { - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; - priv = this->private; - impunge_local = impunge_frame->local; + priv = this->private; + impunge_local = impunge_frame->local; - gf_log (this->name, GF_LOG_DEBUG, - "unlinking symlink %s with wrong target on %s", - impunge_local->loc.path, - priv->children[child_index]->name); + gf_log (this->name, GF_LOG_DEBUG, + "unlinking symlink %s with wrong target on %s", + impunge_local->loc.path, + priv->children[child_index]->name); STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_symlink_unlink_cbk, (void *) (long) child_index, priv->children[child_index], priv->children[child_index]->fops->unlink, - &impunge_local->loc); + &impunge_local->loc, 0, NULL); return 0; } @@ -1313,32 +1478,30 @@ int afr_sh_entry_impunge_readlink_sink_cbk (call_frame_t *impunge_frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - const char *linkname, struct iatt *sbuf) + const char *linkname, struct iatt *sbuf, dict_t *xdata) { afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int child_index = -1; - call_frame_t *frame = NULL; - int call_count = -1; - int active_src = -1; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - frame = impunge_sh->sh_frame; - active_src = impunge_sh->active_source; - - child_index = (long) cookie; - - if ((op_ret == -1) && (op_errno != ENOENT)) { - gf_log (this->name, GF_LOG_DEBUG, - "readlink of %s on %s failed (%s)", - impunge_local->loc.path, - priv->children[active_src]->name, - strerror (op_errno)); - goto out; - } + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + int child_index = -1; + int call_count = -1; + int active_src = -1; + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + active_src = impunge_sh->active_source; + + child_index = (long) cookie; + + if ((op_ret == -1) && (op_errno != ENOENT)) { + gf_log (this->name, GF_LOG_INFO, + "readlink of %s on %s failed (%s)", + impunge_local->loc.path, + priv->children[active_src]->name, + strerror (op_errno)); + goto out; + } /* symlink doesn't exist on the sink */ @@ -1365,18 +1528,17 @@ afr_sh_entry_impunge_readlink_sink_cbk (call_frame_t *impunge_frame, void *cooki } out: - LOCK (&impunge_frame->lock); - { - call_count = --impunge_local->call_count; - } - UNLOCK (&impunge_frame->lock); - - if (call_count == 0) { - AFR_STACK_DESTROY (impunge_frame); - afr_sh_entry_impunge_entry_done (frame, this, active_src); - } - - return 0; + LOCK (&impunge_frame->lock); + { + call_count = --impunge_local->call_count; + } + UNLOCK (&impunge_frame->lock); + + if (call_count == 0) + afr_sh_entry_call_impunge_done (impunge_frame, this, + op_ret, op_errno); + + return 0; } @@ -1384,821 +1546,770 @@ int afr_sh_entry_impunge_readlink_sink (call_frame_t *impunge_frame, xlator_t *this, int child_index) { - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; - priv = this->private; - impunge_local = impunge_frame->local; + priv = this->private; + impunge_local = impunge_frame->local; - gf_log (this->name, GF_LOG_DEBUG, - "checking symlink target of %s on %s", - impunge_local->loc.path, priv->children[child_index]->name); + gf_log (this->name, GF_LOG_DEBUG, + "checking symlink target of %s on %s", + impunge_local->loc.path, priv->children[child_index]->name); - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_readlink_sink_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->readlink, - &impunge_local->loc, 4096); + STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_readlink_sink_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->readlink, + &impunge_local->loc, 4096, NULL); - return 0; + return 0; } int afr_sh_entry_impunge_readlink_cbk (call_frame_t *impunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - const char *linkname, struct iatt *sbuf) + xlator_t *this, + int32_t op_ret, int32_t op_errno, + const char *linkname, struct iatt *sbuf, dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int child_index = -1; - call_frame_t *frame = NULL; - int call_count = -1; - int active_src = -1; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - frame = impunge_sh->sh_frame; - active_src = impunge_sh->active_source; - - child_index = (long) cookie; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "readlink of %s on %s failed (%s)", - impunge_local->loc.path, - priv->children[active_src]->name, - strerror (op_errno)); - goto out; - } + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + int child_index = -1; + int call_count = -1; + int active_src = -1; + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + active_src = impunge_sh->active_source; + + child_index = (long) cookie; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_INFO, + "readlink of %s on %s failed (%s)", + impunge_local->loc.path, + priv->children[active_src]->name, + strerror (op_errno)); + goto out; + } impunge_sh->linkname = gf_strdup (linkname); - afr_sh_entry_impunge_readlink_sink (impunge_frame, this, child_index); + afr_sh_entry_impunge_readlink_sink (impunge_frame, this, child_index); - return 0; + return 0; out: - LOCK (&impunge_frame->lock); - { - call_count = --impunge_local->call_count; - } - UNLOCK (&impunge_frame->lock); - - if (call_count == 0) { - AFR_STACK_DESTROY (impunge_frame); - afr_sh_entry_impunge_entry_done (frame, this, active_src); - } - - return 0; + LOCK (&impunge_frame->lock); + { + call_count = --impunge_local->call_count; + } + UNLOCK (&impunge_frame->lock); + + if (call_count == 0) + afr_sh_entry_call_impunge_done (impunge_frame, this, + op_ret, op_errno); + + return 0; } int afr_sh_entry_impunge_readlink (call_frame_t *impunge_frame, xlator_t *this, - int child_index, struct iatt *stbuf) + int child_index, struct iatt *stbuf) { - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int active_src = -1; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - active_src = impunge_sh->active_source; - impunge_local->cont.symlink.buf = *stbuf; - - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_readlink_cbk, - (void *) (long) child_index, - priv->children[active_src], - priv->children[active_src]->fops->readlink, - &impunge_local->loc, 4096); - - return 0; -} + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + int active_src = -1; + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + active_src = impunge_sh->active_source; + impunge_local->cont.dir_fop.buf = *stbuf; + + STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_readlink_cbk, + (void *) (long) child_index, + priv->children[active_src], + priv->children[active_src]->fops->readlink, + &impunge_local->loc, 4096, NULL); + return 0; +} int -afr_sh_entry_impunge_recreate_lookup_cbk (call_frame_t *impunge_frame, - void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, - dict_t *xattr,struct iatt *postparent) +afr_sh_entry_impunge_create (call_frame_t *impunge_frame, xlator_t *this, + int child_index) { - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int active_src = 0; - int type = 0; - int child_index = 0; - call_frame_t *frame = NULL; - int call_count = 0; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - frame = impunge_sh->sh_frame; - - child_index = (long) cookie; - - active_src = impunge_sh->active_source; - - if (op_ret != 0) { - gf_log (this->name, GF_LOG_TRACE, - "looking up %s on %s (for %s) failed (%s)", - impunge_local->loc.path, - priv->children[active_src]->name, - priv->children[child_index]->name, - strerror (op_errno)); - goto out; - } - - impunge_sh->parentbuf = *postparent; - - impunge_local->cont.lookup.buf = *buf; - type = buf->ia_type; - - switch (type) { - case IA_IFSOCK: - case IA_IFREG: - case IA_IFBLK: - case IA_IFCHR: - case IA_IFIFO: - afr_sh_entry_impunge_mknod (impunge_frame, this, - child_index, buf); - break; - case IA_IFLNK: - afr_sh_entry_impunge_readlink (impunge_frame, this, - child_index, buf); - break; - case IA_IFDIR: - afr_sh_entry_impunge_mkdir (impunge_frame, this, - child_index, buf); - break; - default: - gf_log (this->name, GF_LOG_ERROR, - "%s has unknown file type on %s: 0%o", - impunge_local->loc.path, - priv->children[active_src]->name, type); - goto out; - break; - } - - return 0; + call_frame_t *frame = NULL; + afr_local_t *impunge_local = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *impunge_sh = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + ia_type_t type = IA_INVAL; + int active_src = 0; + struct iatt *buf = NULL; + + AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh, + frame, local, sh); + active_src = impunge_sh->active_source; + afr_update_loc_gfids (&impunge_local->loc, &impunge_sh->entrybuf, + &impunge_sh->parentbuf); + + buf = &impunge_sh->entrybuf; + type = buf->ia_type; + + switch (type) { + case IA_IFSOCK: + case IA_IFREG: + case IA_IFBLK: + case IA_IFCHR: + case IA_IFIFO: + case IA_IFLNK: + afr_sh_entry_impunge_check_hardlink (impunge_frame, this, + child_index, buf); + break; + case IA_IFDIR: + afr_sh_entry_impunge_mkdir (impunge_frame, this, + child_index, buf); + break; + default: + gf_log (this->name, GF_LOG_ERROR, + "%s has unknown file type on %s: 0%o", + impunge_local->loc.path, + priv->children[active_src]->name, type); + sh->impunge_done (frame, this, -1, EINVAL); + break; + } -out: - LOCK (&impunge_frame->lock); - { - call_count = --impunge_local->call_count; - } - UNLOCK (&impunge_frame->lock); - - if (call_count == 0) { - AFR_STACK_DESTROY (impunge_frame); - afr_sh_entry_impunge_entry_done (frame, this, active_src); - } - - return 0; + return 0; } - int -afr_sh_entry_impunge_recreate (call_frame_t *impunge_frame, xlator_t *this, - int child_index) +afr_sh_entry_impunge_create_file (call_frame_t *impunge_frame, xlator_t *this, + int child_index) { - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int active_src = 0; + call_frame_t *frame = NULL; + afr_local_t *impunge_local = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *impunge_sh = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + ia_type_t type = IA_INVAL; + int active_src = 0; + struct iatt *buf = NULL; + + AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh, + frame, local, sh); + active_src = impunge_sh->active_source; + buf = &impunge_sh->entrybuf; + type = buf->ia_type; + + switch (type) { + case IA_IFSOCK: + case IA_IFREG: + case IA_IFBLK: + case IA_IFCHR: + case IA_IFIFO: + afr_sh_entry_impunge_mknod (impunge_frame, this, + child_index, buf); + break; + case IA_IFLNK: + afr_sh_entry_impunge_readlink (impunge_frame, this, + child_index, buf); + break; + default: + gf_log (this->name, GF_LOG_ERROR, + "%s has unknown file type on %s: 0%o", + impunge_local->loc.path, + priv->children[active_src]->name, type); + sh->impunge_done (frame, this, -1, EINVAL); + break; + } + + return 0; +} +gf_boolean_t +afr_sh_need_recreate (afr_self_heal_t *impunge_sh, unsigned int child, + unsigned int child_count) +{ + gf_boolean_t recreate = _gf_false; - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; + GF_ASSERT (impunge_sh->child_errno); - active_src = impunge_sh->active_source; + if (child == impunge_sh->active_source) + goto out; - STACK_WIND_COOKIE (impunge_frame, - afr_sh_entry_impunge_recreate_lookup_cbk, - (void *) (long) child_index, - priv->children[active_src], - priv->children[active_src]->fops->lookup, - &impunge_local->loc, 0); + if (IA_IFLNK == impunge_sh->entrybuf.ia_type) { + recreate = _gf_true; + goto out; + } - return 0; + if (impunge_sh->child_errno[child] == ENOENT) + recreate = _gf_true; +out: + return recreate; } +unsigned int +afr_sh_recreate_count (afr_self_heal_t *impunge_sh, int *sources, + unsigned int child_count) +{ + int count = 0; + int i = 0; + + for (i = 0; i < child_count; i++) { + if (afr_sh_need_recreate (impunge_sh, i, child_count)) + count++; + } + + return count; +} int -afr_sh_entry_impunge_entry_cbk (call_frame_t *impunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, dict_t *x, - struct iatt *postparent) +afr_sh_entry_call_impunge_recreate (call_frame_t *impunge_frame, + xlator_t *this) { - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int call_count = 0; - int child_index = 0; - call_frame_t *frame = NULL; - int active_src = 0; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - frame = impunge_sh->sh_frame; - child_index = (long) cookie; - active_src = impunge_sh->active_source; - - if ((op_ret == -1 && op_errno == ENOENT) - || (IA_ISLNK (impunge_sh->impunging_entry_mode))) { - - /* - * A symlink's target might have changed, so - * always go down the recreate path for them. - */ + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + call_frame_t *frame = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + unsigned int recreate_count = 0; + int i = 0; + int active_src = 0; - /* decrease call_count in recreate-callback */ - - gf_log (this->name, GF_LOG_TRACE, - "missing entry %s on %s", - impunge_local->loc.path, - priv->children[child_index]->name); - - afr_sh_entry_impunge_recreate (impunge_frame, this, - child_index); - return 0; - } - - if (op_ret == 0) { - gf_log (this->name, GF_LOG_TRACE, - "%s exists under %s", - impunge_local->loc.path, - priv->children[child_index]->name); - - impunge_sh->parentbuf = *postparent; - } else { - gf_log (this->name, GF_LOG_TRACE, - "looking up %s under %s failed (%s)", - impunge_local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - } - - LOCK (&impunge_frame->lock); - { - call_count = --impunge_local->call_count; - } - UNLOCK (&impunge_frame->lock); - - if (call_count == 0) { - AFR_STACK_DESTROY (impunge_frame); - afr_sh_entry_impunge_entry_done (frame, this, active_src); - } - - return 0; + priv = this->private; + AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh, + frame, local, sh); + active_src = impunge_sh->active_source; + impunge_sh->entrybuf = impunge_sh->buf[active_src]; + impunge_sh->parentbuf = impunge_sh->parentbufs[active_src]; + recreate_count = afr_sh_recreate_count (impunge_sh, sh->sources, + priv->child_count); + if (!recreate_count) { + afr_sh_entry_call_impunge_done (impunge_frame, this, 0, 0); + goto out; + } + impunge_local->call_count = recreate_count; + for (i = 0; i < priv->child_count; i++) { + if (!impunge_local->child_up[i]) { + impunge_sh->child_errno[i] = ENOTCONN; + continue; + } + if (!afr_sh_need_recreate (impunge_sh, i, priv->child_count)) { + impunge_sh->child_errno[i] = EEXIST; + continue; + } + } + for (i = 0; i < priv->child_count; i++) { + if (!afr_sh_need_recreate (impunge_sh, i, priv->child_count)) + continue; + (void)afr_sh_entry_impunge_create (impunge_frame, this, i); + recreate_count--; + } + GF_ASSERT (!recreate_count); +out: + return 0; } +void +afr_sh_entry_common_lookup_done (call_frame_t *impunge_frame, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + call_frame_t *frame = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + unsigned int gfid_miss_count = 0; + unsigned int children_up_count = 0; + uuid_t gfid = {0}; + int active_src = 0; + + priv = this->private; + AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh, + frame, local, sh); + active_src = impunge_sh->active_source; + + if (op_ret < 0) + goto done; + if (impunge_sh->child_errno[active_src]) { + op_ret = -1; + op_errno = impunge_sh->child_errno[active_src]; + goto done; + } + < |
