diff options
Diffstat (limited to 'xlators')
544 files changed, 188878 insertions, 122271 deletions
diff --git a/xlators/Makefile.am b/xlators/Makefile.am index 4c94f5e44..f60fa85ce 100644 --- a/xlators/Makefile.am +++ b/xlators/Makefile.am @@ -1,3 +1,4 @@ -SUBDIRS = cluster storage protocol performance debug features encryption mount nfs mgmt +SUBDIRS = cluster storage protocol performance debug features encryption mount nfs mgmt system \ + playground CLEANFILES = diff --git a/xlators/bindings/Makefile.am b/xlators/bindings/Makefile.am deleted file mode 100644 index f77665802..000000000 --- a/xlators/bindings/Makefile.am +++ /dev/null @@ -1 +0,0 @@ -SUBDIRS = $(BINDINGS_SUBDIRS) diff --git a/xlators/bindings/python/src/Makefile.am b/xlators/bindings/python/src/Makefile.am deleted file mode 100644 index c0b9141c6..000000000 --- a/xlators/bindings/python/src/Makefile.am +++ /dev/null @@ -1,19 +0,0 @@ - -xlator_PROGRAMS = python.so - -xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/bindings - -python_PYTHON = gluster.py glustertypes.py glusterstack.py - -pythondir = $(xlatordir)/python - -python_so_SOURCES = python.c - -AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall \ - -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles \ - $(PYTHON_CPPLAGS) -DGLUSTER_PYTHON_PATH=\"$(pythondir)\" - -AM_LDFLAGS = $(PYTHON_LDFLAGS) - -CLEANFILES = - diff --git a/xlators/bindings/python/src/gluster.py b/xlators/bindings/python/src/gluster.py deleted file mode 100644 index ee0eb1310..000000000 --- a/xlators/bindings/python/src/gluster.py +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright (c) 2007 Chris AtLee <chris@atlee.ca> -# This file is part of GlusterFS. -# -# GlusterFS is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published -# by the Free Software Foundation; either version 3 of the License, -# or (at your option) any later version. -# -# GlusterFS is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see -# <http://www.gnu.org/licenses/>. -from ctypes import * -from glustertypes import * -from glusterstack import * -import sys -import inspect - -libglusterfs = CDLL("libglusterfs.so") -_gf_log = libglusterfs._gf_log -_gf_log.restype = c_int32 -_gf_log.argtypes = [c_char_p, c_char_p, c_char_p, c_int32, c_int, c_char_p] - -gf_log_loglevel = c_int.in_dll(libglusterfs, "gf_log_loglevel") - -GF_LOG_NONE = 0 -GF_LOG_CRITICAL = 1 -GF_LOG_ERROR = 2 -GF_LOG_WARNING = 3 -GF_LOG_DEBUG = 4 - -def gf_log(module, level, fmt, *params): - if level <= gf_log_loglevel: - frame = sys._getframe(1) - _gf_log(module, frame.f_code.co_filename, frame.f_code.co_name, - frame.f_lineno, level, fmt, *params) - -class ComplexTranslator(object): - def __init__(self, xlator): - self.xlator = xlator_t.from_address(xlator) - - def __getattr__(self, item): - return getattr(self.xlator, item) diff --git a/xlators/bindings/python/src/glusterstack.py b/xlators/bindings/python/src/glusterstack.py deleted file mode 100644 index ba24c8165..000000000 --- a/xlators/bindings/python/src/glusterstack.py +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright (c) 2007 Chris AtLee <chris@atlee.ca> -# This file is part of GlusterFS. -# -# GlusterFS is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published -# by the Free Software Foundation; either version 3 of the License, -# or (at your option) any later version. -# -# GlusterFS is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see -# <http://www.gnu.org/licenses/>. -from ctypes import * -from glustertypes import * - -libc = CDLL("libc.so.6") -calloc = libc.calloc -calloc.argtypes = [c_int, c_int] -calloc.restype = c_void_p - -# TODO: Can these be done in C somehow? -def stack_wind(frame, rfn, obj, fn, *params): - """Frame is a frame object""" - _new = cast(calloc(1, sizeof(call_frame_t)), POINTER(call_frame_t)) - _new[0].root = frame.root - _new[0].next = frame.root[0].frames.next - _new[0].prev = pointer(frame.root[0].frames) - if frame.root[0].frames.next: - frame.root[0].frames.next[0].prev = _new - frame.root[0].frames.next = _new - _new[0].this = obj - # TODO: Type checking like tmp_cbk? - _new[0].ret = rfn - _new[0].parent = pointer(frame) - _new[0].cookie = cast(_new, c_void_p) - # TODO: Initialize lock - #_new.lock.init() - frame.ref_count += 1 - fn(_new, obj, *params) - -def stack_unwind(frame, *params): - """Frame is a frame object""" - fn = frame[0].ret - parent = frame[0].parent[0] - parent.ref_count -= 1 - - op_ret = params[0] - op_err = params[1] - params = params[2:] - fn(parent, call_frame_t.from_address(frame[0].cookie), parent.this, - op_ret, op_err, *params) diff --git a/xlators/bindings/python/src/glustertypes.py b/xlators/bindings/python/src/glustertypes.py deleted file mode 100644 index e9069d07c..000000000 --- a/xlators/bindings/python/src/glustertypes.py +++ /dev/null @@ -1,167 +0,0 @@ -# Copyright (c) 2007 Chris AtLee <chris@atlee.ca> -# This file is part of GlusterFS. -# -# GlusterFS is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published -# by the Free Software Foundation; either version 3 of the License, -# or (at your option) any later version. -# -# GlusterFS is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see -# <http://www.gnu.org/licenses/>. -from ctypes import * -import collections - -# -# Forward declaration of some gluster types -# -class call_frame_t(Structure): - pass - -class call_ctx_t(Structure): - pass - -class call_pool_t(Structure): - pass - -class xlator_t(Structure): - def _getFirstChild(self): - return self.children[0].xlator - firstChild = property(_getFirstChild) - -class xlator_list_t(Structure): - pass - -class xlator_fops(Structure): - pass - -class xlator_mops(Structure): - pass - -class glusterfs_ctx_t(Structure): - pass - -class list_head(Structure): - pass - -class dict_t(Structure): - pass - -class inode_table_t(Structure): - pass - -class fd_t(Structure): - pass - -class iovec(Structure): - _fields_ = [ - ("iov_base", c_void_p), - ("iov_len", c_size_t), - ] - - def __init__(self, s): - self.iov_base = cast(c_char_p(s), c_void_p) - self.iov_len = len(s) - - def getBytes(self): - return string_at(self.iov_base, self.iov_len) - -# This is a pthread_spinlock_t -# TODO: what happens to volatile-ness? -gf_lock_t = c_int - -uid_t = c_uint32 -gid_t = c_uint32 -pid_t = c_int32 - -off_t = c_int64 - -# -# Function pointer types -# -ret_fn_t = CFUNCTYPE(c_int32, POINTER(call_frame_t), POINTER(call_frame_t), - POINTER(xlator_t), c_int32, c_int32) - -fini_fn_t = CFUNCTYPE(None, POINTER(xlator_t)) -init_fn_t = CFUNCTYPE(c_int32, POINTER(xlator_t)) -event_notify_fn_t = CFUNCTYPE(c_int32, POINTER(xlator_t), c_int32, c_void_p) - -list_head._fields_ = [ - ("next", POINTER(list_head)), - ("prev", POINTER(list_head)), - ] - -call_frame_t._fields_ = [ - ("root", POINTER(call_ctx_t)), - ("parent", POINTER(call_frame_t)), - ("next", POINTER(call_frame_t)), - ("prev", POINTER(call_frame_t)), - ("local", c_void_p), - ("this", POINTER(xlator_t)), - ("ret", ret_fn_t), - ("ref_count", c_int32), - ("lock", gf_lock_t), - ("cookie", c_void_p), - ("op", c_int32), - ("type", c_int8), - ] - -call_ctx_t._fields_ = [ - ("all_frames", list_head), - ("trans", c_void_p), - ("pool", call_pool_t), - ("unique", c_uint64), - ("state", c_void_p), - ("uid", uid_t), - ("gid", gid_t), - ("pid", pid_t), - ("frames", call_frame_t), - ("req_refs", POINTER(dict_t)), - ("rsp_refs", POINTER(dict_t)), - ] - -xlator_t._fields_ = [ - ("name", c_char_p), - ("type", c_char_p), - ("next", POINTER(xlator_t)), - ("prev", POINTER(xlator_t)), - ("parent", POINTER(xlator_t)), - ("children", POINTER(xlator_list_t)), - ("fops", POINTER(xlator_fops)), - ("mops", POINTER(xlator_mops)), - ("fini", fini_fn_t), - ("init", init_fn_t), - ("notify", event_notify_fn_t), - ("options", POINTER(dict_t)), - ("ctx", POINTER(glusterfs_ctx_t)), - ("itable", POINTER(inode_table_t)), - ("ready", c_char), - ("private", c_void_p), - ] - -xlator_list_t._fields_ = [ - ("xlator", POINTER(xlator_t)), - ("next", POINTER(xlator_list_t)), - ] - -fop_functions = collections.defaultdict(lambda: c_void_p) -fop_function_names = ['lookup', 'forget', 'stat', 'fstat', 'chmod', 'fchmod', - 'chown', 'fchown', 'truncate', 'ftruncate', 'utimens', 'access', - 'readlink', 'mknod', 'mkdir', 'unlink', 'rmdir', 'symlink', - 'rename', 'link', 'create', 'open', 'readv', 'writev', 'flush', - 'close', 'fsync', 'opendir', 'readdir', 'closedir', 'fsyncdir', - 'statfs', 'setxattr', 'getxattr', 'removexattr', 'lk', 'writedir', - # TODO: Call backs? - ] - -fop_writev_t = CFUNCTYPE(c_int32, POINTER(call_frame_t), POINTER(xlator_t), - POINTER(fd_t), POINTER(iovec), c_int32, - off_t) - -fop_functions['writev'] = fop_writev_t -xlator_fops._fields_ = [(f, fop_functions[f]) for f in fop_function_names] diff --git a/xlators/bindings/python/src/python.c b/xlators/bindings/python/src/python.c deleted file mode 100644 index 3310a2115..000000000 --- a/xlators/bindings/python/src/python.c +++ /dev/null @@ -1,232 +0,0 @@ -/* - Copyright (c) 2007-2010 Chris AtLee <chris@atlee.ca> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ - -#include <Python.h> - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "xlator.h" -#include "logging.h" -#include "defaults.h" - -typedef struct -{ - char *scriptname; - PyObject *pXlator; - PyObject *pScriptModule; - PyObject *pGlusterModule; - PyThreadState *pInterp; - - PyObject *pFrameType, *pVectorType, *pFdType; -} python_private_t; - -int32_t -python_writev (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - struct iovec *vector, - int32_t count, - off_t offset) -{ - python_private_t *priv = (python_private_t *)this->private; - gf_log("python", GF_LOG_DEBUG, "In writev"); - if (PyObject_HasAttrString(priv->pXlator, "writev")) - { - - PyObject *retval = PyObject_CallMethod(priv->pXlator, "writev", - "O O O i l", - PyObject_CallMethod(priv->pFrameType, "from_address", "O&", PyLong_FromVoidPtr, frame), - PyObject_CallMethod(priv->pFdType, "from_address", "O&", PyLong_FromVoidPtr, fd), - PyObject_CallMethod(priv->pVectorType, "from_address", "O&", PyLong_FromVoidPtr, vector), - count, - offset); - if (PyErr_Occurred()) - { - PyErr_Print(); - } - Py_XDECREF(retval); - } - else - { - return default_writev(frame, this, fd, vector, count, offset); - } - return 0; -} - -struct xlator_fops fops = { - .writev = python_writev -}; - -static PyObject * -AnonModule_FromFile (const char* fname) -{ - // Get the builtins - PyThreadState* pThread = PyThreadState_Get(); - PyObject *pBuiltins = pThread->interp->builtins; - - if (PyErr_Occurred()) - { - PyErr_Print(); - return NULL; - } - - // Create a new dictionary for running code in - PyObject *pModuleDict = PyDict_New(); - PyDict_SetItemString(pModuleDict, "__builtins__", pBuiltins); - Py_INCREF(pBuiltins); - - // Run the file in the new context - FILE* fp = fopen(fname, "r"); - PyRun_File(fp, fname, Py_file_input, pModuleDict, pModuleDict); - fclose(fp); - if (PyErr_Occurred()) - { - PyErr_Print(); - Py_DECREF(pModuleDict); - Py_DECREF(pBuiltins); - return NULL; - } - - // Create an object to hold the new context - PyRun_String("class ModuleWrapper(object):\n\tpass\n", Py_single_input, pModuleDict, pModuleDict); - if (PyErr_Occurred()) - { - PyErr_Print(); - Py_DECREF(pModuleDict); - Py_DECREF(pBuiltins); - return NULL; - } - PyObject *pModule = PyRun_String("ModuleWrapper()", Py_eval_input, pModuleDict, pModuleDict); - if (PyErr_Occurred()) - { - PyErr_Print(); - Py_DECREF(pModuleDict); - Py_DECREF(pBuiltins); - Py_XDECREF(pModule); - return NULL; - } - - // Set the new context's dictionary to the one we used to run the code - // inside - PyObject_SetAttrString(pModule, "__dict__", pModuleDict); - if (PyErr_Occurred()) - { - PyErr_Print(); - Py_DECREF(pModuleDict); - Py_DECREF(pBuiltins); - Py_DECREF(pModule); - return NULL; - } - - return pModule; -} - -int32_t -init (xlator_t *this) -{ - // This is ok to call more than once per process - Py_InitializeEx(0); - - if (!this->children) { - gf_log ("python", GF_LOG_ERROR, - "FATAL: python should have exactly one child"); - return -1; - } - - python_private_t *priv = CALLOC (sizeof (python_private_t), 1); - ERR_ABORT (priv); - - data_t *scriptname = dict_get (this->options, "scriptname"); - if (scriptname) { - priv->scriptname = data_to_str(scriptname); - } else { - gf_log("python", GF_LOG_ERROR, - "FATAL: python requires the scriptname parameter"); - return -1; - } - - priv->pInterp = Py_NewInterpreter(); - - // Adjust python's path - PyObject *syspath = PySys_GetObject("path"); - PyObject *path = PyString_FromString(GLUSTER_PYTHON_PATH); - PyList_Append(syspath, path); - Py_DECREF(path); - - gf_log("python", GF_LOG_DEBUG, - "Loading gluster module"); - - priv->pGlusterModule = PyImport_ImportModule("gluster"); - if (PyErr_Occurred()) - { - PyErr_Print(); - return -1; - } - - priv->pFrameType = PyObject_GetAttrString(priv->pGlusterModule, "call_frame_t"); - priv->pFdType = PyObject_GetAttrString(priv->pGlusterModule, "fd_t"); - priv->pVectorType = PyObject_GetAttrString(priv->pGlusterModule, "iovec"); - - gf_log("python", GF_LOG_DEBUG, "Loading script...%s", priv->scriptname); - - priv->pScriptModule = AnonModule_FromFile(priv->scriptname); - if (!priv->pScriptModule || PyErr_Occurred()) - { - gf_log("python", GF_LOG_ERROR, "Error loading %s", priv->scriptname); - PyErr_Print(); - return -1; - } - - if (!PyObject_HasAttrString(priv->pScriptModule, "xlator")) - { - gf_log("python", GF_LOG_ERROR, "%s does not have a xlator attribute", priv->scriptname); - return -1; - } - gf_log("python", GF_LOG_DEBUG, "Instantiating translator"); - priv->pXlator = PyObject_CallMethod(priv->pScriptModule, "xlator", "O&", - PyLong_FromVoidPtr, this); - if (PyErr_Occurred() || !priv->pXlator) - { - PyErr_Print(); - return -1; - } - - this->private = priv; - - gf_log ("python", GF_LOG_DEBUG, "python xlator loaded"); - return 0; -} - -void -fini (xlator_t *this) -{ - python_private_t *priv = (python_private_t*)(this->private); - Py_DECREF(priv->pXlator); - Py_DECREF(priv->pScriptModule); - Py_DECREF(priv->pGlusterModule); - Py_DECREF(priv->pFrameType); - Py_DECREF(priv->pFdType); - Py_DECREF(priv->pVectorType); - Py_EndInterpreter(priv->pInterp); - return; -} diff --git a/xlators/bindings/python/src/testxlator.py b/xlators/bindings/python/src/testxlator.py deleted file mode 100644 index 507455c85..000000000 --- a/xlators/bindings/python/src/testxlator.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright (c) 2007 Chris AtLee <chris@atlee.ca> -# This file is part of GlusterFS. -# -# GlusterFS is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published -# by the Free Software Foundation; either version 3 of the License, -# or (at your option) any later version. -# -# GlusterFS is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see -# <http://www.gnu.org/licenses/>. - -""" -This is a test translator written in python. - -Important things to note: - This file must be import-able from glusterfsd. This probably means - setting PYTHONPATH to where this file is located. - - This file must have a top-level xlator class object that will be - used to instantiate individual translators. -""" -from gluster import * - -class MyXlator(ComplexTranslator): - name = "MyXlator" - def writev_cbk(self, frame, cookie, op_ret, op_errno, buf): - stack_unwind(frame, op_ret, op_errno, buf) - return 0 - - def writev(self, frame, fd, vector, count, offset): - gf_log(self.name, GF_LOG_WARNING, "writev %i bytes", vector.iov_len) - # TODO: Use cookie to pass this to writev_cbk - old_count = vector.iov_len - - data = vector.getBytes().encode("zlib") - - vector = iovec(data) - gf_log(self.name, GF_LOG_WARNING, "writev %i bytes", vector.iov_len) - - @ret_fn_t - def rfn(frame, prev, this, op_ret, op_errno, *params): - if len(params) == 0: - params = [0] - return self.writev_cbk(frame, prev, old_count, op_errno, *params) - - stack_wind(frame, rfn, self.firstChild, - self.firstChild[0].fops[0].writev, fd, vector, count, offset) - return 0 - -xlator = MyXlator diff --git a/xlators/cluster/Makefile.am b/xlators/cluster/Makefile.am index 0990822a7..6e883e565 100644 --- a/xlators/cluster/Makefile.am +++ b/xlators/cluster/Makefile.am @@ -1,3 +1,3 @@ -SUBDIRS = stripe afr dht +SUBDIRS = stripe afr dht nsr-server nsr-recon nsr-client CLEANFILES = diff --git a/xlators/cluster/afr/src/Makefile.am b/xlators/cluster/afr/src/Makefile.am index e192b599b..ea5a90abb 100644 --- a/xlators/cluster/afr/src/Makefile.am +++ b/xlators/cluster/afr/src/Makefile.am @@ -1,27 +1,38 @@ xlator_LTLIBRARIES = afr.la pump.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster -afr_common_source = afr-dir-read.c afr-dir-write.c afr-inode-read.c afr-inode-write.c afr-open.c afr-transaction.c afr-self-heal-data.c afr-self-heal-common.c afr-self-heal-metadata.c afr-self-heal-entry.c afr-self-heal-algorithm.c afr-lk-common.c $(top_builddir)/xlators/lib/src/libxlator.c +afr_common_source = afr-dir-read.c afr-dir-write.c afr-inode-read.c \ + afr-inode-write.c afr-open.c afr-transaction.c afr-lk-common.c \ + afr-read-txn.c \ + $(top_builddir)/xlators/lib/src/libxlator.c -afr_la_LDFLAGS = -module -avoidversion -afr_la_SOURCES = $(afr_common_source) afr.c +AFR_SELFHEAL_SOURCES = afr-self-heal-common.c afr-self-heal-data.c \ + afr-self-heal-entry.c afr-self-heal-metadata.c afr-self-heald.c \ + afr-self-heal-name.c + +afr_la_LDFLAGS = -module -avoid-version +afr_la_SOURCES = $(afr_common_source) $(AFR_SELFHEAL_SOURCES) afr.c afr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -pump_la_LDFLAGS = -module -avoidversion -pump_la_SOURCES = $(afr_common_source) pump.c +pump_la_LDFLAGS = -module -avoid-version +pump_la_SOURCES = $(afr_common_source) $(AFR_SELFHEAL_SOURCES) pump.c pump_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -noinst_HEADERS = afr.h afr-transaction.h afr-inode-write.h afr-inode-read.h afr-dir-read.h afr-dir-write.h afr-self-heal.h afr-self-heal-common.h afr-self-heal-algorithm.h pump.h afr-mem-types.h afr-common.c $(top_builddir)/xlators/lib/src/libxlator.h +noinst_HEADERS = afr.h afr-transaction.h afr-inode-write.h afr-inode-read.h \ + afr-dir-read.h afr-dir-write.h afr-self-heal.h afr-mem-types.h \ + afr-common.c afr-self-heald.h pump.h \ + $(top_builddir)/xlators/lib/src/libxlator.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) \ + -I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/xlators/lib/src \ + -I$(top_srcdir)/rpc/rpc-lib/src -AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ - -I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/contrib/md5 -shared -nostartfiles $(GF_CFLAGS) \ - -I$(top_srcdir)/xlators/lib/src +AM_CFLAGS = -Wall $(GF_CFLAGS) CLEANFILES = uninstall-local: rm -f $(DESTDIR)$(xlatordir)/replicate.so - rm -f $(DESTDIR)$(xlatordir)/pump.so install-data-hook: ln -sf afr.so $(DESTDIR)$(xlatordir)/replicate.so diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 2b073eb9c..164a651ba 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #include <libgen.h> @@ -44,6 +35,7 @@ #include "compat.h" #include "byte-order.h" #include "statedump.h" +#include "inode.h" #include "fd.h" @@ -53,323 +45,875 @@ #include "afr-dir-write.h" #include "afr-transaction.h" #include "afr-self-heal.h" -#include "afr-self-heal-common.h" -#include "pump.h" +#include "afr-self-heald.h" -#define AFR_ICTX_OPENDIR_DONE_MASK 0x0000000200000000ULL -#define AFR_ICTX_SPLIT_BRAIN_MASK 0x0000000100000000ULL -#define AFR_ICTX_READ_CHILD_MASK 0x00000000FFFFFFFFULL -int32_t -afr_set_dict_gfid (dict_t *dict, uuid_t gfid) +call_frame_t * +afr_copy_frame (call_frame_t *base) { - int ret = 0; + afr_local_t *local = NULL; + call_frame_t *frame = NULL; + int op_errno = 0; + + frame = copy_frame (base); + if (!frame) + return NULL; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) { + AFR_STACK_DESTROY (frame); + return NULL; + } + + return frame; +} - GF_ASSERT (gfid); +/* + * INODE CTX 64-bit VALUE FORMAT FOR SMALL (<= 16) SUBVOL COUNTS: + * + * |<---------- 64bit ------------>| + * 63 32 31 16 15 0 + * | EVENT_GEN | DATA | METADATA | + * + * + * METADATA (bit-0 .. bit-15): bitmap representing subvolumes from which + * metadata can be attempted to be read. + * + * bit-0 => priv->subvolumes[0] + * bit-1 => priv->subvolumes[1] + * ... etc. till bit-15 + * + * DATA (bit-16 .. bit-31): bitmap representing subvolumes from which data + * can be attempted to be read. + * + * bit-16 => priv->subvolumes[0] + * bit-17 => priv->subvolumes[1] + * ... etc. till bit-31 + * + * EVENT_GEN (bit-32 .. bit-63): event generation (i.e priv->event_generation) + * when DATA and METADATA was last updated. + * + * If EVENT_GEN is < priv->event_generation, + * or is 0, it means afr_inode_refresh() needs + * to be called to recalculate the bitmaps. + */ - ret = dict_set_static_bin (dict, "gfid-req", gfid, 16); - if (ret) - gf_log (THIS->name, GF_LOG_DEBUG, "gfid set failed"); +int +__afr_inode_read_subvol_get_small (inode_t *inode, xlator_t *this, + unsigned char *data, unsigned char *metadata, + int *event_p) +{ + afr_private_t *priv = NULL; + int ret = -1; + uint16_t datamap = 0; + uint16_t metadatamap = 0; + uint32_t event = 0; + uint64_t val = 0; + int i = 0; + + priv = this->private; + + ret = __inode_ctx_get (inode, this, &val); + if (ret < 0) + return ret; + + metadatamap = (val & 0x000000000000ffff); + datamap = (val & 0x00000000ffff0000) >> 16; + event = (val & 0xffffffff00000000) >> 32; + + for (i = 0; i < priv->child_count; i++) { + if (metadata) + metadata[i] = (metadatamap >> i) & 1; + if (data) + data[i] = (datamap >> i) & 1; + } + + if (event_p) + *event_p = event; + return ret; +} - return ret; + +int +__afr_inode_read_subvol_set_small (inode_t *inode, xlator_t *this, + unsigned char *data, unsigned char *metadata, + int event) +{ + afr_private_t *priv = NULL; + uint16_t datamap = 0; + uint16_t metadatamap = 0; + uint64_t val = 0; + int i = 0; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (data[i]) + datamap |= (1 << i); + if (metadata[i]) + metadatamap |= (1 << i); + } + + val = ((uint64_t) metadatamap) | + (((uint64_t) datamap) << 16) | + (((uint64_t) event) << 32); + + return __inode_ctx_set (inode, this, &val); } -uint64_t -afr_is_split_brain (xlator_t *this, inode_t *inode) + +int +__afr_inode_read_subvol_reset_small (inode_t *inode, xlator_t *this) { - int ret = 0; + int ret = -1; + uint16_t datamap = 0; + uint16_t metadatamap = 0; + uint32_t event = 0; + uint64_t val = 0; - uint64_t ctx = 0; - uint64_t split_brain = 0; + ret = __inode_ctx_get (inode, this, &val); + (void) ret; - VALIDATE_OR_GOTO (inode, out); + metadatamap = (val & 0x000000000000ffff) >> 0; + datamap = (val & 0x00000000ffff0000) >> 16; + event = 0; - LOCK (&inode->lock); - { - ret = __inode_ctx_get (inode, this, &ctx); + val = ((uint64_t) metadatamap) | + (((uint64_t) datamap) << 16) | + (((uint64_t) event) << 32); - if (ret < 0) - goto unlock; + return __inode_ctx_set (inode, this, &val); +} - split_brain = ctx & AFR_ICTX_SPLIT_BRAIN_MASK; - } -unlock: - UNLOCK (&inode->lock); -out: - return split_brain; +int +__afr_inode_read_subvol_get (inode_t *inode, xlator_t *this, + unsigned char *data, unsigned char *metadata, + int *event_p) +{ + afr_private_t *priv = NULL; + int ret = -1; + + priv = this->private; + + if (priv->child_count <= 16) + ret = __afr_inode_read_subvol_get_small (inode, this, data, + metadata, event_p); + else + /* TBD: allocate structure with array and read from it */ + ret = -1; + + return ret; } -void -afr_set_split_brain (xlator_t *this, inode_t *inode, gf_boolean_t set) +int +__afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, unsigned char *data, + unsigned char *metadata, int event) { - uint64_t ctx = 0; - int ret = 0; + afr_private_t *priv = NULL; + int ret = -1; - VALIDATE_OR_GOTO (inode, out); + priv = this->private; - LOCK (&inode->lock); - { - ret = __inode_ctx_get (inode, this, &ctx); + if (priv->child_count <= 16) + ret = __afr_inode_read_subvol_set_small (inode, this, data, + metadata, event); + else + ret = -1; - if (ret < 0) { - ctx = 0; - } + return ret; +} - if (set) { - ctx = (~AFR_ICTX_SPLIT_BRAIN_MASK & ctx) - | (0xFFFFFFFFFFFFFFFFULL & AFR_ICTX_SPLIT_BRAIN_MASK); - } else { - ctx = (~AFR_ICTX_SPLIT_BRAIN_MASK & ctx); - } - ret = __inode_ctx_put (inode, this, ctx); - if (ret) { - gf_log_callingfn (this->name, GF_LOG_INFO, - "failed to set the inode ctx (%s)", - uuid_utoa (inode->gfid)); - } - } - UNLOCK (&inode->lock); -out: - return; +int +__afr_inode_read_subvol_reset (inode_t *inode, xlator_t *this) +{ + afr_private_t *priv = NULL; + int ret = -1; + + priv = this->private; + + if (priv->child_count <= 16) + ret = __afr_inode_read_subvol_reset_small (inode, this); + else + ret = -1; + + return ret; } -uint64_t -afr_is_opendir_done (xlator_t *this, inode_t *inode) +int +afr_inode_read_subvol_get (inode_t *inode, xlator_t *this, unsigned char *data, + unsigned char *metadata, int *event_p) { - int ret = 0; - uint64_t ctx = 0; - uint64_t opendir_done = 0; + int ret = -1; - VALIDATE_OR_GOTO (inode, out); + LOCK(&inode->lock); + { + ret = __afr_inode_read_subvol_get (inode, this, data, + metadata, event_p); + } + UNLOCK(&inode->lock); - LOCK (&inode->lock); - { - ret = __inode_ctx_get (inode, this, &ctx); + return ret; +} - if (ret < 0) - goto unlock; - opendir_done = ctx & AFR_ICTX_OPENDIR_DONE_MASK; - } -unlock: - UNLOCK (&inode->lock); +int +afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, unsigned char *data, + unsigned char *metadata, int event) +{ + int ret = -1; -out: - return opendir_done; + LOCK(&inode->lock); + { + ret = __afr_inode_read_subvol_set (inode, this, data, metadata, + event); + } + UNLOCK(&inode->lock); + + return ret; } -void -afr_set_opendir_done (xlator_t *this, inode_t *inode) +int +afr_inode_read_subvol_reset (inode_t *inode, xlator_t *this) { - uint64_t ctx = 0; - int ret = 0; + int ret = -1; - VALIDATE_OR_GOTO (inode, out); + LOCK(&inode->lock); + { + ret = __afr_inode_read_subvol_reset (inode, this); + } + UNLOCK(&inode->lock); - LOCK (&inode->lock); - { - ret = __inode_ctx_get (inode, this, &ctx); + return ret; +} - if (ret < 0) { - ctx = 0; - } - ctx = (~AFR_ICTX_OPENDIR_DONE_MASK & ctx) - | (0xFFFFFFFFFFFFFFFFULL & AFR_ICTX_OPENDIR_DONE_MASK); +int +afr_accused_fill (xlator_t *this, dict_t *xdata, unsigned char *accused, + afr_transaction_type type) +{ + afr_private_t *priv = NULL; + int i = 0; + int idx = afr_index_for_transaction_type (type); + void *pending_raw = NULL; + int pending[3]; + int ret = 0; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + ret = dict_get_ptr (xdata, priv->pending_key[i], + &pending_raw); + if (ret) /* no pending flags */ + continue; + memcpy (pending, pending_raw, sizeof(pending)); + + if (ntoh32 (pending[idx])) + accused[i] = 1; + } + + return 0; +} - ret = __inode_ctx_put (inode, this, ctx); - if (ret) { - gf_log_callingfn (this->name, GF_LOG_INFO, - "failed to set the inode ctx (%s)", - uuid_utoa (inode->gfid)); - } - } - UNLOCK (&inode->lock); -out: - return; + +int +afr_accuse_smallfiles (xlator_t *this, struct afr_reply *replies, + unsigned char *data_accused) +{ + int i = 0; + afr_private_t *priv = NULL; + uint64_t maxsize = 0; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (data_accused[i]) + continue; + if (replies[i].poststat.ia_size > maxsize) + maxsize = replies[i].poststat.ia_size; + } + + for (i = 0; i < priv->child_count; i++) { + if (data_accused[i]) + continue; + if (replies[i].poststat.ia_size < maxsize) + data_accused[i] = 1; + } + + return 0; } -uint64_t -afr_read_child (xlator_t *this, inode_t *inode) +int +afr_replies_interpret (call_frame_t *frame, xlator_t *this, inode_t *inode) { - int ret = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + struct afr_reply *replies = NULL; + int event_generation = 0; + int i = 0; + unsigned char *data_accused = NULL; + unsigned char *metadata_accused = NULL; + unsigned char *data_readable = NULL; + unsigned char *metadata_readable = NULL; + int ret = 0; + + local = frame->local; + priv = this->private; + replies = local->replies; + event_generation = local->event_generation; + + data_accused = alloca0 (priv->child_count); + data_readable = alloca0 (priv->child_count); + metadata_accused = alloca0 (priv->child_count); + metadata_readable = alloca0 (priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + data_readable[i] = 1; + metadata_readable[i] = 1; + } + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) { + data_readable[i] = 0; + metadata_readable[i] = 0; + continue; + } + + if (replies[i].op_ret == -1) { + data_readable[i] = 0; + metadata_readable[i] = 0; + continue; + } + + afr_accused_fill (this, replies[i].xdata, data_accused, + (inode->ia_type == IA_IFDIR) ? + AFR_ENTRY_TRANSACTION : AFR_DATA_TRANSACTION); + + afr_accused_fill (this, replies[i].xdata, + metadata_accused, AFR_METADATA_TRANSACTION); + + } + + if (inode->ia_type != IA_IFDIR) + afr_accuse_smallfiles (this, replies, data_accused); + + for (i = 0; i < priv->child_count; i++) { + if (data_accused[i]) { + data_readable[i] = 0; + ret = 1; + } + if (metadata_accused[i]) { + metadata_readable[i] = 0; + ret = 1; + } + } + + afr_inode_read_subvol_set (inode, this, data_readable, + metadata_readable, event_generation); + return ret; +} - uint64_t ctx = 0; - uint64_t read_child = 0; - VALIDATE_OR_GOTO (inode, out); - LOCK (&inode->lock); - { - ret = __inode_ctx_get (inode, this, &ctx); +int +afr_refresh_selfheal_done (int ret, call_frame_t *heal, void *opaque) +{ + if (heal) + STACK_DESTROY (heal->root); + return 0; +} - if (ret < 0) - goto unlock; +int +afr_inode_refresh_err (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + int err = 0; + + local = frame->local; + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].valid && !local->replies[i].op_ret) { + err = 0; + goto ret; + } + } + + err = afr_final_errno (local, priv); +ret: + return -err; +} - read_child = ctx & AFR_ICTX_READ_CHILD_MASK; - } -unlock: - UNLOCK (&inode->lock); -out: - return read_child; +int +afr_refresh_selfheal_wrap (void *opaque) +{ + call_frame_t *frame = opaque; + afr_local_t *local = NULL; + xlator_t *this = NULL; + int err = 0; + + local = frame->local; + this = frame->this; + + afr_selfheal (frame->this, local->refreshinode->gfid); + + afr_selfheal_unlocked_discover (frame, local->refreshinode, + local->refreshinode->gfid, + local->replies); + + afr_replies_interpret (frame, this, local->refreshinode); + + err = afr_inode_refresh_err (frame, this); + + afr_replies_wipe (local, this->private); + + local->refreshfn (frame, this, err); + + return 0; } -void -afr_set_read_child (xlator_t *this, inode_t *inode, int32_t read_child) +gf_boolean_t +afr_selfheal_enabled (xlator_t *this) { - uint64_t ctx = 0; - int ret = 0; + afr_private_t *priv = NULL; + gf_boolean_t data = _gf_false; - VALIDATE_OR_GOTO (inode, out); + priv = this->private; - LOCK (&inode->lock); - { - ret = __inode_ctx_get (inode, this, &ctx); + gf_string2boolean (priv->data_self_heal, &data); - if (ret < 0) { - ctx = 0; - } + return data || priv->metadata_self_heal || priv->entry_self_heal; +} - ctx = (~AFR_ICTX_READ_CHILD_MASK & ctx) - | (AFR_ICTX_READ_CHILD_MASK & read_child); - ret = __inode_ctx_put (inode, this, ctx); - if (ret) { - gf_log_callingfn (this->name, GF_LOG_INFO, - "failed to set the inode ctx (%s)", - uuid_utoa (inode->gfid)); - } - } - UNLOCK (&inode->lock); -out: - return; +int +afr_inode_refresh_done (call_frame_t *frame, xlator_t *this) +{ + call_frame_t *heal = NULL; + afr_local_t *local = NULL; + int ret = 0; + int err = 0; + + local = frame->local; + + ret = afr_replies_interpret (frame, this, local->refreshinode); + + err = afr_inode_refresh_err (frame, this); + + afr_replies_wipe (local, this->private); + + if (ret && afr_selfheal_enabled (this)) { + heal = copy_frame (frame); + if (heal) + heal->root->pid = -1; + ret = synctask_new (this->ctx->env, afr_refresh_selfheal_wrap, + afr_refresh_selfheal_done, heal, frame); + if (ret) + goto refresh_done; + } else { + refresh_done: + local->refreshfn (frame, this, err); + } + + return 0; } -/** - * afr_local_cleanup - cleanup everything in frame->local - */ +int +afr_inode_refresh_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, struct iatt *par) +{ + afr_local_t *local = NULL; + int call_child = (long) cookie; + int call_count = 0; -void -afr_local_sh_cleanup (afr_local_t *local, xlator_t *this) + local = frame->local; + + local->replies[call_child].valid = 1; + local->replies[call_child].op_ret = op_ret; + local->replies[call_child].op_errno = op_errno; + if (op_ret != -1) { + local->replies[call_child].poststat = *buf; + local->replies[call_child].postparent = *par; + local->replies[call_child].xdata = dict_ref (xdata); + } + + call_count = afr_frame_return (frame); + + if (call_count == 0) + afr_inode_refresh_done (frame, this); + + return 0; +} + + +int +afr_inode_refresh_subvol (call_frame_t *frame, xlator_t *this, int i, + inode_t *inode, dict_t *xdata) { - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int i = 0; + loc_t loc = {0, }; + afr_private_t *priv = NULL; + priv = this->private; - sh = &local->self_heal; - priv = this->private; + loc.inode = inode; + uuid_copy (loc.gfid, inode->gfid); - if (sh->buf) - GF_FREE (sh->buf); + STACK_WIND_COOKIE (frame, afr_inode_refresh_subvol_cbk, + (void *) (long) i, priv->children[i], + priv->children[i]->fops->lookup, &loc, xdata); + return 0; +} - if (sh->xattr) { - for (i = 0; i < priv->child_count; i++) { - if (sh->xattr[i]) { - dict_unref (sh->xattr[i]); - sh->xattr[i] = NULL; - } - } - GF_FREE (sh->xattr); + +int +afr_inode_refresh_do (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int i = 0; + dict_t *xdata = NULL; + + priv = this->private; + local = frame->local; + + afr_replies_wipe (local, priv); + + xdata = dict_new (); + if (!xdata) { + afr_inode_refresh_done (frame, this); + return 0; + } + + if (afr_xattr_req_prepare (this, xdata) != 0) { + dict_unref (xdata); + afr_inode_refresh_done (frame, this); + return 0; + } + + local->call_count = AFR_COUNT (local->child_up, priv->child_count); + + call_count = local->call_count; + for (i = 0; i < priv->child_count; i++) { + if (!local->child_up[i]) + continue; + + afr_inode_refresh_subvol (frame, this, i, local->refreshinode, + xdata); + + if (!--call_count) + break; + } + + dict_unref (xdata); + + return 0; +} + + +int +afr_inode_refresh (call_frame_t *frame, xlator_t *this, inode_t *inode, + afr_inode_refresh_cbk_t refreshfn) +{ + afr_local_t *local = NULL; + + local = frame->local; + + local->refreshfn = refreshfn; + + if (local->refreshinode) { + inode_unref (local->refreshinode); + local->refreshinode = NULL; + } + + local->refreshinode = inode_ref (inode); + + afr_inode_refresh_do (frame, this); + + return 0; +} + + +int +afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req) +{ + int i = 0; + afr_private_t *priv = NULL; + int ret = 0; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + ret = dict_set_uint64 (xattr_req, priv->pending_key[i], + AFR_NUM_CHANGE_LOGS * sizeof(int)); + if (ret < 0) + gf_log (this->name, GF_LOG_WARNING, + "Unable to set dict value for %s", + priv->pending_key[i]); + /* 3 = data+metadata+entry */ + } + ret = dict_set_uint64 (xattr_req, AFR_DIRTY, + AFR_NUM_CHANGE_LOGS * sizeof(int)); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, "failed to set dirty " + "query flag"); } - if (sh->child_errno) - GF_FREE (sh->child_errno); + return ret; +} - if (sh->pending_matrix) { - for (i = 0; i < priv->child_count; i++) { - GF_FREE (sh->pending_matrix[i]); - } - GF_FREE (sh->pending_matrix); +int +afr_lookup_xattr_req_prepare (afr_local_t *local, xlator_t *this, + dict_t *xattr_req, loc_t *loc) +{ + int ret = -ENOMEM; + + local->xattr_req = dict_new (); + if (!local->xattr_req) + goto out; + if (xattr_req) + dict_copy (xattr_req, local->xattr_req); + + ret = afr_xattr_req_prepare (this, local->xattr_req); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "%s: Unable to prepare xattr_req", loc->path); + } + + ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_INODELK_COUNT, 0); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "%s: Unable to set dict value for %s", + loc->path, GLUSTERFS_INODELK_COUNT); + } + ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_ENTRYLK_COUNT, 0); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "%s: Unable to set dict value for %s", + loc->path, GLUSTERFS_ENTRYLK_COUNT); } - if (sh->delta_matrix) { - for (i = 0; i < priv->child_count; i++) { - GF_FREE (sh->delta_matrix[i]); - } - GF_FREE (sh->delta_matrix); + ret = dict_set_uint32 (local->xattr_req, GLUSTERFS_PARENT_ENTRYLK, 0); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "%s: Unable to set dict value for %s", + loc->path, GLUSTERFS_PARENT_ENTRYLK); } - if (sh->sources) - GF_FREE (sh->sources); + ret = 0; +out: + return ret; +} + + +int +afr_hash_child (inode_t *inode, int32_t child_count, int hashmode) +{ + uuid_t gfid_copy = {0,}; + pid_t pid; - if (sh->success) - GF_FREE (sh->success); + if (!hashmode) { + return -1; + } - if (sh->locked_nodes) - GF_FREE (sh->locked_nodes); + if (inode) { + uuid_copy (gfid_copy, inode->gfid); + } - if (sh->healing_fd && !sh->healing_fd_opened) { - fd_unref (sh->healing_fd); - sh->healing_fd = NULL; + if (hashmode > 1) { + /* + * Why getpid? Because it's one of the cheapest calls + * available - faster than gethostname etc. - and returns a + * constant-length value that's sure to be shorter than a UUID. + * It's still very unlikely to be the same across clients, so + * it still provides good mixing. We're not trying for + * perfection here. All we need is a low probability that + * multiple clients won't converge on the same subvolume. + */ + pid = getpid(); + memcpy (gfid_copy, &pid, sizeof(pid)); } - if (sh->linkname) - GF_FREE ((char *)sh->linkname); + return SuperFastHash((char *)gfid_copy, + sizeof(gfid_copy)) % child_count; +} + + +int +afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this, + unsigned char *readable) +{ + afr_private_t *priv = NULL; + int read_subvol = -1; + int i = 0; + + priv = this->private; + + /* first preference - explicitly specified or local subvolume */ + if (priv->read_child >= 0 && readable[priv->read_child]) + return priv->read_child; + + /* second preference - use hashed mode */ + read_subvol = afr_hash_child (inode, priv->child_count, + priv->hash_mode); + if (read_subvol >= 0 && readable[read_subvol]) + return read_subvol; + + for (i = 0; i < priv->child_count; i++) { + if (readable[i]) + return i; + } + + /* no readable subvolumes, either split brain or all subvols down */ + + return -1; +} + + +int +afr_inode_read_subvol_type_get (inode_t *inode, xlator_t *this, + unsigned char *readable, int *event_p, + int type) +{ + int ret = -1; + + if (type == AFR_METADATA_TRANSACTION) + ret = afr_inode_read_subvol_get (inode, this, 0, readable, + event_p); + else + ret = afr_inode_read_subvol_get (inode, this, readable, 0, + event_p); + return ret; +} + - loc_wipe (&sh->parent_loc); +int +afr_read_subvol_get (inode_t *inode, xlator_t *this, int *subvol_p, + int *event_p, afr_transaction_type type) +{ + afr_private_t *priv = NULL; + unsigned char *data_readable = NULL; + unsigned char *metadata_readable = NULL; + unsigned char *readable = NULL; + unsigned char *intersection = NULL; + int subvol = -1; + int event = 0; + + priv = this->private; + + readable = alloca0 (priv->child_count); + data_readable = alloca0 (priv->child_count); + metadata_readable = alloca0 (priv->child_count); + intersection = alloca0 (priv->child_count); + + afr_inode_read_subvol_type_get (inode, this, readable, &event, type); + + afr_inode_read_subvol_get (inode, this, data_readable, metadata_readable, + &event); + + AFR_INTERSECT (intersection, data_readable, metadata_readable, + priv->child_count); + + if (AFR_COUNT (intersection, priv->child_count) > 0) + subvol = afr_read_subvol_select_by_policy (inode, this, + intersection); + else + subvol = afr_read_subvol_select_by_policy (inode, this, + readable); + if (subvol_p) + *subvol_p = subvol; + if (event_p) + *event_p = event; + return subvol; } void afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this) { - int i = 0; - afr_private_t * priv = NULL; + afr_private_t *priv = NULL; + int i = 0; priv = this->private; - for (i = 0; i < priv->child_count; i++) { - if (local->pending && local->pending[i]) - GF_FREE (local->pending[i]); - } - - GF_FREE (local->pending); - - if (local->internal_lock.locked_nodes) - GF_FREE (local->internal_lock.locked_nodes); + afr_matrix_cleanup (local->pending, priv->child_count); - if (local->internal_lock.inode_locked_nodes) - GF_FREE (local->internal_lock.inode_locked_nodes); + GF_FREE (local->internal_lock.locked_nodes); - if (local->internal_lock.entry_locked_nodes) - GF_FREE (local->internal_lock.entry_locked_nodes); + for (i = 0; local->internal_lock.inodelk[i].domain; i++) { + GF_FREE (local->internal_lock.inodelk[i].locked_nodes); + } - if (local->internal_lock.lower_locked_nodes) - GF_FREE (local->internal_lock.lower_locked_nodes); + GF_FREE (local->internal_lock.lower_locked_nodes); + afr_entry_lockee_cleanup (&local->internal_lock); - GF_FREE (local->transaction.child_errno); - GF_FREE (local->child_errno); + GF_FREE (local->transaction.pre_op); + GF_FREE (local->transaction.eager_lock); + GF_FREE (local->transaction.fop_subvols); + GF_FREE (local->transaction.failed_subvols); GF_FREE (local->transaction.basename); GF_FREE (local->transaction.new_basename); loc_wipe (&local->transaction.parent_loc); loc_wipe (&local->transaction.new_parent_loc); + } void +afr_replies_wipe (afr_local_t *local, afr_private_t *priv) +{ + int i; + + if (!local->replies) + return; + + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].xdata) { + dict_unref (local->replies[i].xdata); + local->replies[i].xdata = NULL; + } + } + + memset (local->replies, 0, sizeof(*local->replies) * priv->child_count); +} + +void +afr_remove_eager_lock_stub (afr_local_t *local) +{ + LOCK (&local->fd->lock); + { + list_del_init (&local->transaction.eager_locked); + } + UNLOCK (&local->fd->lock); +} + +void afr_local_cleanup (afr_local_t *local, xlator_t *this) { - int i = 0; afr_private_t * priv = NULL; if (!local) return; - afr_local_sh_cleanup (local, this); + syncbarrier_destroy (&local->barrier); + + if (local->transaction.eager_lock_on && + !list_empty (&local->transaction.eager_locked)) + afr_remove_eager_lock_stub (local); afr_local_transaction_cleanup (local, this); @@ -384,37 +928,36 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this) if (local->xattr_req) dict_unref (local->xattr_req); + if (local->dict) + dict_unref (local->dict); + + afr_replies_wipe (local, priv); + GF_FREE(local->replies); + GF_FREE (local->child_up); - { /* lookup */ - if (local->cont.lookup.xattrs) { - for (i = 0; i < priv->child_count; i++) { - if (local->cont.lookup.xattrs[i]) { - dict_unref (local->cont.lookup.xattrs[i]); - local->cont.lookup.xattrs[i] = NULL; - } - } - GF_FREE (local->cont.lookup.xattrs); - local->cont.lookup.xattrs = NULL; - } + GF_FREE (local->read_attempted); - if (local->cont.lookup.xattr) { - dict_unref (local->cont.lookup.xattr); - } + GF_FREE (local->readable); - if (local->cont.lookup.inode) { - inode_unref (local->cont.lookup.inode); - } - } + if (local->inode) + inode_unref (local->inode); + + if (local->parent) + inode_unref (local->parent); + + if (local->parent2) + inode_unref (local->parent2); + + if (local->refreshinode) + inode_unref (local->refreshinode); { /* getxattr */ - if (local->cont.getxattr.name) - GF_FREE (local->cont.getxattr.name); + GF_FREE (local->cont.getxattr.name); } { /* lk */ - if (local->cont.lk.locked_nodes) - GF_FREE (local->cont.lk.locked_nodes); + GF_FREE (local->cont.lk.locked_nodes); } { /* create */ @@ -441,6 +984,8 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this) { /* writev */ GF_FREE (local->cont.writev.vector); + if (local->cont.writev.iobref) + iobref_unref (local->cont.writev.iobref); } { /* setxattr */ @@ -448,18 +993,40 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this) dict_unref (local->cont.setxattr.dict); } + { /* fsetxattr */ + if (local->cont.fsetxattr.dict) + dict_unref (local->cont.fsetxattr.dict); + } + { /* removexattr */ GF_FREE (local->cont.removexattr.name); } - + { /* xattrop */ + if (local->cont.xattrop.xattr) + dict_unref (local->cont.xattrop.xattr); + } + { /* fxattrop */ + if (local->cont.fxattrop.xattr) + dict_unref (local->cont.fxattrop.xattr); + } { /* symlink */ GF_FREE (local->cont.symlink.linkpath); } { /* opendir */ - if (local->cont.opendir.checksum) - GF_FREE (local->cont.opendir.checksum); + GF_FREE (local->cont.opendir.checksum); + } + + { /* readdirp */ + if (local->cont.readdir.dict) + dict_unref (local->cont.readdir.dict); } + + if (local->xdata_req) + dict_unref (local->xdata_req); + + if (local->xdata_rsp) + dict_unref (local->xdata_rsp); } @@ -481,929 +1048,1060 @@ afr_frame_return (call_frame_t *frame) } -/** - * up_children_count - return the number of children that are up - */ - -int -afr_up_children_count (int child_count, unsigned char *child_up) +gf_boolean_t +afr_is_entry_possibly_under_txn (afr_local_t *local, xlator_t *this) { - int i = 0; - int ret = 0; - - for (i = 0; i < child_count; i++) - if (child_up[i]) - ret++; - return ret; + int i = 0; + int tmp = 0; + afr_private_t *priv = NULL; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].xdata) + continue; + if (dict_get_int32 (local->replies[i].xdata, + GLUSTERFS_PARENT_ENTRYLK, + &tmp) == 0) + if (tmp) + return _gf_true; + } + + return _gf_false; } -void -afr_update_loc_gfids (loc_t *loc, struct iatt *buf, struct iatt *postparent) -{ - GF_ASSERT (loc); - GF_ASSERT (buf); - uuid_copy (loc->gfid, buf->ia_gfid); - if (postparent) - uuid_copy (loc->pargfid, postparent->ia_gfid); +/* + * Quota size xattrs are not maintained by afr. There is a + * possibility that they differ even when both the directory changelog xattrs + * suggest everything is fine. So if there is at least one 'source' check among + * the sources which has the maximum quota size. Otherwise check among all the + * available ones for maximum quota size. This way if there is a source and + * stale copies it always votes for the 'source'. + * */ + +static void +afr_handle_quota_size (call_frame_t *frame, xlator_t *this) +{ + unsigned char *readable = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + struct afr_reply *replies = NULL; + int i = 0; + uint64_t size = 0; + uint64_t max_size = 0; + int readable_cnt = 0; + + local = frame->local; + priv = this->private; + replies = local->replies; + + readable = alloca0 (priv->child_count); + + afr_inode_read_subvol_get (local->inode, this, readable, 0, 0); + + readable_cnt = AFR_COUNT (readable, priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret == -1) + continue; + if (readable_cnt && !readable[i]) + continue; + if (!replies[i].xdata) + continue; + if (dict_get_uint64 (replies[i].xdata, QUOTA_SIZE_KEY, &size)) + continue; + if (size > max_size) + max_size = size; + } + + if (!max_size) + return; + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret == -1) + continue; + if (readable_cnt && !readable[i]) + continue; + if (!replies[i].xdata) + continue; + if (dict_set_uint64 (replies[i].xdata, QUOTA_SIZE_KEY, max_size)) + continue; + } } -ino64_t -afr_itransform (ino64_t ino, int child_count, int child_index) + +static void +afr_lookup_done (call_frame_t *frame, xlator_t *this) { - ino64_t scaled_ino = -1; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = -1; + int op_errno = 0; + int read_subvol = 0; + unsigned char *readable = NULL; + int event = 0; + struct afr_reply *replies = NULL; + uuid_t read_gfid = {0, }; + gf_boolean_t locked_entry = _gf_false; + gf_boolean_t can_interpret = _gf_true; - if (ino == ((uint64_t) -1)) { - scaled_ino = ((uint64_t) -1); - goto out; - } + priv = this->private; + local = frame->local; + replies = local->replies; - scaled_ino = (ino * child_count) + child_index; + locked_entry = afr_is_entry_possibly_under_txn (local, this); -out: - return scaled_ino; -} + readable = alloca0 (priv->child_count); + afr_inode_read_subvol_get (local->loc.parent, this, readable, + NULL, &event); -int -afr_deitransform_orig (ino64_t ino, int child_count) -{ - int index = -1; + /* First, check if we have an ESTALE from somewhere, + If so, propagate that so that a revalidate can be + issued + */ + op_errno = afr_final_errno (frame->local, this->private); + local->op_errno = op_errno; + if (op_errno == ESTALE) { + local->op_errno = op_errno; + local->op_ret = -1; + goto unwind; + } + + read_subvol = -1; + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + + if (locked_entry && replies[i].op_ret == -1 && + replies[i].op_errno == ENOENT) { + /* Second, check entry is still + "underway" in creation */ + local->op_ret = -1; + local->op_errno = ENOENT; + read_subvol = i; + goto unwind; + } + + if (replies[i].op_ret == -1) + continue; + + if (read_subvol == -1 || !readable[read_subvol]) { + read_subvol = i; + uuid_copy (read_gfid, replies[i].poststat.ia_gfid); + local->op_ret = 0; + } + } + + if (read_subvol == -1) + goto unwind; + /* We now have a read_subvol, which is readable[] (if there + were any). Next we look for GFID mismatches. We don't + consider a GFID mismatch as an error if read_subvol is + readable[] but the mismatching GFID subvol is not. + */ + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret == -1) { + if (priv->child_up[i]) + can_interpret = _gf_false; + continue; + } + + if (!uuid_compare (replies[i].poststat.ia_gfid, + read_gfid)) + continue; + + can_interpret = _gf_false; + + if (locked_entry) + continue; + + /* Now GFIDs mismatch. It's OK as long as this subvol + is not readable[] but read_subvol is */ + if (readable[read_subvol] && !readable[i]) + continue; + + /* LOG ERROR */ + local->op_ret = -1; + local->op_errno = EIO; + goto unwind; + } + + /* Forth, for the finalized GFID, pick the best subvolume + to return stats from. + */ + if (can_interpret) { + /* It is safe to call afr_replies_interpret() because we have + a response from all the UP subvolumes and all of them resolved + to the same GFID + */ + if (afr_replies_interpret (frame, this, local->inode)) { + read_subvol = afr_data_subvol_get (local->inode, this, + 0, 0); + afr_inode_read_subvol_reset (local->inode, this); + goto cant_interpret; + } else { + read_subvol = afr_data_subvol_get (local->inode, this, + 0, 0); + } + } else { + cant_interpret: + if (read_subvol == -1) + dict_del (replies[0].xdata, GF_CONTENT_KEY); + else + dict_del (replies[read_subvol].xdata, GF_CONTENT_KEY); + } + + afr_handle_quota_size (frame, this); - index = ino % child_count; +unwind: + if (read_subvol == -1) + read_subvol = 0; - return index; + AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, + local->inode, &local->replies[read_subvol].poststat, + local->replies[read_subvol].xdata, + &local->replies[read_subvol].postparent); } +/* + * During a lookup, some errors are more "important" than + * others in that they must be given higher priority while + * returning to the user. + * + * The hierarchy is ESTALE > ENOENT > others + */ int -afr_deitransform (ino64_t ino, int child_count) +afr_higher_errno (int32_t old_errno, int32_t new_errno) { - return 0; + if (old_errno == ENODATA || new_errno == ENODATA) + return ENODATA; + if (old_errno == ESTALE || new_errno == ESTALE) + return ESTALE; + if (old_errno == ENOENT || new_errno == ENOENT) + return ENOENT; + + return new_errno; } int -afr_self_heal_lookup_unwind (call_frame_t *frame, xlator_t *this) +afr_final_errno (afr_local_t *local, afr_private_t *priv) { - afr_local_t *local = NULL; + int i = 0; + int op_errno = 0; + int tmp_errno = 0; + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret == 0) + continue; + tmp_errno = local->replies[i].op_errno; + op_errno = afr_higher_errno (op_errno, tmp_errno); + } + + return op_errno; +} - local = frame->local; +static int +get_pathinfo_host (char *pathinfo, char *hostname, size_t size) +{ + char *start = NULL; + char *end = NULL; + int ret = -1; + int i = 0; - if (local->govinda_gOvinda && local->cont.lookup.inode) { - afr_set_split_brain (this, local->cont.lookup.inode, _gf_true); - } + if (!pathinfo) + goto out; - AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, - local->cont.lookup.inode, - &local->cont.lookup.buf, - local->cont.lookup.xattr, - &local->cont.lookup.postparent); + start = strchr (pathinfo, ':'); + if (!start) + goto out; + end = strrchr (pathinfo, ':'); + if (start == end) + goto out; - return 0; + memset (hostname, 0, size); + i = 0; + while (++start != end) + hostname[i++] = *start; + ret = 0; +out: + return ret; } - -static void -afr_lookup_collect_xattr (afr_local_t *local, xlator_t *this, - int child_index, dict_t *xattr) +int +afr_local_pathinfo (char *pathinfo, gf_boolean_t *local) { - uint32_t inodelk_count = 0; - uint32_t entrylk_count = 0; - int ret = 0; - - if (afr_sh_has_metadata_pending (xattr, child_index, this)) { - local->self_heal.need_metadata_self_heal = _gf_true; - gf_log(this->name, GF_LOG_DEBUG, - "metadata self-heal is pending for %s.", - local->loc.path); - } - - if (afr_sh_has_entry_pending (xattr, child_index, this)) { - local->self_heal.need_entry_self_heal = _gf_true; - gf_log(this->name, GF_LOG_DEBUG, - "entry self-heal is pending for %s.", local->loc.path); + int ret = 0; + char pathinfohost[1024] = {0}; + char localhost[1024] = {0}; + xlator_t *this = THIS; + + *local = _gf_false; + ret = get_pathinfo_host (pathinfo, pathinfohost, sizeof (pathinfohost)); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Invalid pathinfo: %s", + pathinfo); + goto out; } - if (afr_sh_has_data_pending (xattr, child_index, this)) { - local->self_heal.need_data_self_heal = _gf_true; - gf_log(this->name, GF_LOG_DEBUG, - "data self-heal is pending for %s.", local->loc.path); + ret = gethostname (localhost, sizeof (localhost)); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "gethostname() failed, " + "reason: %s", strerror (errno)); + goto out; } - ret = dict_get_uint32 (xattr, GLUSTERFS_INODELK_COUNT, - &inodelk_count); - if (ret == 0) - local->inodelk_count += inodelk_count; - - ret = dict_get_uint32 (xattr, GLUSTERFS_ENTRYLK_COUNT, - &entrylk_count); - if (ret == 0) - local->entrylk_count += entrylk_count; + if (!strcmp (localhost, pathinfohost)) + *local = _gf_true; +out: + return ret; } - -static void -afr_lookup_self_heal_check (xlator_t *this, afr_local_t *local, - struct iatt *buf, struct iatt *lookup_buf) +static int32_t +afr_local_discovery_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) { - if (FILETYPE_DIFFERS (buf, lookup_buf)) { - /* mismatching filetypes with same name - */ + int ret = 0; + char *pathinfo = NULL; + gf_boolean_t is_local = _gf_false; + afr_private_t *priv = NULL; + int32_t child_index = -1; - gf_log (this->name, GF_LOG_INFO, - "filetype differs for %s ", local->loc.path); - - local->govinda_gOvinda = 1; + if (op_ret != 0) { + goto out; } - if (PERMISSION_DIFFERS (buf, lookup_buf)) { - /* mismatching permissions */ - gf_log (this->name, GF_LOG_INFO, - "permissions differ for %s ", local->loc.path); - local->self_heal.need_metadata_self_heal = _gf_true; - } + priv = this->private; + child_index = (int32_t)(long)cookie; - if (OWNERSHIP_DIFFERS (buf, lookup_buf)) { - /* mismatching permissions */ - local->self_heal.need_metadata_self_heal = _gf_true; - gf_log (this->name, GF_LOG_INFO, - "ownership differs for %s ", local->loc.path); + ret = dict_get_str (dict, GF_XATTR_PATHINFO_KEY, &pathinfo); + if (ret != 0) { + goto out; } - if (SIZE_DIFFERS (buf, lookup_buf) - && IA_ISREG (buf->ia_type)) { - gf_log (this->name, GF_LOG_INFO, - "size differs for %s ", local->loc.path); - local->self_heal.need_data_self_heal = _gf_true; + ret = afr_local_pathinfo (pathinfo, &is_local); + if (ret) { + goto out; } - if (uuid_compare (buf->ia_gfid, lookup_buf->ia_gfid)) { - /* mismatching gfid */ - gf_log (this->name, GF_LOG_WARNING, - "%s: gfid different on subvolume", local->loc.path); + /* + * Note that one local subvolume will override another here. The only + * way to avoid that would be to retain extra information about whether + * the previous read_child is local, and it's just not worth it. Even + * the slowest local subvolume is far preferable to a remote one. + */ + if (is_local) { + gf_log (this->name, GF_LOG_INFO, + "selecting local read_child %s", + priv->children[child_index]->name); + priv->read_child = child_index; } +out: + STACK_DESTROY(frame->root); + return 0; } - static void -afr_lookup_done (call_frame_t *frame, xlator_t *this, struct iatt *lookup_buf) +afr_attempt_local_discovery (xlator_t *this, int32_t child_index) { - int unwind = 1; - int source = -1; - int up_count = 0; - char sh_type_str[256] = {0,}; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - - priv = this->private; - local = frame->local; - - local->cont.lookup.postparent.ia_ino = local->cont.lookup.parent_ino; - - if (local->cont.lookup.ino) { - local->cont.lookup.buf.ia_ino = local->cont.lookup.ino; - } + call_frame_t *newframe = NULL; + loc_t tmploc = {0,}; + afr_private_t *priv = this->private; - if (local->op_ret == 0) { - /* KLUDGE: assuming DHT will not itransform in - revalidate */ - if (local->cont.lookup.inode->ino) { - local->cont.lookup.buf.ia_ino = - local->cont.lookup.inode->ino; - } - } - up_count = afr_up_children_count (priv->child_count, priv->child_up); - if (up_count == 1) { - gf_log (this->name, GF_LOG_DEBUG, - "Only 1 child up - do not attempt to detect self heal"); - - goto unwind; - } - - if (local->success_count && local->enoent_count) { - local->self_heal.need_metadata_self_heal = _gf_true; - local->self_heal.need_data_self_heal = _gf_true; - local->self_heal.need_entry_self_heal = _gf_true; - gf_log(this->name, GF_LOG_INFO, - "entries are missing in lookup of %s.", - local->loc.path); - } - - if (local->success_count) { - /* check for split-brain case in previous lookup */ - if (afr_is_split_brain (this, local->cont.lookup.inode)) { - local->self_heal.need_data_self_heal = _gf_true; - gf_log(this->name, GF_LOG_WARNING, - "split brain detected during lookup of %s.", - local->loc.path); - } + newframe = create_frame(this,this->ctx->pool); + if (!newframe) { + return; } - if ((local->self_heal.need_metadata_self_heal - || local->self_heal.need_data_self_heal - || local->self_heal.need_entry_self_heal) - && ((!local->cont.lookup.is_revalidate) - || (local->op_ret != -1))) { - - if (local->inodelk_count || local->entrylk_count) { - - /* Someone else is doing self-heal on this file. - So just make a best effort to set the read-subvolume - and return */ - - if (IA_ISREG (local->cont.lookup.inode->ia_type)) { - source = afr_self_heal_get_source (this, local, local->cont.lookup.xattrs); - - if (source >= 0) { - afr_set_read_child (this, - local->cont.lookup.inode, - source); - } - } - goto unwind; - } + tmploc.gfid[sizeof(tmploc.gfid)-1] = 1; + STACK_WIND_COOKIE (newframe, afr_local_discovery_cbk, + (void *)(long)child_index, + priv->children[child_index], + priv->children[child_index]->fops->getxattr, + &tmploc, GF_XATTR_PATHINFO_KEY, NULL); +} - if (!local->cont.lookup.inode->ia_type) { - /* fix for RT #602 */ - local->cont.lookup.inode->ia_type = - lookup_buf->ia_type; - } - local->self_heal.background = _gf_true; - local->self_heal.type = local->cont.lookup.buf.ia_type; - local->self_heal.unwind = afr_self_heal_lookup_unwind; +int +afr_lookup_selfheal_wrap (void *opaque) +{ + call_frame_t *frame = opaque; + afr_local_t *local = NULL; + xlator_t *this = NULL; + inode_t *inode = NULL; - unwind = 0; + local = frame->local; + this = frame->this; - afr_self_heal_type_str_get(&local->self_heal, - sh_type_str, - sizeof(sh_type_str)); + afr_selfheal_name (frame->this, local->loc.pargfid, local->loc.name); - gf_log (this->name, GF_LOG_INFO, - "background %s self-heal triggered. path: %s", - sh_type_str, local->loc.path); + afr_replies_wipe (local, this->private); - afr_self_heal (frame, this); - } + inode = afr_selfheal_unlocked_lookup_on (frame, local->loc.parent, + local->loc.name, local->replies, + local->child_up); + if (inode) + inode_unref (inode); + afr_lookup_done (frame, this); -unwind: - if (unwind) { - AFR_STACK_UNWIND (lookup, frame, local->op_ret, - local->op_errno, - local->cont.lookup.inode, - &local->cont.lookup.buf, - local->cont.lookup.xattr, - &local->cont.lookup.postparent); - } + return 0; } -/* - * During a lookup, some errors are more "important" than - * others in that they must be given higher priority while - * returning to the user. - * - * The hierarchy is ESTALE > ENOENT > others - * - */ - -static gf_boolean_t -__error_more_important (int32_t old_errno, int32_t new_errno) +int +afr_lookup_entry_heal (call_frame_t *frame, xlator_t *this) { - gf_boolean_t ret = _gf_true; - - /* Nothing should ever overwrite ESTALE */ - if (old_errno == ESTALE) - ret = _gf_false; - - /* Nothing should overwrite ENOENT, except ESTALE */ - else if ((old_errno == ENOENT) && (new_errno != ESTALE)) - ret = _gf_false; - - return ret; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + call_frame_t *heal = NULL; + int i = 0, first = -1; + gf_boolean_t need_heal = _gf_false; + struct afr_reply *replies = NULL; + int ret = 0; + + local = frame->local; + replies = local->replies; + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + + if (first == -1) { + first = i; + continue; + } + + if (replies[i].op_ret != replies[first].op_ret) { + need_heal = _gf_true; + break; + } + + if (uuid_compare (replies[i].poststat.ia_gfid, + replies[first].poststat.ia_gfid)) { + need_heal = _gf_true; + break; + } + } + + if (need_heal) { + heal = copy_frame (frame); + if (heal) + heal->root->pid = -1; + ret = synctask_new (this->ctx->env, afr_lookup_selfheal_wrap, + afr_refresh_selfheal_done, heal, frame); + if (ret) + goto lookup_done; + } else { + lookup_done: + afr_lookup_done (frame, this); + } + + return ret; } int -afr_fresh_lookup_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, dict_t *xattr, - struct iatt *postparent) +afr_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, struct iatt *buf, + dict_t *xdata, struct iatt *postparent) { afr_local_t * local = NULL; - afr_private_t * priv = NULL; - struct iatt * lookup_buf = NULL; int call_count = -1; int child_index = -1; - int first_up_child = -1; - child_index = (long) cookie; - priv = this->private; + child_index = (long) cookie; - LOCK (&frame->lock); - { - local = frame->local; + local = frame->local; - lookup_buf = &local->cont.lookup.buf; + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + if (op_ret != -1) { + local->replies[child_index].poststat = *buf; + local->replies[child_index].postparent = *postparent; + if (xdata) + local->replies[child_index].xdata = dict_ref (xdata); + } - if (op_ret == -1) { - if (op_errno == ENOENT) - local->enoent_count++; + call_count = afr_frame_return (frame); + if (call_count == 0) { + afr_lookup_entry_heal (frame, this); + } - if (__error_more_important (local->op_errno, op_errno)) - local->op_errno = op_errno; + return 0; +} - if (local->op_errno == ESTALE) { - local->op_ret = -1; - } - goto unlock; - } - afr_lookup_collect_xattr (local, this, child_index, xattr); +static void +afr_discover_done (call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = -1; + int op_errno = 0; + int read_subvol = 0; - first_up_child = afr_first_up_child (priv); + priv = this->private; + local = frame->local; - if (child_index == first_up_child) { - local->cont.lookup.ino = - afr_itransform (buf->ia_ino, - priv->child_count, - first_up_child); - } + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret == 0) + local->op_ret = 0; + } - if (local->success_count == 0) { - if (local->op_errno != ESTALE) - local->op_ret = op_ret; + op_errno = afr_final_errno (frame->local, this->private); - local->cont.lookup.inode = inode_ref (inode); - local->cont.lookup.xattr = dict_ref (xattr); - local->cont.lookup.xattrs[child_index] = dict_ref (xattr); - local->cont.lookup.postparent = *postparent; + if (local->op_ret < 0) { + local->op_errno = op_errno; + local->op_ret = -1; + goto unwind; + } - if (priv->first_lookup && inode->ino == 1) { - gf_log (this->name, GF_LOG_INFO, - "added root inode"); - priv->root_inode = inode_ref (inode); - priv->first_lookup = 0; - } + afr_replies_interpret (frame, this, local->inode); - *lookup_buf = *buf; + read_subvol = afr_data_subvol_get (local->inode, this, 0, 0); + if (read_subvol == -1) { + gf_log (this->name, GF_LOG_WARNING, "no read subvols for %s", + local->loc.path); - uuid_copy (local->loc.gfid, buf->ia_gfid); - uuid_copy (local->loc.pargfid, - postparent->ia_gfid); + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid || + local->replies[i].op_ret == -1) + continue; + read_subvol = i; + break; + } + } - lookup_buf->ia_ino = afr_itransform (buf->ia_ino, - priv->child_count, - child_index); - if (priv->read_child >= 0) { - afr_set_read_child (this, - local->cont.lookup.inode, - priv->read_child); - } else { - afr_set_read_child (this, - local->cont.lookup.inode, - child_index); - } +unwind: + if (read_subvol == -1) + read_subvol = 0; - } else { - afr_lookup_self_heal_check (this, local, buf, lookup_buf); + AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, + local->inode, &local->replies[read_subvol].poststat, + local->replies[read_subvol].xdata, + &local->replies[read_subvol].postparent); +} - if (child_index == local->read_child_index) { - /* - lookup has succeeded on the read child. - So use its inode number - */ - if (local->cont.lookup.xattr) - dict_unref (local->cont.lookup.xattr); - local->cont.lookup.xattr = dict_ref (xattr); - local->cont.lookup.xattrs[child_index] = dict_ref (xattr); - local->cont.lookup.postparent = *postparent; +int +afr_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, struct iatt *buf, + dict_t *xdata, struct iatt *postparent) +{ + afr_local_t * local = NULL; + int call_count = -1; + int child_index = -1; - *lookup_buf = *buf; + child_index = (long) cookie; - uuid_copy (local->loc.gfid, buf->ia_gfid); - uuid_copy (local->loc.pargfid, - postparent->ia_gfid); - } + local = frame->local; - } + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + if (op_ret != -1) { + local->replies[child_index].poststat = *buf; + local->replies[child_index].postparent = *postparent; + if (xdata) + local->replies[child_index].xdata = dict_ref (xdata); + } - local->success_count++; - } -unlock: - UNLOCK (&frame->lock); + if (local->do_discovery && (op_ret == 0)) + afr_attempt_local_discovery (this, child_index); call_count = afr_frame_return (frame); - if (call_count == 0) { - afr_lookup_done (frame, this, lookup_buf); + afr_discover_done (frame, this); } - return 0; + return 0; } int -afr_revalidate_lookup_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, dict_t *xattr, - struct iatt *postparent) +afr_discover_do (call_frame_t *frame, xlator_t *this, int err) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - struct iatt * lookup_buf = NULL; - int call_count = -1; - int child_index = -1; - int first_up_child = -1; - - child_index = (long) cookie; - priv = this->private; - - LOCK (&frame->lock); - { - local = frame->local; - - lookup_buf = &local->cont.lookup.buf; - - if (op_ret == -1) { - if (op_errno == ENOENT) - local->enoent_count++; - - if (__error_more_important (local->op_errno, op_errno)) - local->op_errno = op_errno; - - if (local->op_errno == ESTALE) { - local->op_ret = -1; - } + int ret = 0; + int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + + local = frame->local; + priv = this->private; + + if (err) { + local->op_errno = -err; + ret = -1; + goto out; + } + + call_count = local->call_count = AFR_COUNT (local->child_up, + priv->child_count); + + ret = afr_lookup_xattr_req_prepare (local, this, local->xattr_req, + &local->loc); + if (ret) { + local->op_errno = -ret; + ret = -1; + goto out; + } - goto unlock; + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_discover_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->lookup, + &local->loc, local->xattr_req); + if (!--call_count) + break; } + } - afr_lookup_collect_xattr (local, this, child_index, xattr); + return 0; +out: + AFR_STACK_UNWIND (lookup, frame, -1, local->op_errno, 0, 0, 0, 0); + return 0; +} - first_up_child = afr_first_up_child (priv); - if (child_index == first_up_child) { - local->cont.lookup.ino = - afr_itransform (buf->ia_ino, - priv->child_count, - first_up_child); - } +int +afr_discover (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) +{ + int op_errno = ENOMEM; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int event = 0; - /* in case of revalidate, we need to send stat of the - * child whose stat was sent during the first lookup. - * (so that time stamp does not vary with revalidate. - * in case it is down, stat of the fist success will - * be replied */ + priv = this->private; - /* inode number should be preserved across revalidates */ + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - if (local->success_count == 0) { - if (local->op_errno != ESTALE) - local->op_ret = op_ret; + if (!local->call_count) { + op_errno = ENOTCONN; + goto out; + } - local->cont.lookup.inode = inode_ref (inode); - local->cont.lookup.xattr = dict_ref (xattr); - local->cont.lookup.xattrs[child_index] = dict_ref (xattr); - local->cont.lookup.postparent = *postparent; + if (__is_root_gfid (loc->inode->gfid)) { + if (!this->itable) + this->itable = loc->inode->table; + if (!priv->root_inode) + priv->root_inode = inode_ref (loc->inode); - *lookup_buf = *buf; + if (priv->choose_local && !priv->did_discovery) { + /* Logic to detect which subvolumes of AFR are + local, in order to prefer them for reads + */ + local->do_discovery = _gf_true; + priv->did_discovery = _gf_true; + } + } - lookup_buf->ia_ino = afr_itransform (buf->ia_ino, - priv->child_count, - child_index); + local->op = GF_FOP_LOOKUP; - if (priv->read_child >= 0) { - afr_set_read_child (this, - local->cont.lookup.inode, - priv->read_child); - } else { - afr_set_read_child (this, - local->cont.lookup.inode, - child_index); - } + loc_copy (&local->loc, loc); - } else { - afr_lookup_self_heal_check (this, local, buf, lookup_buf); + local->inode = inode_ref (loc->inode); - if (child_index == local->read_child_index) { + if (xattr_req) + /* If xattr_req was null, afr_lookup_xattr_req_prepare() will + allocate one for us */ + local->xattr_req = dict_ref (xattr_req); - /* - lookup has succeeded on the read child. - So use its inode number - */ + if (uuid_is_null (loc->inode->gfid)) { + afr_discover_do (frame, this, 0); + return 0; + } - if (local->cont.lookup.xattr) - dict_unref (local->cont.lookup.xattr); + afr_read_subvol_get (loc->inode, this, NULL, &event, + AFR_DATA_TRANSACTION); - local->cont.lookup.xattr = dict_ref (xattr); - local->cont.lookup.xattrs[child_index] = dict_ref (xattr); - local->cont.lookup.postparent = *postparent; + if (event != local->event_generation) + afr_inode_refresh (frame, this, loc->inode, afr_discover_do); + else + afr_discover_do (frame, this, 0); - *lookup_buf = *buf; - } + return 0; +out: + AFR_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); + return 0; +} - } - local->success_count++; +int +afr_lookup_do (call_frame_t *frame, xlator_t *this, int err) +{ + int ret = 0; + int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + + local = frame->local; + priv = this->private; + + if (err < 0) { + local->op_errno = -err; + ret = -1; + goto out; + } + + call_count = local->call_count = AFR_COUNT (local->child_up, + priv->child_count); + + ret = afr_lookup_xattr_req_prepare (local, this, local->xattr_req, + &local->loc); + if (ret) { + local->op_errno = -ret; + ret = -1; + goto out; } -unlock: - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - if (call_count == 0) { - afr_lookup_done (frame, this, lookup_buf); + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_lookup_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->lookup, + &local->loc, local->xattr_req); + if (!--call_count) + break; + } } - - return 0; + return 0; +out: + AFR_STACK_UNWIND (lookup, frame, -1, local->op_errno, 0, 0, 0, 0); + return 0; } +/* + * afr_lookup() + * + * The goal here is to figure out what the element getting looked up is. + * i.e what is the GFID, inode type and a conservative estimate of the + * inode attributes are. + * + * As we lookup, operations may be underway on the entry name and the + * inode. In lookup() we are primarily concerned only with the entry + * operations. If the entry is getting unlinked or renamed, we detect + * what operation is underway by querying for on-going transactions and + * pending self-healing on the entry through xdata. + * + * If the entry is a file/dir, it may need self-heal and/or in a + * split-brain condition. Lookup is not the place to worry about these + * conditions. Outcast marking will naturally handle them in the read + * paths. + * + * Here is a brief goal of what we are trying to achieve: + * + * - LOOKUP on all subvolumes concurrently, querying on-going transaction + * and pending self-heal info from the servers. + * + * - If all servers reply the same inode type and GFID, the overall call + * MUST be a success. + * + * - If inode types or GFIDs mismatch, and there IS either an on-going + * transaction or pending self-heal, inspect what the nature of the + * transaction or pending heal is, and select the appropriate subvolume's + * reply as the winner. + * + * - If inode types or GFIDs mismatch, and there are no on-going transactions + * or pending self-heal on the entry name on any of the servers, fail the + * lookup with EIO. Something has gone wrong beyond reasonable action. + */ int -afr_lookup (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *xattr_req) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int ret = -1; - int i = 0; - fop_lookup_cbk_t callback = NULL; - int call_count = 0; - uint64_t ctx = 0; - int32_t op_errno = 0; - - priv = this->private; - - ALLOC_OR_GOTO (local, afr_local_t, out); - - local->op_ret = -1; - - frame->local = local; - - if (!strcmp (loc->path, "/" GF_REPLICATE_TRASH_DIR)) { - op_errno = ENOENT; +afr_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) +{ + afr_local_t *local = NULL; + int32_t op_errno = 0; + int event = 0; + + if (!loc->parent) { + afr_discover (frame, this, loc, xattr_req); + return 0; + } + + if (__is_root_gfid (loc->parent->gfid)) { + if (!strcmp (loc->name, GF_REPLICATE_TRASH_DIR)) { + op_errno = EPERM; + goto out; + } + } + + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; + + if (!local->call_count) { + op_errno = ENOTCONN; goto out; } + local->op = GF_FOP_LOOKUP; + loc_copy (&local->loc, loc); - ret = inode_ctx_get (loc->inode, this, &ctx); - if (ret == 0) { - /* lookup is a revalidate */ + local->inode = inode_ref (loc->inode); - callback = afr_revalidate_lookup_cbk; + if (xattr_req) + /* If xattr_req was null, afr_lookup_xattr_req_prepare() will + allocate one for us */ + local->xattr_req = dict_ref (xattr_req); - local->cont.lookup.is_revalidate = _gf_true; - local->read_child_index = afr_read_child (this, - loc->inode); - } else { - callback = afr_fresh_lookup_cbk; + afr_read_subvol_get (loc->parent, this, NULL, &event, + AFR_DATA_TRANSACTION); - LOCK (&priv->read_child_lock); - { - local->read_child_index = (++priv->read_child_rr) - % (priv->child_count); - } - UNLOCK (&priv->read_child_lock); - } + if (event != local->event_generation) + afr_inode_refresh (frame, this, loc->parent, afr_lookup_do); + else + afr_lookup_do (frame, this, 0); - if (loc->parent) - local->cont.lookup.parent_ino = loc->parent->ino; + return 0; +out: + AFR_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); - local->child_up = memdup (priv->child_up, priv->child_count); + return 0; +} - local->cont.lookup.xattrs = GF_CALLOC (priv->child_count, - sizeof (*local->cont.lookup.xattr), - gf_afr_mt_dict_t); - local->call_count = afr_up_children_count (priv->child_count, - local->child_up); - call_count = local->call_count; +/* {{{ open */ - if (local->call_count == 0) { - ret = -1; - op_errno = ENOTCONN; - goto out; - } +afr_fd_ctx_t * +__afr_fd_ctx_get (fd_t *fd, xlator_t *this) +{ + uint64_t ctx = 0; + int ret = 0; + afr_fd_ctx_t *fd_ctx = NULL; - /* By default assume ENOTCONN. On success it will be set to 0. */ - local->op_errno = ENOTCONN; + ret = __fd_ctx_get (fd, this, &ctx); - if (xattr_req == NULL) - local->xattr_req = dict_new (); - else - local->xattr_req = dict_ref (xattr_req); + if (ret < 0) { + ret = __afr_fd_ctx_set (this, fd); + if (ret < 0) + goto out; - for (i = 0; i < priv->child_count; i++) { - ret = dict_set_uint64 (local->xattr_req, priv->pending_key[i], - 3 * sizeof(int32_t)); + ret = __fd_ctx_get (fd, this, &ctx); if (ret < 0) - gf_log (this->name, GF_LOG_WARNING, - "%s: Unable to set dict value for %s", - loc->path, priv->pending_key[i]); - /* 3 = data+metadata+entry */ + goto out; } - ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_INODELK_COUNT, 0); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "%s: Unable to set dict value for %s", - loc->path, GLUSTERFS_INODELK_COUNT); - } + fd_ctx = (afr_fd_ctx_t *)(long) ctx; +out: + return fd_ctx; +} - ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_ENTRYLK_COUNT, 0); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "%s: Unable to set dict value for %s", - loc->path, GLUSTERFS_ENTRYLK_COUNT); - } - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, callback, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->lookup, - loc, local->xattr_req); - if (!--call_count) - break; - } - } +afr_fd_ctx_t * +afr_fd_ctx_get (fd_t *fd, xlator_t *this) +{ + afr_fd_ctx_t *fd_ctx = NULL; - ret = 0; -out: - if (ret == -1) - AFR_STACK_UNWIND (lookup, frame, -1, op_errno, - NULL, NULL, NULL, NULL); + LOCK(&fd->lock); + { + fd_ctx = __afr_fd_ctx_get (fd, this); + } + UNLOCK(&fd->lock); - return 0; + return fd_ctx; } -/* {{{ open */ - int -afr_fd_ctx_set (xlator_t *this, fd_t *fd) +__afr_fd_ctx_set (xlator_t *this, fd_t *fd) { afr_private_t * priv = NULL; int ret = -1; uint64_t ctx = 0; afr_fd_ctx_t * fd_ctx = NULL; + int i = 0; VALIDATE_OR_GOTO (this->private, out); VALIDATE_OR_GOTO (fd, out); priv = this->private; - LOCK (&fd->lock); - { - ret = __fd_ctx_get (fd, this, &ctx); + ret = __fd_ctx_get (fd, this, &ctx); - if (ret == 0) - goto unlock; + if (ret == 0) + goto out; - fd_ctx = GF_CALLOC (1, sizeof (afr_fd_ctx_t), - gf_afr_mt_afr_fd_ctx_t); - if (!fd_ctx) { - ret = -ENOMEM; - goto unlock; - } + fd_ctx = GF_CALLOC (1, sizeof (afr_fd_ctx_t), + gf_afr_mt_afr_fd_ctx_t); + if (!fd_ctx) { + ret = -ENOMEM; + goto out; + } - fd_ctx->pre_op_done = GF_CALLOC (sizeof (*fd_ctx->pre_op_done), - priv->child_count, - gf_afr_mt_char); - if (!fd_ctx->pre_op_done) { - ret = -ENOMEM; - goto unlock; - } + for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) { + fd_ctx->pre_op_done[i] = GF_CALLOC (sizeof (*fd_ctx->pre_op_done[i]), + priv->child_count, + gf_afr_mt_int32_t); + if (!fd_ctx->pre_op_done[i]) { + ret = -ENOMEM; + goto out; + } + } + + fd_ctx->opened_on = GF_CALLOC (sizeof (*fd_ctx->opened_on), + priv->child_count, + gf_afr_mt_int32_t); + if (!fd_ctx->opened_on) { + ret = -ENOMEM; + goto out; + } - fd_ctx->pre_op_piggyback = GF_CALLOC (sizeof (*fd_ctx->pre_op_piggyback), - priv->child_count, - gf_afr_mt_char); - if (!fd_ctx->pre_op_piggyback) { - ret = -ENOMEM; - goto unlock; - } + for (i = 0; i < priv->child_count; i++) { + if (fd_is_anonymous (fd)) + fd_ctx->opened_on[i] = AFR_FD_OPENED; + else + fd_ctx->opened_on[i] = AFR_FD_NOT_OPENED; + } - fd_ctx->opened_on = GF_CALLOC (sizeof (*fd_ctx->opened_on), - priv->child_count, - gf_afr_mt_char); - if (!fd_ctx->opened_on) { - ret = -ENOMEM; - goto unlock; - } + fd_ctx->lock_piggyback = GF_CALLOC (sizeof (*fd_ctx->lock_piggyback), + priv->child_count, + gf_afr_mt_char); + if (!fd_ctx->lock_piggyback) { + ret = -ENOMEM; + goto out; + } - fd_ctx->up_count = priv->up_count; - fd_ctx->down_count = priv->down_count; + fd_ctx->lock_acquired = GF_CALLOC (sizeof (*fd_ctx->lock_acquired), + priv->child_count, + gf_afr_mt_char); + if (!fd_ctx->lock_acquired) { + ret = -ENOMEM; + goto out; + } - fd_ctx->locked_on = GF_CALLOC (sizeof (*fd_ctx->locked_on), - priv->child_count, - gf_afr_mt_char); - if (!fd_ctx->locked_on) { - ret = -ENOMEM; - goto unlock; - } + pthread_mutex_init (&fd_ctx->delay_lock, NULL); - ret = __fd_ctx_set (fd, this, (uint64_t)(long) fd_ctx); - if (ret) - gf_log (this->name, GF_LOG_DEBUG, - "failed to set fd ctx (%p)", fd); + INIT_LIST_HEAD (&fd_ctx->eager_locked); - INIT_LIST_HEAD (&fd_ctx->entries); - } -unlock: - UNLOCK (&fd->lock); + ret = __fd_ctx_set (fd, this, (uint64_t)(long) fd_ctx); + if (ret) + gf_log (this->name, GF_LOG_DEBUG, + "failed to set fd ctx (%p)", fd); out: return ret; } -/* {{{ flush */ int -afr_flush_unwind (call_frame_t *frame, xlator_t *this) +afr_fd_ctx_set (xlator_t *this, fd_t *fd) { - afr_local_t * local = NULL; - call_frame_t *main_frame = NULL; - - local = frame->local; + int ret = -1; - LOCK (&frame->lock); + LOCK (&fd->lock); { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (flush, main_frame, - local->op_ret, local->op_errno); + ret = __afr_fd_ctx_set (this, fd); } + UNLOCK (&fd->lock); - return 0; + return ret; } +/* {{{ flush */ int -afr_flush_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) +afr_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int call_count = -1; - int child_index = (long) cookie; - int need_unwind = 0; + afr_local_t *local = NULL; + int call_count = -1; local = frame->local; - priv = this->private; LOCK (&frame->lock); { - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - } - local->success_count++; - - if (local->success_count == priv->wait_count) { - need_unwind = 1; - } - } - - local->op_errno = op_errno; + local->op_ret = op_ret; + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref (xdata); + } else { + local->op_errno = op_errno; + } } UNLOCK (&frame->lock); - if (need_unwind) - afr_flush_unwind (frame, this); + call_count = afr_frame_return (frame); - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.resume (frame, this); - } + if (call_count == 0) + AFR_STACK_UNWIND (flush, frame, local->op_ret, + local->op_errno, local->xdata_rsp); return 0; } - -int -afr_flush_wind (call_frame_t *frame, xlator_t *this) +static int +afr_flush_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int i = 0; - int call_count = -1; + int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; - local = frame->local; priv = this->private; - - call_count = afr_up_children_count (priv->child_count, local->child_up); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; + local = frame->local; + call_count = local->call_count; for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_flush_wind_cbk, + STACK_WIND_COOKIE (frame, afr_flush_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->flush, - local->fd); - + local->fd, xdata); if (!--call_count) break; + } } return 0; } - -int -afr_flush_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - - local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - - return 0; -} - - int -afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) +afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - int ret = -1; - int op_ret = -1; - int op_errno = 0; - int call_count = 0; + afr_local_t *local = NULL; + call_stub_t *stub = NULL; + int op_errno = ENOMEM; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - priv = this->private; + if (!local->call_count) { + op_errno = ENOTCONN; + goto out; + } - ALLOC_OR_GOTO (local, afr_local_t, out); + local->fd = fd_ref(fd); - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + stub = fop_flush_stub (frame, afr_flush_wrapper, fd, xdata); + if (!stub) goto out; - } - call_count = afr_up_children_count (priv->child_count, local->child_up); - - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; - goto out; - } + afr_delayed_changelog_wake_resume (this, fd, stub); - transaction_frame->local = local; - - local->op = GF_FOP_FLUSH; - - local->transaction.fop = afr_flush_wind; - local->transaction.done = afr_flush_done; - local->transaction.unwind = afr_flush_unwind; - - local->fd = fd_ref (fd); - - local->transaction.main_frame = frame; - local->transaction.start = 0; - local->transaction.len = 0; - - afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); - - - op_ret = 0; + return 0; out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - - AFR_STACK_UNWIND (flush, frame, op_ret, op_errno); - } - + AFR_STACK_UNWIND (flush, frame, -1, op_errno, NULL); return 0; } @@ -1416,6 +2114,7 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd) uint64_t ctx = 0; afr_fd_ctx_t *fd_ctx = NULL; int ret = 0; + int i = 0; ret = fd_ctx_get (fd, this, &ctx); if (ret < 0) @@ -1424,17 +2123,22 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd) fd_ctx = (afr_fd_ctx_t *)(long) ctx; if (fd_ctx) { - if (fd_ctx->pre_op_done) - GF_FREE (fd_ctx->pre_op_done); + //no need to take any locks + if (!list_empty (&fd_ctx->eager_locked)) + gf_log (this->name, GF_LOG_WARNING, "%s: Stale " + "Eager-lock stubs found", + uuid_utoa (fd->inode->gfid)); + + for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) + GF_FREE (fd_ctx->pre_op_done[i]); + + GF_FREE (fd_ctx->opened_on); - if (fd_ctx->opened_on) - GF_FREE (fd_ctx->opened_on); + GF_FREE (fd_ctx->lock_piggyback); - if (fd_ctx->locked_on) - GF_FREE (fd_ctx->locked_on); + GF_FREE (fd_ctx->lock_acquired); - if (fd_ctx->pre_op_piggyback) - GF_FREE (fd_ctx->pre_op_piggyback); + pthread_mutex_destroy (&fd_ctx->delay_lock); GF_FREE (fd_ctx); } @@ -1447,24 +2151,8 @@ out: int afr_release (xlator_t *this, fd_t *fd) { - afr_locked_fd_t *locked_fd = NULL; - afr_locked_fd_t *tmp = NULL; - afr_private_t *priv = NULL; - - priv = this->private; - afr_cleanup_fd_ctx (this, fd); - list_for_each_entry_safe (locked_fd, tmp, &priv->saved_fds, - list) { - - if (locked_fd->fd == fd) { - list_del_init (&locked_fd->list); - GF_FREE (locked_fd); - } - - } - return 0; } @@ -1472,54 +2160,87 @@ afr_release (xlator_t *this, fd_t *fd) /* {{{ fsync */ int +afr_fsync_unwind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + AFR_STACK_UNWIND (fsync, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; +} + +int afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) + struct iatt *postbuf, dict_t *xdata) { afr_local_t *local = NULL; int call_count = -1; int child_index = (long) cookie; - int read_child = 0; + int read_subvol = 0; + call_stub_t *stub = NULL; local = frame->local; - read_child = afr_read_child (this, local->fd->inode); + read_subvol = afr_data_subvol_get (local->inode, this, 0, 0); LOCK (&frame->lock); { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } - if (op_ret == 0) { - local->op_ret = 0; + if (local->op_ret == -1) { + local->op_ret = 0; - if (local->success_count == 0) { - local->cont.fsync.prebuf = *prebuf; - local->cont.fsync.postbuf = *postbuf; - } + local->cont.inode_wfop.prebuf = *prebuf; + local->cont.inode_wfop.postbuf = *postbuf; - if (child_index == read_child) { - local->cont.fsync.prebuf = *prebuf; - local->cont.fsync.postbuf = *postbuf; + if (xdata) + local->xdata_rsp = dict_ref (xdata); } - local->success_count++; - } - - local->op_errno = op_errno; + if (child_index == read_subvol) { + local->cont.inode_wfop.prebuf = *prebuf; + local->cont.inode_wfop.postbuf = *postbuf; + if (xdata) { + if (local->xdata_rsp) + dict_unref (local->xdata_rsp); + local->xdata_rsp = dict_ref (xdata); + } + } + } else { + local->op_errno = op_errno; + } } UNLOCK (&frame->lock); call_count = afr_frame_return (frame); if (call_count == 0) { - local->cont.fsync.prebuf.ia_ino = local->cont.fsync.ino; - local->cont.fsync.postbuf.ia_ino = local->cont.fsync.ino; - - AFR_STACK_UNWIND (fsync, frame, local->op_ret, local->op_errno, - &local->cont.fsync.prebuf, - &local->cont.fsync.postbuf); + /* Make a stub out of the frame, and register it + with the waking up post-op. When the call-stub resumes, + we are guaranteed that there was no post-op pending + (i.e changelogs were unset in the server). This is an + essential "guarantee", that fsync() returns only after + completely finishing EVERYTHING, including the delayed + post-op. This guarantee is expected by FUSE graph switching + for example. + */ + stub = fop_fsync_cbk_stub (frame, afr_fsync_unwind_cbk, + local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, + local->xdata_rsp); + if (!stub) { + AFR_STACK_UNWIND (fsync, frame, -1, ENOMEM, 0, 0, 0); + return 0; + } + + /* If no new unstable writes happened between the + time we cleared the unstable write witness flag in afr_fsync + and now, calling afr_delayed_changelog_wake_up() should + wake up and skip over the fsync phase and go straight to + afr_changelog_post_op_now() + */ + afr_delayed_changelog_wake_resume (this, local->fd, stub); } return 0; @@ -1527,36 +2248,34 @@ afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int -afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t datasync) +afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata) { - afr_private_t *priv = NULL; + afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; + int32_t op_errno = ENOMEM; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + priv = this->private; - priv = this->private; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - ALLOC_OR_GOTO (local, afr_local_t, out); + call_count = local->call_count; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } + local->fd = fd_ref (fd); - call_count = local->call_count; - frame->local = local; + if (afr_fd_has_witnessed_unstable_write (this, fd)) { + /* don't care. we only wanted to CLEAR the bit */ + } - local->fd = fd_ref (fd); - local->cont.fsync.ino = fd->inode->ino; + local->inode = inode_ref (fd->inode); for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { @@ -1564,17 +2283,16 @@ afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, (void *) (long) i, priv->children[i], priv->children[i]->fops->fsync, - fd, datasync); + fd, datasync, xdata); if (!--call_count) break; } } - op_ret = 0; + return 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (fsync, frame, op_ret, op_errno, NULL, NULL); - } + AFR_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } @@ -1582,9 +2300,9 @@ out: /* {{{ fsync */ -int32_t -afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) +int +afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t *local = NULL; int call_count = -1; @@ -1593,10 +2311,13 @@ afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, LOCK (&frame->lock); { - if (op_ret == 0) + if (op_ret == 0) { local->op_ret = 0; - - local->op_errno = op_errno; + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref (xdata); + } else { + local->op_errno = op_errno; + } } UNLOCK (&frame->lock); @@ -1604,57 +2325,49 @@ afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, if (call_count == 0) AFR_STACK_UNWIND (fsyncdir, frame, local->op_ret, - local->op_errno); + local->op_errno, local->xdata_rsp); return 0; } -int32_t -afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t datasync) +int +afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata) { - afr_private_t *priv = NULL; + afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; + int32_t op_errno = ENOMEM; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; + priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; call_count = local->call_count; - frame->local = local; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND (frame, afr_fsyncdir_cbk, priv->children[i], priv->children[i]->fops->fsyncdir, - fd, datasync); + fd, datasync, xdata); if (!--call_count) break; } } - op_ret = 0; + return 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (fsyncdir, frame, op_ret, op_errno); - } + AFR_STACK_UNWIND (fsyncdir, frame, -1, op_errno, NULL); + return 0; } @@ -1665,7 +2378,7 @@ out: int32_t afr_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *xattr) + dict_t *xattr, dict_t *xdata) { afr_local_t *local = NULL; int call_count = -1; @@ -1674,8 +2387,15 @@ afr_xattrop_cbk (call_frame_t *frame, void *cookie, LOCK (&frame->lock); { - if (op_ret == 0) + if (op_ret == 0) { + if (!local->cont.xattrop.xattr) + local->cont.xattrop.xattr = dict_ref (xattr); + + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref (xdata); + local->op_ret = 0; + } local->op_errno = op_errno; } @@ -1685,7 +2405,7 @@ afr_xattrop_cbk (call_frame_t *frame, void *cookie, if (call_count == 0) AFR_STACK_UNWIND (xattrop, frame, local->op_ret, local->op_errno, - xattr); + local->cont.xattrop.xattr, local->xdata_rsp); return 0; } @@ -1693,49 +2413,41 @@ afr_xattrop_cbk (call_frame_t *frame, void *cookie, int32_t afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, - gf_xattrop_flags_t optype, dict_t *xattr) + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int32_t op_errno = ENOMEM; priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; call_count = local->call_count; - frame->local = local; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND (frame, afr_xattrop_cbk, priv->children[i], priv->children[i]->fops->xattrop, - loc, optype, xattr); + loc, optype, xattr, xdata); if (!--call_count) break; } } - op_ret = 0; + return 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (xattrop, frame, op_ret, op_errno, NULL); - } + AFR_STACK_UNWIND (xattrop, frame, -1, op_errno, NULL, NULL); + return 0; } @@ -1746,7 +2458,7 @@ out: int32_t afr_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *xattr) + dict_t *xattr, dict_t *xdata) { afr_local_t *local = NULL; @@ -1756,8 +2468,14 @@ afr_fxattrop_cbk (call_frame_t *frame, void *cookie, LOCK (&frame->lock); { - if (op_ret == 0) + if (op_ret == 0) { + if (!local->cont.fxattrop.xattr) + local->cont.fxattrop.xattr = dict_ref (xattr); + + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref (xdata); local->op_ret = 0; + } local->op_errno = op_errno; } @@ -1767,7 +2485,7 @@ afr_fxattrop_cbk (call_frame_t *frame, void *cookie, if (call_count == 0) AFR_STACK_UNWIND (fxattrop, frame, local->op_ret, local->op_errno, - xattr); + local->cont.fxattrop.xattr, local->xdata_rsp); return 0; } @@ -1775,49 +2493,41 @@ afr_fxattrop_cbk (call_frame_t *frame, void *cookie, int32_t afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, - gf_xattrop_flags_t optype, dict_t *xattr) + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_ret = -1; int32_t op_errno = 0; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) goto out; - } call_count = local->call_count; - frame->local = local; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND (frame, afr_fxattrop_cbk, priv->children[i], priv->children[i]->fops->fxattrop, - fd, optype, xattr); + fd, optype, xattr, xdata); if (!--call_count) break; } } - op_ret = 0; + return 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (fxattrop, frame, op_ret, op_errno, NULL); - } + AFR_STACK_UNWIND (fxattrop, frame, -1, op_errno, NULL, NULL); + return 0; } @@ -1825,8 +2535,8 @@ out: int32_t -afr_inodelk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) +afr_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t *local = NULL; @@ -1847,7 +2557,7 @@ afr_inodelk_cbk (call_frame_t *frame, void *cookie, if (call_count == 0) AFR_STACK_UNWIND (inodelk, frame, local->op_ret, - local->op_errno); + local->op_errno, xdata); return 0; } @@ -1855,57 +2565,50 @@ afr_inodelk_cbk (call_frame_t *frame, void *cookie, int32_t afr_inodelk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, int32_t cmd, struct gf_flock *flock) + const char *volume, loc_t *loc, int32_t cmd, + struct gf_flock *flock, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int32_t op_errno = ENOMEM; priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) goto out; - } call_count = local->call_count; - frame->local = local; + if (!call_count) { + op_errno = ENOMEM; + goto out; + } for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND (frame, afr_inodelk_cbk, priv->children[i], priv->children[i]->fops->inodelk, - volume, loc, cmd, flock); + volume, loc, cmd, flock, xdata); if (!--call_count) break; } } - op_ret = 0; + return 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (inodelk, frame, op_ret, op_errno); - } + AFR_STACK_UNWIND (inodelk, frame, -1, op_errno, NULL); + return 0; } int32_t -afr_finodelk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) +afr_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t *local = NULL; @@ -1926,66 +2629,57 @@ afr_finodelk_cbk (call_frame_t *frame, void *cookie, if (call_count == 0) AFR_STACK_UNWIND (finodelk, frame, local->op_ret, - local->op_errno); + local->op_errno, xdata); return 0; } int32_t -afr_finodelk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *flock) +afr_finodelk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + int32_t cmd, struct gf_flock *flock, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int32_t op_errno = ENOMEM; priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; call_count = local->call_count; - frame->local = local; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND (frame, afr_finodelk_cbk, priv->children[i], priv->children[i]->fops->finodelk, - volume, fd, cmd, flock); + volume, fd, cmd, flock, xdata); if (!--call_count) break; } } - op_ret = 0; + return 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (finodelk, frame, op_ret, op_errno); - } + AFR_STACK_UNWIND (finodelk, frame, -1, op_errno, NULL); + return 0; } int32_t -afr_entrylk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) - +afr_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t *local = NULL; int call_count = -1; @@ -2005,67 +2699,59 @@ afr_entrylk_cbk (call_frame_t *frame, void *cookie, if (call_count == 0) AFR_STACK_UNWIND (entrylk, frame, local->op_ret, - local->op_errno); + local->op_errno, xdata); return 0; } -int32_t -afr_entrylk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, - const char *basename, entrylk_cmd cmd, entrylk_type type) +int +afr_entrylk (call_frame_t *frame, xlator_t *this, const char *volume, + loc_t *loc, const char *basename, entrylk_cmd cmd, + entrylk_type type, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_ret = -1; int32_t op_errno = 0; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; call_count = local->call_count; - frame->local = local; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND (frame, afr_entrylk_cbk, priv->children[i], priv->children[i]->fops->entrylk, - volume, loc, basename, cmd, type); + volume, loc, basename, cmd, type, xdata); if (!--call_count) break; } } - op_ret = 0; + return 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (entrylk, frame, op_ret, op_errno); - } + AFR_STACK_UNWIND (entrylk, frame, -1, op_errno, NULL); + return 0; } -int32_t -afr_fentrylk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) +int +afr_fentrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t *local = NULL; @@ -2086,156 +2772,148 @@ afr_fentrylk_cbk (call_frame_t *frame, void *cookie, if (call_count == 0) AFR_STACK_UNWIND (fentrylk, frame, local->op_ret, - local->op_errno); + local->op_errno, xdata); return 0; } -int32_t -afr_fentrylk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, - const char *basename, entrylk_cmd cmd, entrylk_type type) +int +afr_fentrylk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; int i = 0; int32_t call_count = 0;< |
