diff options
Diffstat (limited to 'xlators/cluster')
52 files changed, 36511 insertions, 0 deletions
diff --git a/xlators/cluster/Makefile.am b/xlators/cluster/Makefile.am new file mode 100644 index 000000000..a6ddb3564 --- /dev/null +++ b/xlators/cluster/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = unify stripe afr dht ha map + +CLEANFILES = diff --git a/xlators/cluster/afr/Makefile.am b/xlators/cluster/afr/Makefile.am new file mode 100644 index 000000000..d471a3f92 --- /dev/null +++ b/xlators/cluster/afr/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/cluster/afr/src/Makefile.am b/xlators/cluster/afr/src/Makefile.am new file mode 100644 index 000000000..1bde9e5ba --- /dev/null +++ b/xlators/cluster/afr/src/Makefile.am @@ -0,0 +1,20 @@ +xlator_LTLIBRARIES = afr.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster + +afr_la_LDFLAGS = -module -avoidversion + +afr_la_SOURCES = afr.c afr-dir-read.c afr-dir-write.c afr-inode-read.c afr-inode-write.c afr-transaction.c afr-self-heal-data.c afr-self-heal-common.c afr-self-heal-metadata.c afr-self-heal-entry.c +afr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = afr.h afr-transaction.h afr-inode-write.h afr-inode-read.h afr-dir-read.h afr-dir-write.h afr-self-heal.h afr-self-heal-common.h + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) + +CLEANFILES = + +uninstall-local: + rm -f $(DESTDIR)$(xlatordir)/replicate.so + +install-data-hook: + ln -sf afr.so $(DESTDIR)$(xlatordir)/replicate.so
\ No newline at end of file diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c new file mode 100644 index 000000000..0c65ca852 --- /dev/null +++ b/xlators/cluster/afr/src/afr-dir-read.c @@ -0,0 +1,345 @@ +/* + Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" + +#include "afr.h" + + +int32_t +afr_opendir_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + fd_t *fd) +{ + afr_local_t * local = NULL; + + int call_count = -1; + + LOCK (&frame->lock); + { + local = frame->local; + + if (op_ret == 0) + local->op_ret = 0; + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + AFR_STACK_UNWIND (frame, local->op_ret, + local->op_errno, local->fd); + } + + return 0; +} + + +int32_t +afr_opendir (call_frame_t *frame, xlator_t *this, + loc_t *loc, fd_t *fd) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + + int child_count = 0; + int i = 0; + + int ret = -1; + int call_count = -1; + + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + child_count = priv->child_count; + + ALLOC_OR_GOTO (local, afr_local_t, out); + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + frame->local = local; + local->fd = fd_ref (fd); + + call_count = local->call_count; + + for (i = 0; i < child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_opendir_cbk, + priv->children[i], + priv->children[i]->fops->opendir, + loc, fd); + + if (!--call_count) + break; + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, fd); + } + + return 0; +} + + +/** + * Common algorithm for directory read calls: + * + * - Try the fop on the first child that is up + * - if we have failed due to ENOTCONN: + * try the next child + * + * Applicable to: readdir + */ + +int32_t +afr_readdir_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + gf_dirent_t *buf) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + xlator_t ** children = NULL; + + int unwind = 1; + int last_tried = -1; + int this_try = -1; + + priv = this->private; + children = priv->children; + + local = frame->local; + + if (op_ret == -1) { + last_tried = local->cont.readdir.last_tried; + + if (all_tried (last_tried, priv->child_count)) { + goto out; + } + + this_try = ++local->cont.readdir.last_tried; + unwind = 0; + + STACK_WIND (frame, afr_readdir_cbk, + children[this_try], + children[this_try]->fops->readdir, + local->fd, local->cont.readdir.size, + local->cont.readdir.offset); + } + +out: + if (unwind) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, buf); + } + + return 0; +} + + +int32_t +afr_readdir (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t offset) +{ + afr_private_t * priv = NULL; + xlator_t ** children = NULL; + int call_child = 0; + afr_local_t *local = NULL; + + int ret = -1; + + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + children = priv->children; + + ALLOC_OR_GOTO (local, afr_local_t, out); + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + frame->local = local; + + call_child = afr_first_up_child (priv); + if (call_child == -1) { + op_errno = ENOTCONN; + gf_log (this->name, GF_LOG_ERROR, + "no child is up :("); + goto out; + } + + local->cont.readdir.last_tried = call_child; + + local->fd = fd_ref (fd); + local->cont.readdir.size = size; + local->cont.readdir.offset = offset; + + STACK_WIND (frame, afr_readdir_cbk, + children[call_child], children[call_child]->fops->readdir, + fd, size, offset); + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + return 0; +} + + +int32_t +afr_getdents_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dir_entry_t *entry, int32_t count) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + xlator_t ** children = NULL; + + int unwind = 1; + int last_tried = -1; + int this_try = -1; + + priv = this->private; + children = priv->children; + + local = frame->local; + + if (op_ret == -1) { + last_tried = local->cont.getdents.last_tried; + + if (all_tried (last_tried, priv->child_count)) { + goto out; + } + + this_try = ++local->cont.getdents.last_tried; + unwind = 0; + + STACK_WIND (frame, afr_getdents_cbk, + children[this_try], + children[this_try]->fops->getdents, + local->fd, local->cont.getdents.size, + local->cont.getdents.offset, local->cont.getdents.flag); + } + +out: + if (unwind) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, entry, count); + } + + return 0; +} + + +int32_t +afr_getdents (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t offset, int32_t flag) +{ + afr_private_t * priv = NULL; + xlator_t ** children = NULL; + int call_child = 0; + afr_local_t *local = NULL; + + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + children = priv->children; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + call_child = afr_first_up_child (priv); + if (call_child == -1) { + op_errno = ENOTCONN; + gf_log (this->name, GF_LOG_ERROR, + "no child is up :("); + goto out; + } + + local->cont.getdents.last_tried = call_child; + + local->fd = fd_ref (fd); + + local->cont.getdents.size = size; + local->cont.getdents.offset = offset; + local->cont.getdents.flag = flag; + + frame->local = local; + + STACK_WIND (frame, afr_getdents_cbk, + children[call_child], children[call_child]->fops->getdents, + fd, size, offset, flag); + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + + diff --git a/xlators/cluster/afr/src/afr-dir-read.h b/xlators/cluster/afr/src/afr-dir-read.h new file mode 100644 index 000000000..172ec3c90 --- /dev/null +++ b/xlators/cluster/afr/src/afr-dir-read.h @@ -0,0 +1,47 @@ +/* + Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef __DIR_READ_H__ +#define __DIR_READ_H__ + + +int32_t +afr_opendir (call_frame_t *frame, xlator_t *this, + loc_t *loc, fd_t *fd); + +int32_t +afr_closedir (call_frame_t *frame, xlator_t *this, + fd_t *fd); + +int32_t +afr_readdir (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t offset); + + +int32_t +afr_getdents (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t offset, int32_t flag); + + +int32_t +afr_checksum (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t flags); + + +#endif /* __DIR_READ_H__ */ diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c new file mode 100644 index 000000000..87a6e09b5 --- /dev/null +++ b/xlators/cluster/afr/src/afr-dir-write.c @@ -0,0 +1,1786 @@ +/* + Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "afr.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" + +#include "afr.h" +#include "afr-transaction.h" + + +void +afr_build_parent_loc (loc_t *parent, loc_t *child) +{ + char *tmp = NULL; + + if (!child->parent) { + loc_copy (parent, child); + return; + } + + tmp = strdup (child->path); + parent->path = strdup (dirname (tmp)); + FREE (tmp); + + parent->name = strrchr (parent->path, '/'); + if (parent->name) + parent->name++; + + parent->inode = inode_ref (child->parent); + parent->parent = inode_parent (parent->inode, 0, NULL); + parent->ino = parent->inode->ino; +} + + +/* {{{ create */ + +int +afr_create_unwind (call_frame_t *frame, xlator_t *this) +{ + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) { + main_frame = local->transaction.main_frame; + } + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + local->cont.create.fd, + local->cont.create.inode, + &local->cont.create.buf); + return 0; +} + + +int +afr_create_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + fd_t *fd, inode_t *inode, struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = -1; + + local = frame->local; + priv = this->private; + + child_index = (long) cookie; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + local->op_ret = op_ret; + + if ((local->success_count == 0) + || (child_index == priv->read_child)) { + local->cont.create.buf = *buf; + local->cont.create.buf.st_ino = + afr_itransform (buf->st_ino, + priv->child_count, + child_index); + } + local->cont.create.inode = inode; + + local->success_count++; + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.unwind (frame, this); + + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_create_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_create_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->create, + &local->loc, + local->cont.create.flags, + local->cont.create.mode, + local->cont.create.fd); + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_create_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_create (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t flags, mode_t mode, fd_t *fd) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t * transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + loc_copy (&local->loc, loc); + + local->cont.create.flags = flags; + local->cont.create.mode = mode; + local->cont.create.fd = fd_ref (fd); + + local->transaction.fop = afr_create_wind; + local->transaction.done = afr_create_done; + local->transaction.unwind = afr_create_unwind; + + afr_build_parent_loc (&local->transaction.parent_loc, loc); + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME (loc->path); + local->transaction.pending = AFR_ENTRY_PENDING; + + afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + +/* }}} */ + +/* {{{ mknod */ + +int +afr_mknod_unwind (call_frame_t *frame, xlator_t *this) +{ + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) { + main_frame = local->transaction.main_frame; + } + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + local->cont.mknod.inode, + &local->cont.mknod.buf); + return 0; +} + + +int +afr_mknod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = -1; + + local = frame->local; + priv = this->private; + + child_index = (long) cookie; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + local->op_ret = op_ret; + + if ((local->success_count == 0) + || (child_index == priv->read_child)) { + local->cont.mknod.buf = *buf; + local->cont.mknod.buf.st_ino = + afr_itransform (buf->st_ino, + priv->child_count, + child_index); + } + local->cont.mknod.inode = inode; + + local->success_count++; + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.unwind (frame, this); + + local->transaction.resume (frame, this); + } + + return 0; +} + + +int32_t +afr_mknod_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_mknod_wind_cbk, (void *) (long) i, + priv->children[i], + priv->children[i]->fops->mknod, + &local->loc, local->cont.mknod.mode, + local->cont.mknod.dev); + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_mknod_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_mknod (call_frame_t *frame, xlator_t *this, + loc_t *loc, mode_t mode, dev_t dev) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t * transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + loc_copy (&local->loc, loc); + + local->cont.mknod.mode = mode; + local->cont.mknod.dev = dev; + + local->transaction.fop = afr_mknod_wind; + local->transaction.done = afr_mknod_done; + local->transaction.unwind = afr_mknod_unwind; + + afr_build_parent_loc (&local->transaction.parent_loc, loc); + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME (loc->path); + local->transaction.pending = AFR_ENTRY_PENDING; + + afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + +/* }}} */ + +/* {{{ mkdir */ + + +int +afr_mkdir_unwind (call_frame_t *frame, xlator_t *this) +{ + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) { + main_frame = local->transaction.main_frame; + } + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + local->cont.mkdir.inode, + &local->cont.mkdir.buf); + return 0; +} + + +int +afr_mkdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = -1; + + local = frame->local; + priv = this->private; + + child_index = (long) cookie; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + local->op_ret = op_ret; + + if ((local->success_count == 0) + || (child_index == priv->read_child)) { + local->cont.mkdir.buf = *buf; + local->cont.mkdir.buf.st_ino = + afr_itransform (buf->st_ino, priv->child_count, + child_index); + } + local->cont.mkdir.inode = inode; + + local->success_count++; + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.unwind (frame, this); + + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_mkdir_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_mkdir_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->mkdir, + &local->loc, local->cont.mkdir.mode); + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_mkdir_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_mkdir (call_frame_t *frame, xlator_t *this, + loc_t *loc, mode_t mode) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t * transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + loc_copy (&local->loc, loc); + + local->cont.mkdir.mode = mode; + + local->transaction.fop = afr_mkdir_wind; + local->transaction.done = afr_mkdir_done; + local->transaction.unwind = afr_mkdir_unwind; + + afr_build_parent_loc (&local->transaction.parent_loc, loc); + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME (loc->path); + local->transaction.pending = AFR_ENTRY_PENDING; + + afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + +/* }}} */ + +/* {{{ link */ + + +int +afr_link_unwind (call_frame_t *frame, xlator_t *this) +{ + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) { + main_frame = local->transaction.main_frame; + } + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + local->cont.link.buf.st_ino = local->cont.link.ino; + + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + local->cont.link.inode, + &local->cont.link.buf); + } + + return 0; +} + + +int +afr_link_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = -1; + + local = frame->local; + priv = this->private; + + child_index = (long) cookie; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + local->op_ret = op_ret; + + if ((local->success_count == 0) + || (child_index == priv->read_child)) { + local->cont.link.buf = *buf; + local->cont.link.buf.st_ino = + afr_itransform (buf->st_ino, priv->child_count, + child_index); + } + local->cont.link.inode = inode; + + local->success_count++; + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.unwind (frame, this); + + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_link_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_link_wind_cbk, (void *) (long) i, + priv->children[i], + priv->children[i]->fops->link, + &local->loc, + &local->newloc); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_link_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_link (call_frame_t *frame, xlator_t *this, + loc_t *oldloc, loc_t *newloc) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t * transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + loc_copy (&local->loc, oldloc); + loc_copy (&local->newloc, newloc); + + local->cont.link.ino = oldloc->inode->ino; + + local->transaction.fop = afr_link_wind; + local->transaction.done = afr_link_done; + local->transaction.unwind = afr_link_unwind; + + afr_build_parent_loc (&local->transaction.parent_loc, oldloc); + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME (oldloc->path); + local->transaction.new_basename = AFR_BASENAME (newloc->path); + local->transaction.pending = AFR_ENTRY_PENDING; + + afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + + return 0; +} + +/* }}} */ + +/* {{{ symlink */ + + +int +afr_symlink_unwind (call_frame_t *frame, xlator_t *this) +{ + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) { + main_frame = local->transaction.main_frame; + } + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + local->cont.symlink.inode, + &local->cont.symlink.buf); + return 0; +} + + +int +afr_symlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = -1; + + local = frame->local; + priv = this->private; + + child_index = (long) cookie; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + local->op_ret = op_ret; + + if ((local->success_count == 0) + || (child_index == priv->read_child)) { + local->cont.symlink.buf = *buf; + local->cont.symlink.buf.st_ino = + afr_itransform (buf->st_ino, priv->child_count, + child_index); + } + local->cont.symlink.inode = inode; + + local->success_count++; + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.unwind (frame, this); + + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_symlink_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_symlink_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->symlink, + local->cont.symlink.linkpath, + &local->loc); + + if (!--call_count) + break; + + } + } + + return 0; +} + + +int +afr_symlink_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_symlink (call_frame_t *frame, xlator_t *this, + const char *linkpath, loc_t *loc) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t * transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + loc_copy (&local->loc, loc); + + local->cont.symlink.ino = loc->inode->ino; + local->cont.symlink.linkpath = strdup (linkpath); + + local->transaction.fop = afr_symlink_wind; + local->transaction.done = afr_symlink_done; + local->transaction.unwind = afr_symlink_unwind; + + afr_build_parent_loc (&local->transaction.parent_loc, loc); + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME (loc->path); + local->transaction.pending = AFR_ENTRY_PENDING; + + afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL, NULL); + } + + return 0; +} + +/* }}} */ + +/* {{{ rename */ + +int +afr_rename_unwind (call_frame_t *frame, xlator_t *this) +{ + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) { + main_frame = local->transaction.main_frame; + } + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + local->cont.rename.buf.st_ino = local->cont.rename.ino; + + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + &local->cont.rename.buf); + } + + return 0; +} + + +int +afr_rename_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = -1; + + local = frame->local; + priv = this->private; + + child_index = (long) cookie; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if ((op_ret != -1) && (local->success_count == 0)) { + local->op_ret = op_ret; + + if (buf) { + local->cont.rename.buf = *buf; + local->cont.rename.buf.st_ino = + afr_itransform (buf->st_ino, priv->child_count, + child_index); + } + local->success_count++; + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.unwind (frame, this); + + local->transaction.resume (frame, this); + } + + return 0; +} + + +int32_t +afr_rename_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_rename_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->rename, + &local->loc, + &local->newloc); + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_rename_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_rename (call_frame_t *frame, xlator_t *this, + loc_t *oldloc, loc_t *newloc) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t * transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + loc_copy (&local->loc, oldloc); + loc_copy (&local->newloc, newloc); + + local->cont.rename.ino = oldloc->inode->ino; + + local->transaction.fop = afr_rename_wind; + local->transaction.done = afr_rename_done; + local->transaction.unwind = afr_rename_unwind; + + afr_build_parent_loc (&local->transaction.parent_loc, oldloc); + afr_build_parent_loc (&local->transaction.new_parent_loc, newloc); + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME (oldloc->path); + local->transaction.new_basename = AFR_BASENAME (newloc->path); + local->transaction.pending = AFR_ENTRY_PENDING; + + afr_transaction (transaction_frame, this, AFR_ENTRY_RENAME_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + +/* }}} */ + +/* {{{ unlink */ + +int +afr_unlink_unwind (call_frame_t *frame, xlator_t *this) +{ + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) { + main_frame = local->transaction.main_frame; + } + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno); + + return 0; +} + + +int +afr_unlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = (long) cookie; + int need_unwind = 0; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + } + local->success_count++; + + if (local->success_count == priv->wait_count) { + need_unwind = 1; + } + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.unwind (frame, this); + + local->transaction.resume (frame, this); + } + + return 0; +} + + +int32_t +afr_unlink_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_unlink_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->unlink, + &local->loc); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int32_t +afr_unlink_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int32_t +afr_unlink (call_frame_t *frame, xlator_t *this, + loc_t *loc) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t * transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + loc_copy (&local->loc, loc); + + local->transaction.fop = afr_unlink_wind; + local->transaction.done = afr_unlink_done; + local->transaction.unwind = afr_unlink_unwind; + + afr_build_parent_loc (&local->transaction.parent_loc, loc); + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME (loc->path); + local->transaction.pending = AFR_ENTRY_PENDING; + + afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + + return 0; +} + +/* }}} */ + +/* {{{ rmdir */ + + + +int +afr_rmdir_unwind (call_frame_t *frame, xlator_t *this) +{ + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) { + main_frame = local->transaction.main_frame; + } + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno); + + return 0; +} + + +int +afr_rmdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = (long) cookie; + int need_unwind = 0; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + } + local->success_count++; + + if (local->success_count == priv->wait_count) + need_unwind = 1; + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.unwind (frame, this); + + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_rmdir_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_rmdir_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->rmdir, + &local->loc); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_rmdir_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_rmdir (call_frame_t *frame, xlator_t *this, + loc_t *loc) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t * transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + loc_copy (&local->loc, loc); + + local->transaction.fop = afr_rmdir_wind; + local->transaction.done = afr_rmdir_done; + local->transaction.unwind = afr_rmdir_unwind; + + afr_build_parent_loc (&local->transaction.parent_loc, loc); + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME (loc->path); + local->transaction.pending = AFR_ENTRY_PENDING; + + afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + + return 0; +} + +/* }}} */ + +/* {{{ setdents */ + +int32_t +afr_setdents_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = (long) cookie; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if ((op_ret != -1) && (local->success_count == 0)) { + local->op_ret = op_ret; + local->success_count++; + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + + +int32_t +afr_setdents_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_setdents_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->setdents, + local->fd, local->cont.setdents.flags, + local->cont.setdents.entries, + local->cont.setdents.count); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int32_t +afr_setdents_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = frame->local; + + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); + + return 0; +} + + +int32_t +afr_setdents (call_frame_t *frame, xlator_t *this, + fd_t *fd, int32_t flags, dir_entry_t *entries, int32_t count) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + frame->local = local; + + local->fd = fd_ref (fd); + + local->cont.setdents.flags = flags; + local->cont.setdents.entries = entries; + local->cont.setdents.count = count; + + local->transaction.fop = afr_setdents_wind; + local->transaction.done = afr_setdents_done; + + local->transaction.basename = NULL; + local->transaction.pending = AFR_ENTRY_PENDING; + + afr_transaction (frame, this, AFR_ENTRY_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + + return 0; +} + +/* }}} */ diff --git a/xlators/cluster/afr/src/afr-dir-write.h b/xlators/cluster/afr/src/afr-dir-write.h new file mode 100644 index 000000000..e6e8a5e79 --- /dev/null +++ b/xlators/cluster/afr/src/afr-dir-write.h @@ -0,0 +1,59 @@ +/* + Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef __DIR_WRITE_H__ +#define __DIR_WRITE_H__ + +int32_t +afr_create (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t flags, mode_t mode, fd_t *fd); + +int32_t +afr_mknod (call_frame_t *frame, xlator_t *this, + loc_t *loc, mode_t mode, dev_t dev); + +int32_t +afr_mkdir (call_frame_t *frame, xlator_t *this, + loc_t *loc, mode_t mode); + +int32_t +afr_unlink (call_frame_t *frame, xlator_t *this, + loc_t *loc); + +int32_t +afr_rmdir (call_frame_t *frame, xlator_t *this, + loc_t *loc); + +int32_t +afr_link (call_frame_t *frame, xlator_t *this, + loc_t *oldloc, loc_t *newloc); + +int32_t +afr_rename (call_frame_t *frame, xlator_t *this, + loc_t *oldloc, loc_t *newloc); + +int32_t +afr_symlink (call_frame_t *frame, xlator_t *this, + const char *linkpath, loc_t *oldloc); + +int32_t +afr_setdents (call_frame_t *frame, xlator_t *this, + fd_t *fd, int32_t flags, dir_entry_t *entries, int32_t count); + +#endif /* __DIR_WRITE_H__ */ diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c new file mode 100644 index 000000000..a6c99ec05 --- /dev/null +++ b/xlators/cluster/afr/src/afr-inode-read.c @@ -0,0 +1,721 @@ +/* + Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "afr.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" + +#include "afr.h" + + +/** + * Common algorithm for inode read calls: + * + * - Try the fop on the first child that is up + * - if we have failed due to ENOTCONN: + * try the next child + * + * Applicable to: access, stat, fstat, readlink, getxattr + */ + +/* {{{ access */ + +int32_t +afr_access_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + xlator_t ** children = NULL; + + int unwind = 1; + int last_tried = -1; + int this_try = -1; + + priv = this->private; + children = priv->children; + + local = frame->local; + + if (op_ret == -1) { + last_tried = local->cont.access.last_tried; + + if (all_tried (last_tried, priv->child_count)) { + goto out; + } + this_try = ++local->cont.access.last_tried; + + unwind = 0; + + STACK_WIND_COOKIE (frame, afr_access_cbk, + (void *) (long) this_try, + children[this_try], + children[this_try]->fops->access, + &local->loc, local->cont.access.mask); + } + +out: + if (unwind) { + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + + return 0; +} + + +int32_t +afr_access (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t mask) +{ + afr_private_t * priv = NULL; + xlator_t ** children = NULL; + int call_child = 0; + afr_local_t *local = NULL; + + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + VALIDATE_OR_GOTO (priv->children, out); + + children = priv->children; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + call_child = afr_first_up_child (priv); + if (call_child == -1) { + op_errno = ENOTCONN; + gf_log (this->name, GF_LOG_ERROR, + "no child is up :("); + goto out; + } + + local->cont.access.last_tried = call_child; + loc_copy (&local->loc, loc); + local->cont.access.mask = mask; + + STACK_WIND_COOKIE (frame, afr_access_cbk, + (void *) (long) call_child, + children[call_child], children[call_child]->fops->access, + loc, mask); + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + return 0; +} + + +/* }}} */ + +/* {{{ stat */ + +int32_t +afr_stat_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + struct stat *buf) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + xlator_t ** children = NULL; + + int deitransform_child = -1; + + int unwind = 1; + int last_tried = -1; + int this_try = -1; + + priv = this->private; + children = priv->children; + + deitransform_child = (long) cookie; + + local = frame->local; + + if (op_ret == -1) { + retry: + last_tried = local->cont.stat.last_tried; + + if (all_tried (last_tried, priv->child_count)) { + goto out; + } + this_try = ++local->cont.stat.last_tried; + + if (this_try == deitransform_child) { + goto retry; + } + + unwind = 0; + + STACK_WIND_COOKIE (frame, afr_stat_cbk, + (void *) (long) deitransform_child, + children[this_try], + children[this_try]->fops->stat, + &local->loc); + } + +out: + if (unwind) { + if (op_ret != -1) + buf->st_ino = local->cont.stat.ino; + + AFR_STACK_UNWIND (frame, op_ret, op_errno, buf); + } + + return 0; +} + + +int32_t +afr_stat (call_frame_t *frame, xlator_t *this, + loc_t *loc) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + xlator_t ** children = NULL; + + int call_child = 0; + + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + VALIDATE_OR_GOTO (priv->children, out); + + children = priv->children; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + frame->local = local; + + call_child = afr_deitransform (loc->inode->ino, priv->child_count); + loc_copy (&local->loc, loc); + + /* + if stat fails from the deitranform'd child, we try + all children starting with the first one + */ + local->cont.stat.last_tried = -1; + local->cont.stat.ino = loc->inode->ino; + + STACK_WIND_COOKIE (frame, afr_stat_cbk, (void *) (long) call_child, + children[call_child], + children[call_child]->fops->stat, + loc); + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + + +/* }}} */ + +/* {{{ fstat */ + +int32_t +afr_fstat_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + struct stat *buf) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + xlator_t ** children = NULL; + + int deitransform_child = -1; + + int unwind = 1; + int last_tried = -1; + int this_try = -1; + + priv = this->private; + children = priv->children; + + deitransform_child = (long) cookie; + + local = frame->local; + + if (op_ret == -1) { + retry: + last_tried = local->cont.fstat.last_tried; + + if (all_tried (last_tried, priv->child_count)) { + goto out; + } + this_try = ++local->cont.fstat.last_tried; + + if (this_try == deitransform_child) { + /* + skip the deitransform'd child since if we are here + we must have already tried that child + */ + goto retry; + } + + + unwind = 0; + + STACK_WIND_COOKIE (frame, afr_fstat_cbk, + (void *) (long) deitransform_child, + children[this_try], + children[this_try]->fops->fstat, + local->fd); + } + +out: + if (unwind) { + if (op_ret != -1) + buf->st_ino = local->cont.fstat.ino; + + AFR_STACK_UNWIND (frame, op_ret, op_errno, buf); + } + + return 0; +} + + +int32_t +afr_fstat (call_frame_t *frame, xlator_t *this, + fd_t *fd) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + xlator_t ** children = NULL; + + int call_child = 0; + + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + VALIDATE_OR_GOTO (priv->children, out); + + children = priv->children; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + frame->local = local; + + VALIDATE_OR_GOTO (fd->inode, out); + + call_child = afr_deitransform (fd->inode->ino, priv->child_count); + + /* + if fstat fails from the deitranform'd child, we try + all children starting with the first one + */ + local->cont.fstat.last_tried = -1; + local->cont.fstat.ino = fd->inode->ino; + local->fd = fd_ref (fd); + + STACK_WIND_COOKIE (frame, afr_fstat_cbk, (void *) (long) call_child, + children[call_child], + children[call_child]->fops->fstat, + fd); + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + +/* }}} */ + +/* {{{ readlink */ + +int32_t +afr_readlink_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + const char *buf) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + xlator_t ** children = NULL; + + int unwind = 1; + int last_tried = -1; + int this_try = -1; + + priv = this->private; + children = priv->children; + + local = frame->local; + + if (op_ret == -1) { + last_tried = local->cont.readlink.last_tried; + + if (all_tried (last_tried, priv->child_count)) { + goto out; + } + this_try = ++local->cont.readlink.last_tried; + + unwind = 0; + STACK_WIND_COOKIE (frame, afr_readlink_cbk, + (void *) (long) this_try, + children[this_try], + children[this_try]->fops->readlink, + &local->loc, + local->cont.readlink.size); + } + +out: + if (unwind) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, buf); + } + + return 0; +} + + +int32_t +afr_readlink (call_frame_t *frame, xlator_t *this, + loc_t *loc, size_t size) +{ + afr_private_t * priv = NULL; + xlator_t ** children = NULL; + int call_child = 0; + afr_local_t *local = NULL; + + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + VALIDATE_OR_GOTO (priv->children, out); + + children = priv->children; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + frame->local = local; + + call_child = afr_first_up_child (priv); + if (call_child == -1) { + op_errno = ENOTCONN; + gf_log (this->name, GF_LOG_ERROR, + "no child is up :("); + goto out; + } + + local->cont.readlink.last_tried = call_child; + loc_copy (&local->loc, loc); + local->cont.readlink.size = size; + + STACK_WIND_COOKIE (frame, afr_readlink_cbk, + (void *) (long) call_child, + children[call_child], children[call_child]->fops->readlink, + loc, size); + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + return 0; +} + + +/* }}} */ + +/* {{{ getxattr */ + +int32_t +afr_getxattr_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + xlator_t ** children = NULL; + + int unwind = 1; + int last_tried = -1; + int this_try = -1; + + priv = this->private; + children = priv->children; + + local = frame->local; + + if (op_ret == -1) { + last_tried = local->cont.getxattr.last_tried; + + if (all_tried (last_tried, priv->child_count)) { + goto out; + } + this_try = ++local->cont.getxattr.last_tried; + + unwind = 0; + STACK_WIND_COOKIE (frame, afr_getxattr_cbk, + (void *) (long) this_try, + children[this_try], + children[this_try]->fops->getxattr, + &local->loc, + local->cont.getxattr.name); + } + +out: + if (unwind) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, dict); + } + + return 0; +} + + +int32_t +afr_getxattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *name) +{ + afr_private_t * priv = NULL; + xlator_t ** children = NULL; + int call_child = 0; + afr_local_t * local = NULL; + + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + VALIDATE_OR_GOTO (priv->children, out); + + children = priv->children; + + ALLOC_OR_GOTO (local, afr_local_t, out); + frame->local = local; + + call_child = afr_first_up_child (priv); + if (call_child == -1) { + op_errno = ENOTCONN; + gf_log (this->name, GF_LOG_ERROR, + "no child is up :("); + goto out; + } + + local->cont.getxattr.last_tried = call_child; + loc_copy (&local->loc, loc); + if (name) + local->cont.getxattr.name = strdup (name); + + STACK_WIND_COOKIE (frame, afr_getxattr_cbk, + (void *) (long) call_child, + children[call_child], children[call_child]->fops->getxattr, + loc, name); + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + return 0; +} + + +/* }}} */ + +/* {{{ readv */ + +/** + * read algorithm: + * + * if the user has specified a read subvolume, use it + * otherwise - + * use the inode number to hash it to one of the subvolumes, and + * read from there (to balance read load) + * + * if any of the above read's fail, try the children in sequence + * beginning at the beginning + */ + +int32_t +afr_readv_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + struct iovec *vector, int32_t count, struct stat *buf) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + xlator_t ** children = NULL; + + int unwind = 1; + int last_tried = -1; + int this_try = -1; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + VALIDATE_OR_GOTO (priv->children, out); + + children = priv->children; + + local = frame->local; + + if (op_ret == -1) { + retry: + last_tried = local->cont.readv.last_tried; + + if (all_tried (last_tried, priv->child_count)) { + goto out; + } + this_try = ++local->cont.readv.last_tried; + + if (this_try == priv->read_child) { + /* + skip the read child since if we are here + we must have already tried that child + */ + goto retry; + } + + unwind = 0; + + STACK_WIND_COOKIE (frame, afr_readv_cbk, + (void *) (long) this_try, + children[this_try], + children[this_try]->fops->readv, + local->fd, local->cont.readv.size, + local->cont.readv.offset); + } + +out: + if (unwind) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, vector, count, buf); + } + + return 0; +} + + +int32_t +afr_readv (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t offset) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + xlator_t ** children = NULL; + + int call_child = 0; + + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + VALIDATE_OR_GOTO (fd, out); + + priv = this->private; + children = priv->children; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + frame->local = local; + + if (priv->read_child != -1) { + call_child = priv->read_child; + + /* + if read fails from the read child, we try + all children starting with the first one + */ + local->cont.readv.last_tried = -1; + } else { + call_child = afr_first_up_child (priv); + if (call_child == -1) { + op_errno = ENOTCONN; + gf_log (this->name, GF_LOG_ERROR, + "no child is up :("); + goto out; + } + + local->cont.readv.last_tried = call_child; + } + + local->fd = fd_ref (fd); + + local->cont.readv.size = size; + local->cont.readv.offset = offset; + + STACK_WIND_COOKIE (frame, afr_readv_cbk, + (void *) (long) call_child, + children[call_child], + children[call_child]->fops->readv, + fd, size, offset); + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL, 0, NULL); + } + return 0; +} + +/* }}} */ diff --git a/xlators/cluster/afr/src/afr-inode-read.h b/xlators/cluster/afr/src/afr-inode-read.h new file mode 100644 index 000000000..6b3bd2da8 --- /dev/null +++ b/xlators/cluster/afr/src/afr-inode-read.h @@ -0,0 +1,47 @@ +/* + Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef __INODE_READ_H__ +#define __INODE_READ_H__ + +int32_t +afr_access (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t mask); + +int32_t +afr_stat (call_frame_t *frame, xlator_t *this, + loc_t *loc); + +int32_t +afr_fstat (call_frame_t *frame, xlator_t *this, + fd_t *fd); + +int32_t +afr_readlink (call_frame_t *frame, xlator_t *this, + loc_t *loc, size_t size); + +int32_t +afr_readv (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t offset); + +int32_t +afr_getxattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *name); + +#endif /* __INODE_READ_H__ */ diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c new file mode 100644 index 000000000..267350b2c --- /dev/null +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -0,0 +1,2024 @@ +/* + Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "afr.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" + +#include "afr.h" +#include "afr-transaction.h" + + +/* {{{ chmod */ + + +int +afr_chmod_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + local->cont.chmod.buf.st_ino = local->cont.chmod.ino; + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + &local->cont.chmod.buf); + } + return 0; +} + + +int +afr_chmod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = (long) cookie; + int need_unwind = 0; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + local->cont.chmod.buf = *buf; + } + local->success_count++; + + if (local->success_count == priv->wait_count) { + need_unwind = 1; + } + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + if (need_unwind) + afr_chmod_unwind (frame, this); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_chmod_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int i = 0; + int call_count = -1; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_chmod_wind_cbk, (void *) (long) i, + priv->children[i], + priv->children[i]->fops->chmod, + &local->loc, + local->cont.chmod.mode); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_chmod_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int32_t +afr_chmod (call_frame_t *frame, xlator_t *this, + loc_t *loc, mode_t mode) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t * transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + local->cont.chmod.mode = mode; + local->cont.chmod.ino = loc->inode->ino; + + local->transaction.fop = afr_chmod_wind; + local->transaction.done = afr_chmod_done; + local->transaction.unwind = afr_chmod_unwind; + + loc_copy (&local->loc, loc); + + local->transaction.main_frame = frame; + local->transaction.start = 0; + local->transaction.len = 0; + local->transaction.pending = AFR_METADATA_PENDING; + + afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + +/* }}} */ + + +/* {{{ fchmod */ + +int +afr_fchmod_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + local->cont.fchmod.buf.st_ino = local->cont.fchmod.ino; + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + &local->cont.fchmod.buf); + } + return 0; +} + + +int +afr_fchmod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = (long) cookie; + int need_unwind = 0; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + local->cont.fchmod.buf = *buf; + } + local->success_count++; + + if (local->success_count == priv->wait_count) { + need_unwind = 1; + } + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + if (need_unwind) + afr_fchmod_unwind (frame, this); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_fchmod_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int i = 0; + int call_count = -1; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_fchmod_wind_cbk, (void *) (long) i, + priv->children[i], + priv->children[i]->fops->fchmod, + local->fd, + local->cont.fchmod.mode); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_fchmod_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int32_t +afr_fchmod (call_frame_t *frame, xlator_t *this, + fd_t *fd, mode_t mode) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t * transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + local->cont.fchmod.mode = mode; + local->cont.fchmod.ino = fd->inode->ino; + + local->transaction.fop = afr_fchmod_wind; + local->transaction.done = afr_fchmod_done; + local->transaction.unwind = afr_fchmod_unwind; + + local->fd = fd_ref (fd); + + local->transaction.main_frame = frame; + local->transaction.start = 0; + local->transaction.len = 0; + local->transaction.pending = AFR_METADATA_PENDING; + + afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + +/* }}} */ + +/* {{{ chown */ + +int +afr_chown_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + local->cont.chown.buf.st_ino = local->cont.chown.ino; + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + &local->cont.chown.buf); + } + return 0; +} + + +int +afr_chown_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = (long) cookie; + int need_unwind = 0; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + local->cont.chown.buf = *buf; + } + local->success_count++; + + if (local->success_count == priv->wait_count) { + need_unwind = 1; + } + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + if (need_unwind) { + local->transaction.unwind (frame, this); + } + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_chown_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_chown_wind_cbk, (void *) (long) i, + priv->children[i], + priv->children[i]->fops->chown, + &local->loc, local->cont.chown.uid, + local->cont.chown.gid); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_chown_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_chown (call_frame_t *frame, xlator_t *this, + loc_t *loc, uid_t uid, gid_t gid) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t *transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + local->cont.chown.uid = uid; + local->cont.chown.gid = gid; + local->cont.chown.ino = loc->inode->ino; + + local->transaction.fop = afr_chown_wind; + local->transaction.done = afr_chown_done; + local->transaction.unwind = afr_chown_unwind; + + loc_copy (&local->loc, loc); + + local->transaction.main_frame = frame; + local->transaction.start = 0; + local->transaction.len = 0; + local->transaction.pending = AFR_METADATA_PENDING; + + afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + + +/* }}} */ + +/* {{{ chown */ + +int +afr_fchown_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + local->cont.fchown.buf.st_ino = local->cont.fchown.ino; + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + &local->cont.fchown.buf); + } + return 0; +} + + +int +afr_fchown_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = (long) cookie; + int need_unwind = 0; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + local->cont.fchown.buf = *buf; + } + local->success_count++; + + if (local->success_count == priv->wait_count) { + need_unwind = 1; + } + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + if (need_unwind) { + local->transaction.unwind (frame, this); + } + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_fchown_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_fchown_wind_cbk, (void *) (long) i, + priv->children[i], + priv->children[i]->fops->fchown, + local->fd, local->cont.fchown.uid, + local->cont.fchown.gid); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_fchown_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_fchown (call_frame_t *frame, xlator_t *this, + fd_t *fd, uid_t uid, gid_t gid) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t *transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + local->cont.fchown.uid = uid; + local->cont.fchown.gid = gid; + local->cont.fchown.ino = fd->inode->ino; + + local->transaction.fop = afr_fchown_wind; + local->transaction.done = afr_fchown_done; + local->transaction.unwind = afr_fchown_unwind; + + local->fd = fd_ref (fd); + + local->transaction.main_frame = frame; + local->transaction.start = 0; + local->transaction.len = 0; + local->transaction.pending = AFR_METADATA_PENDING; + + afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + +/* }}} */ + +/* {{{ writev */ + +int +afr_writev_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + local->cont.writev.buf.st_ino = local->cont.writev.ino; + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + &local->cont.writev.buf); + } + return 0; +} + + +int +afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int child_index = (long) cookie; + int call_count = -1; + int need_unwind = 0; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + local->cont.writev.buf = *buf; + } + local->success_count++; + + if (local->success_count == priv->wait_count) { + need_unwind = 1; + } + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.unwind (frame, this); + + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_writev_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int i = 0; + int call_count = -1; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_writev_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->writev, + local->fd, + local->cont.writev.vector, + local->cont.writev.count, + local->cont.writev.offset); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_writev_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + if (local->cont.writev.refs) + dict_unref (local->cont.writev.refs); + local->cont.writev.refs = NULL; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t offset) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t *transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + local->op = GF_FOP_WRITE; + local->cont.writev.vector = iov_dup (vector, count); + local->cont.writev.count = count; + local->cont.writev.offset = offset; + local->cont.writev.ino = fd->inode->ino; + + if (frame->root->req_refs) + local->cont.writev.refs = dict_ref (frame->root->req_refs); + + local->transaction.fop = afr_writev_wind; + local->transaction.done = afr_writev_done; + local->transaction.unwind = afr_writev_unwind; + + local->fd = fd_ref (fd); + + local->transaction.main_frame = frame; + if (fd->flags & O_APPEND) { + local->transaction.start = 0; + local->transaction.len = 0; + } else { + local->transaction.start = offset; + local->transaction.len = iov_length (vector, count); + } + + local->transaction.pending = AFR_DATA_PENDING; + + afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + + +/* }}} */ + +/* {{{ truncate */ + +int +afr_truncate_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + local->cont.truncate.buf.st_ino = local->cont.truncate.ino; + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + &local->cont.truncate.buf); + } + return 0; +} + + +int +afr_truncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int child_index = (long) cookie; + int call_count = -1; + int need_unwind = 0; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + local->cont.truncate.buf = *buf; + } + local->success_count++; + + if (local->success_count == priv->wait_count) { + need_unwind = 1; + } + } + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + if (need_unwind) + local->transaction.unwind (frame, this); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + + +int32_t +afr_truncate_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_truncate_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->truncate, + &local->loc, + local->cont.truncate.offset); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_truncate_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_truncate (call_frame_t *frame, xlator_t *this, + loc_t *loc, off_t offset) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t *transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + local->op_ret = -1; + + local->cont.truncate.offset = offset; + local->cont.truncate.ino = loc->inode->ino; + + local->transaction.fop = afr_truncate_wind; + local->transaction.done = afr_truncate_done; + local->transaction.unwind = afr_truncate_unwind; + + loc_copy (&local->loc, loc); + + local->transaction.main_frame = frame; + local->transaction.start = 0; + local->transaction.len = offset; + local->transaction.pending = AFR_DATA_PENDING; + + afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + + +/* }}} */ + +/* {{{ ftruncate */ + + +int +afr_ftruncate_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + local->cont.ftruncate.buf.st_ino = local->cont.ftruncate.ino; + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + &local->cont.ftruncate.buf); + } + return 0; +} + + +int +afr_ftruncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int child_index = (long) cookie; + int call_count = -1; + int need_unwind = 0; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + local->cont.ftruncate.buf = *buf; + } + local->success_count++; + + if (local->success_count == priv->wait_count) { + need_unwind = 1; + } + } + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + if (need_unwind) + local->transaction.unwind (frame, this); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_ftruncate_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_ftruncate_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->ftruncate, + local->fd, local->cont.ftruncate.offset); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_ftruncate_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_ftruncate (call_frame_t *frame, xlator_t *this, + fd_t *fd, off_t offset) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t *transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + local->op = GF_FOP_FTRUNCATE; + local->op_ret = -1; + + local->cont.ftruncate.offset = offset; + local->cont.ftruncate.ino = fd->inode->ino; + + local->transaction.fop = afr_ftruncate_wind; + local->transaction.done = afr_ftruncate_done; + local->transaction.unwind = afr_ftruncate_unwind; + + local->fd = fd_ref (fd); + + local->transaction.main_frame = frame; + local->transaction.start = 0; + local->transaction.len = offset; + local->transaction.pending = AFR_DATA_PENDING; + + afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + +/* }}} */ + +/* {{{ utimens */ + + +int +afr_utimens_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + local->cont.utimens.buf.st_ino = local->cont.utimens.ino; + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + &local->cont.utimens.buf); + } + return 0; +} + + +int +afr_utimens_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int child_index = (long) cookie; + int call_count = -1; + int need_unwind = 1; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + local->cont.utimens.buf = *buf; + } + local->success_count++; + + if (local->success_count == priv->wait_count) { + need_unwind = 1; + } + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + if (need_unwind) + local->transaction.unwind (frame, this); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_utimens_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_utimens_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->utimens, + &local->loc, + local->cont.utimens.tv); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_utimens_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_utimens (call_frame_t *frame, xlator_t *this, + loc_t *loc, struct timespec tv[2]) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t *transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + local->op_ret = -1; + + local->cont.utimens.tv[0] = tv[0]; + local->cont.utimens.tv[1] = tv[1]; + + local->cont.utimens.ino = loc->inode->ino; + + local->transaction.fop = afr_utimens_wind; + local->transaction.done = afr_utimens_done; + local->transaction.unwind = afr_utimens_unwind; + + loc_copy (&local->loc, loc); + + local->transaction.main_frame = frame; + local->transaction.start = 0; + local->transaction.len = 0; + local->transaction.pending = AFR_METADATA_PENDING; + + afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + +/* }}} */ + +/* {{{ setxattr */ + + +int +afr_setxattr_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno) + } + return 0; +} + + +int +afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int need_unwind = 0; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + } + local->success_count++; + + if (local->success_count == priv->wait_count) { + need_unwind = 1; + } + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + if (need_unwind) + local->transaction.unwind (frame, this); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_setxattr_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_setxattr_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->setxattr, + &local->loc, + local->cont.setxattr.dict, + local->cont.setxattr.flags); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_setxattr_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_setxattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, dict_t *dict, int32_t flags) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t *transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + local->op_ret = -1; + + local->cont.setxattr.dict = dict_ref (dict); + local->cont.setxattr.flags = flags; + + local->transaction.fop = afr_setxattr_wind; + local->transaction.done = afr_setxattr_done; + local->transaction.unwind = afr_setxattr_unwind; + + loc_copy (&local->loc, loc); + + local->transaction.main_frame = frame; + local->transaction.start = 0; + local->transaction.len = 0; + local->transaction.pending = AFR_METADATA_PENDING; + + afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + + return 0; +} + +/* }}} */ + +/* {{{ removexattr */ + + +int +afr_removexattr_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno) + } + return 0; +} + + +int +afr_removexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int need_unwind = 0; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + } + local->success_count++; + + if (local->success_count == priv->wait_count) { + need_unwind = 1; + } + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + if (need_unwind) + local->transaction.unwind (frame, this); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + + +int32_t +afr_removexattr_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_removexattr_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->removexattr, + &local->loc, + local->cont.removexattr.name); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_removexattr_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_removexattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *name) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t *transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + VALIDATE_OR_GOTO (loc, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + local->op_ret = -1; + + local->cont.removexattr.name = strdup (name); + + local->transaction.fop = afr_removexattr_wind; + local->transaction.done = afr_removexattr_done; + local->transaction.unwind = afr_removexattr_unwind; + + loc_copy (&local->loc, loc); + + local->transaction.main_frame = frame; + local->transaction.start = 0; + local->transaction.len = 0; + local->transaction.pending = AFR_METADATA_PENDING; + + afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + + return 0; +} diff --git a/xlators/cluster/afr/src/afr-inode-write.h b/xlators/cluster/afr/src/afr-inode-write.h new file mode 100644 index 000000000..9c0b5cad3 --- /dev/null +++ b/xlators/cluster/afr/src/afr-inode-write.h @@ -0,0 +1,63 @@ +/* + Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef __INODE_WRITE_H__ +#define __INODE_WRITE_H__ + +int32_t +afr_chmod (call_frame_t *frame, xlator_t *this, + loc_t *loc, mode_t mode); + +int32_t +afr_chown (call_frame_t *frame, xlator_t *this, + loc_t *loc, uid_t uid, gid_t gid); + +int +afr_fchown (call_frame_t *frame, xlator_t *this, + fd_t *fd, uid_t uid, gid_t gid); + +int32_t +afr_fchmod (call_frame_t *frame, xlator_t *this, + fd_t *fd, mode_t mode); + +int32_t +afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t offset); + +int32_t +afr_truncate (call_frame_t *frame, xlator_t *this, + loc_t *loc, off_t offset); + +int32_t +afr_ftruncate (call_frame_t *frame, xlator_t *this, + fd_t *fd, off_t offset); + +int32_t +afr_utimens (call_frame_t *frame, xlator_t *this, + loc_t *loc, struct timespec tv[2]); + +int32_t +afr_setxattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, dict_t *dict, int32_t flags); + +int32_t +afr_removexattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *name); + +#endif /* __INODE_WRITE_H__ */ diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c new file mode 100644 index 000000000..45d065169 --- /dev/null +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -0,0 +1,1073 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#include "glusterfs.h" +#include "xlator.h" +#include "byte-order.h" + +#include "afr.h" +#include "afr-transaction.h" +#include "afr-self-heal-common.h" +#include "afr-self-heal.h" + + +/** + * select_source - select a source and return it + * TODO: take into account option 'favorite-child' + */ + +int +afr_sh_select_source (int sources[], int child_count) +{ + int i; + for (i = 0; i < child_count; i++) + if (sources[i]) + return i; + + return -1; +} + + +/** + * sink_count - return number of sinks in sources array + */ + +int +afr_sh_sink_count (int sources[], int child_count) +{ + int i; + int sinks = 0; + for (i = 0; i < child_count; i++) + if (!sources[i]) + sinks++; + return sinks; +} + +int +afr_sh_source_count (int sources[], int child_count) +{ + int i; + int nsource = 0; + + for (i = 0; i < child_count; i++) + if (sources[i]) + nsource++; + return nsource; +} + + +int +afr_sh_supress_errenous_children (int sources[], int child_errno[], + int child_count) +{ + int i = 0; + + for (i = 0; i < child_count; i++) { + if (child_errno[i] && sources[i]) { + sources[i] = 0; + } + } + + return 0; +} + + +int +afr_sh_supress_empty_children (int sources[], dict_t *xattr[], + struct stat *buf, + int child_count, const char *key) +{ + int i = 0; + int32_t *pending = NULL; + int ret = 0; + int all_xattr_missing = 1; + + /* if the file was created by afr with xattrs */ + for (i = 0; i < child_count; i++) { + if (!xattr[i]) + continue; + + ret = dict_get_ptr (xattr[i], (char *)key, VOID(&pending)); + if (ret != 0) { + continue; + } + + all_xattr_missing = 0; + break; + } + + if (all_xattr_missing) { + /* supress 0byte files.. this avoids empty file created + by dir selfheal to overwrite the 'good' file */ + for (i = 0; i < child_count; i++) { + if (!buf[i].st_size) + sources[i] = 0; + } + goto out; + } + + + for (i = 0; i < child_count; i++) { + if (!xattr[i]) { + sources[i] = 0; + continue; + } + + ret = dict_get_ptr (xattr[i], (char *)key, VOID(&pending)); + if (ret != 0) { + sources[i] = 0; + continue; + } + + if (!pending) { + sources[i] = 0; + continue; + } + } + +out: + return 0; +} + + +void +afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this) +{ + afr_private_t * priv = this->private; + + char *buf = NULL; + char *ptr = NULL; + + int i, j; + + /* 10 digits per entry + 1 space + '[' and ']' */ + buf = MALLOC (priv->child_count * 11 + 8); + + for (i = 0; i < priv->child_count; i++) { + ptr = buf; + ptr += sprintf (ptr, "[ "); + for (j = 0; j < priv->child_count; j++) { + ptr += sprintf (ptr, "%d ", pending_matrix[i][j]); + } + ptr += sprintf (ptr, "]"); + gf_log (this->name, GF_LOG_DEBUG, + "pending_matrix: %s", buf); + } + + FREE (buf); +} + + +void +afr_sh_build_pending_matrix (int32_t *pending_matrix[], dict_t *xattr[], + int child_count, const char *key) +{ + int i = 0; + int j = 0; + int32_t *pending = NULL; + int ret = -1; + + /* start clean */ + for (i = 0; i < child_count; i++) { + for (j = 0; j < child_count; j++) { + pending_matrix[i][j] = 0; + } + } + + for (i = 0; i < child_count; i++) { + if (!xattr[i]) + continue; + + pending = NULL; + + ret = dict_get_ptr (xattr[i], (char *) key, + VOID(&pending)); + if (ret != 0) + continue; + + for (j = 0; j < child_count; j++) { + pending_matrix[i][j] = ntoh32 (pending[j]); + } + } +} + + +/** + * mark_sources: Mark all 'source' nodes and return number of source + * nodes found + */ + +int +afr_sh_mark_sources (int32_t *pending_matrix[], int sources[], int child_count) +{ + int i = 0; + int j = 0; + + int nsources = 0; + + + /* start clean */ + for (i = 0; i < child_count; i++) { + sources[i] = 0; + } + + /* + Let's 'normalize' the pending matrix first, + by disregarding all pending entries that refer + to themselves + */ + for (i = 0; i < child_count; i++) { + pending_matrix[i][i] = 0; + } + + for (i = 0; i < child_count; i++) { + for (j = 0; j < child_count; j++) { + if (pending_matrix[j][i]) + break; + } + + if (j == child_count) { + nsources++; + sources[i] = 1; + } + } + + return nsources; +} + + +void +afr_sh_pending_to_delta (int32_t *pending_matrix[], int32_t *delta_matrix[], + int success[], int child_count) +{ + int i = 0; + int j = 0; + + /* start clean */ + for (i = 0; i < child_count; i++) { + for (j = 0; j < child_count; j++) { + delta_matrix[i][j] = 0; + } + } + + for (i = 0; i < child_count; i++) { + for (j = 0; j < child_count; j++) { + if (!success[j]) + continue; + delta_matrix[i][j] = -pending_matrix[i][j]; + } + } +} + + +int +afr_sh_delta_to_xattr (int32_t *delta_matrix[], dict_t *xattr[], + int child_count, const char *key) +{ + int i = 0; + int j = 0; + + int ret = 0; + + int32_t *pending = 0; + + for (i = 0; i < child_count; i++) { + if (!xattr[i]) + continue; + + pending = CALLOC (sizeof (int32_t), child_count); + for (j = 0; j < child_count; j++) { + pending[j] = hton32 (delta_matrix[i][j]); + } + + ret = dict_set_bin (xattr[i], (char *) key, pending, + child_count * sizeof (int32_t)); + } + + return 0; +} + + +int +afr_sh_has_metadata_pending (dict_t *xattr, int child_count, xlator_t *this) +{ + afr_private_t *priv = NULL; + int32_t *pending = NULL; + void *tmp_pending = NULL; /* This is required to remove 'type-punned' warnings from gcc */ + + int ret = -1; + int i = 0; + + priv = this->private; + + ret = dict_get_ptr (xattr, AFR_METADATA_PENDING, &tmp_pending); + + if (ret != 0) + return 0; + + pending = tmp_pending; + for (i = 0; i < priv->child_count; i++) { + if (i == child_count) + continue; + if (pending[i]) + return 1; + } + + return 0; +} + + +int +afr_sh_has_data_pending (dict_t *xattr, int child_count, xlator_t *this) +{ + afr_private_t *priv = NULL; + int32_t *pending = NULL; + void *tmp_pending = NULL; /* This is required to remove 'type-punned' warnings from gcc */ + + int ret = -1; + int i = 0; + + priv = this->private; + + ret = dict_get_ptr (xattr, AFR_DATA_PENDING, &tmp_pending); + + if (ret != 0) + return 0; + + pending = tmp_pending; + for (i = 0; i < priv->child_count; i++) { + if (i == child_count) + continue; + if (pending[i]) + return 1; + } + + return 0; +} + + +int +afr_sh_has_entry_pending (dict_t *xattr, int child_count, xlator_t *this) +{ + afr_private_t *priv = NULL; + int32_t *pending = NULL; + void *tmp_pending = NULL; /* This is required to remove 'type-punned' warnings from gcc */ + + int ret = -1; + int i = 0; + + priv = this->private; + + ret = dict_get_ptr (xattr, AFR_ENTRY_PENDING, &tmp_pending); + + if (ret != 0) + return 0; + + pending = tmp_pending; + for (i = 0; i < priv->child_count; i++) { + if (i == child_count) + continue; + if (pending[i]) + return 1; + } + + return 0; +} + + + +/** + * is_matrix_zero - return true if pending matrix is all zeroes + */ + +int +afr_sh_is_matrix_zero (int32_t *pending_matrix[], int child_count) +{ + int i, j; + + for (i = 0; i < child_count; i++) + for (j = 0; j < child_count; j++) + if (pending_matrix[i][j]) + return 0; + return 1; +} + + +int +afr_sh_missing_entries_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int i = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + +// memset (sh->child_errno, 0, sizeof (int) * priv->child_count); + memset (sh->buf, 0, sizeof (struct stat) * priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + if (sh->xattr[i]) + dict_unref (sh->xattr[i]); + sh->xattr[i] = NULL; + } + + if (local->govinda_gOvinda) { + gf_log (this->name, GF_LOG_WARNING, + "aborting selfheal of %s", + local->loc.path); + sh->completion_cbk (frame, this); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "proceeding to metadata check on %s", + local->loc.path); + afr_self_heal_metadata (frame, this); + } + + return 0; +} + + +int +sh_missing_entries_unlck_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + LOCK (&frame->lock); + { + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + afr_sh_missing_entries_done (frame, this); + } + + return 0; +} + + +static int +sh_missing_entries_finish (call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; + int call_count = 0; + afr_self_heal_t *sh = NULL; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + call_count = local->child_count; + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + gf_log (this->name, GF_LOG_DEBUG, + "unlocking %"PRId64"/%s on subvolume %s", + sh->parent_loc.inode->ino, local->loc.name, + priv->children[i]->name); + + STACK_WIND (frame, sh_missing_entries_unlck_cbk, + priv->children[i], + priv->children[i]->fops->entrylk, + &sh->parent_loc, local->loc.name, + ENTRYLK_UNLOCK, ENTRYLK_WRLCK); + + if (!--call_count) + break; + } + } + return 0; +} + + +static int +sh_destroy_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int op_errno, struct stat *stbuf) +{ + STACK_DESTROY (frame->root); + return 0; +} + + +static int +sh_missing_entries_newentry_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *stbuf) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + call_frame_t *chown_frame = NULL; + int call_count = 0; + int child_index = 0; + struct stat *buf = NULL; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + buf = &sh->buf[sh->source]; + child_index = (long) cookie; + + if (op_ret == 0) { + chown_frame = copy_frame (frame); + + gf_log (this->name, GF_LOG_DEBUG, + "chown %s to %d %d on subvolume %s", + local->loc.path, buf->st_uid, buf->st_gid, + priv->children[child_index]->name); + + STACK_WIND (chown_frame, sh_destroy_cbk, + priv->children[child_index], + priv->children[child_index]->fops->chown, + &local->loc, + buf->st_uid, buf->st_gid); + } + + LOCK (&frame->lock); + { + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + sh_missing_entries_finish (frame, this); + } + + return 0; +} + + +static int +sh_missing_entries_mknod (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int i = 0; + int enoent_count = 0; + int call_count = 0; + mode_t st_mode = 0; + dev_t st_dev = 0; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + for (i = 0; i < priv->child_count; i++) + if (sh->child_errno[i] == ENOENT) + enoent_count++; + + call_count = enoent_count; + local->call_count = call_count; + + st_mode = sh->buf[sh->source].st_mode; + st_dev = sh->buf[sh->source].st_dev; + + gf_log (this->name, GF_LOG_DEBUG, + "mknod %s mode 0%o on %d subvolumes", + local->loc.path, st_mode, enoent_count); + + for (i = 0; i < priv->child_count; i++) { + if (sh->child_errno[i] == ENOENT) { + STACK_WIND_COOKIE (frame, + sh_missing_entries_newentry_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->mknod, + &local->loc, st_mode, st_dev); + if (!--call_count) + break; + } + } + + return 0; +} + + +static int +sh_missing_entries_mkdir (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int i = 0; + int enoent_count = 0; + int call_count = 0; + mode_t st_mode = 0; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + for (i = 0; i < priv->child_count; i++) + if (sh->child_errno[i] == ENOENT) + enoent_count++; + + call_count = enoent_count; + local->call_count = call_count; + + st_mode = sh->buf[sh->source].st_mode; + + gf_log (this->name, GF_LOG_DEBUG, + "mkdir %s mode 0%o on %d subvolumes", + local->loc.path, st_mode, enoent_count); + + for (i = 0; i < priv->child_count; i++) { + if (sh->child_errno[i] == ENOENT) { + STACK_WIND_COOKIE (frame, + sh_missing_entries_newentry_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->mkdir, + &local->loc, st_mode); + if (!--call_count) + break; + } + } + + return 0; +} + + +static int +sh_missing_entries_symlink (call_frame_t *frame, xlator_t *this, + const char *link) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int i = 0; + int enoent_count = 0; + int call_count = 0; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + for (i = 0; i < priv->child_count; i++) + if (sh->child_errno[i] == ENOENT) + enoent_count++; + + call_count = enoent_count; + local->call_count = call_count; + + gf_log (this->name, GF_LOG_DEBUG, + "symlink %s -> %s on %d subvolumes", + local->loc.path, link, enoent_count); + + for (i = 0; i < priv->child_count; i++) { + if (sh->child_errno[i] == ENOENT) { + STACK_WIND_COOKIE (frame, + sh_missing_entries_newentry_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->symlink, + link, &local->loc); + if (!--call_count) + break; + } + } + + return 0; +} + + +static int +sh_missing_entries_readlink_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + const char *link) +{ + if (op_ret > 0) + sh_missing_entries_symlink (frame, this, link); + else + sh_missing_entries_finish (frame, this); + + return 0; +} + + +static int +sh_missing_entries_readlink (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + STACK_WIND (frame, sh_missing_entries_readlink_cbk, + priv->children[sh->source], + priv->children[sh->source]->fops->readlink, + &local->loc, 4096); + + return 0; +} + + +static int +sh_missing_entries_create (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int type = 0; + int i = 0; + afr_private_t *priv = NULL; + int enoent_count = 0; + int govinda_gOvinda = 0; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (sh->child_errno[i]) { + if (sh->child_errno[i] == ENOENT) + enoent_count++; + } else { + if (type) { + if (type != (sh->buf[i].st_mode & S_IFMT)) + govinda_gOvinda = 1; + } else { + sh->source = i; + type = sh->buf[i].st_mode & S_IFMT; + } + } + } + + if (govinda_gOvinda) { + gf_log (this->name, GF_LOG_ERROR, + "conflicing filetypes exist for path %s. returning.", + local->loc.path); + + local->govinda_gOvinda = 1; + sh_missing_entries_finish (frame, this); + return 0; + } + + if (!type) { + gf_log (this->name, GF_LOG_ERROR, + "no source found for %s. all nodes down?. returning.", + local->loc.path); + /* subvolumes down and/or file does not exist */ + sh_missing_entries_finish (frame, this); + return 0; + } + + if (enoent_count == 0) { + gf_log (this->name, GF_LOG_ERROR, + "no missing files - %s. proceeding to metadata check", + local->loc.path); + /* proceed to next step - metadata self-heal */ + sh_missing_entries_finish (frame, this); + return 0; + } + + switch (type) { + case S_IFSOCK: + case S_IFREG: + case S_IFBLK: + case S_IFCHR: + case S_IFIFO: + sh_missing_entries_mknod (frame, this); + break; + case S_IFLNK: + sh_missing_entries_readlink (frame, this); + break; + case S_IFDIR: + sh_missing_entries_mkdir (frame, this); + break; + default: + gf_log (this->name, GF_LOG_ERROR, + "unknown file type: 0%o", type); + local->govinda_gOvinda = 1; + sh_missing_entries_finish (frame, this); + } + + return 0; +} + + +static int +sh_missing_entries_lookup_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *buf, dict_t *xattr) +{ + int child_index = 0; + afr_local_t *local = NULL; + int call_count = 0; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + child_index = (long) cookie; + + LOCK (&frame->lock); + { + if (op_ret == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "path %s on subvolume %s is of mode 0%o", + local->loc.path, + priv->children[child_index]->name, + buf->st_mode); + + local->self_heal.buf[child_index] = *buf; + } else { + gf_log (this->name, GF_LOG_WARNING, + "path %s on subvolume %s => -1 (%s)", + local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + + local->self_heal.child_errno[child_index] = op_errno; + } + + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + sh_missing_entries_create (frame, this); + } + + return 0; +} + + +static int +sh_missing_entries_lookup (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + int i = 0; + int call_count = 0; + afr_private_t *priv = NULL; + dict_t *xattr_req = NULL; + int ret = -1; + + local = frame->local; + call_count = local->child_count; + priv = this->private; + + local->call_count = call_count; + + xattr_req = dict_new(); + + if (xattr_req) + ret = dict_set_uint64 (xattr_req, AFR_ENTRY_PENDING, + priv->child_count * sizeof(int32_t)); + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + gf_log (this->name, GF_LOG_DEBUG, + "looking up %s on subvolume %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, + sh_missing_entries_lookup_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->lookup, + &local->loc, xattr_req); + + if (!--call_count) + break; + } + } + + if (xattr_req) + dict_unref (xattr_req); + + return 0; +} + + +static int +sh_missing_entries_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int call_count = 0; + int child_index = (long) cookie; + + + local = frame->local; + sh = &local->self_heal; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + sh->op_failed = 1; + + gf_log (this->name, + (op_errno == EAGAIN ? GF_LOG_DEBUG : GF_LOG_ERROR), + "locking inode of %s on child %d failed: %s", + local->loc.path, child_index, + strerror (op_errno)); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "inode of %s on child %d locked", + local->loc.path, child_index); + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + if (sh->op_failed == 1) { + sh_missing_entries_finish (frame, this); + return 0; + } + + sh_missing_entries_lookup (frame, this); + } + + return 0; +} + + +static int +afr_self_heal_missing_entries (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int i = 0; + int call_count = 0; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + gf_log (this->name, GF_LOG_DEBUG, + "attempting to recreate missing entries for path=%s", + local->loc.path); + + afr_build_parent_loc (&sh->parent_loc, &local->loc); + + call_count = local->child_count; + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, sh_missing_entries_lk_cbk, + priv->children[i], + priv->children[i]->fops->entrylk, + &sh->parent_loc, local->loc.name, + ENTRYLK_LOCK_NB, ENTRYLK_WRLCK); + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_self_heal (call_frame_t *frame, xlator_t *this, + int (*completion_cbk) (call_frame_t *, xlator_t *)) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int i = 0; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + gf_log (this->name, GF_LOG_DEBUG, + "performing self heal on %s (metadata=%d data=%d entry=%d)", + local->loc.path, + local->need_metadata_self_heal, + local->need_data_self_heal, + local->need_entry_self_heal); + + sh->completion_cbk = completion_cbk; + + sh->buf = CALLOC (priv->child_count, sizeof (struct stat)); + sh->child_errno = CALLOC (priv->child_count, sizeof (int)); + sh->success = CALLOC (priv->child_count, sizeof (int)); + sh->xattr = CALLOC (priv->child_count, sizeof (dict_t *)); + sh->sources = CALLOC (sizeof (*sh->sources), priv->child_count); + + sh->pending_matrix = CALLOC (sizeof (int32_t *), priv->child_count); + for (i = 0; i < priv->child_count; i++) { + sh->pending_matrix[i] = CALLOC (sizeof (int32_t), + priv->child_count); + } + + sh->delta_matrix = CALLOC (sizeof (int32_t *), priv->child_count); + for (i = 0; i < priv->child_count; i++) { + sh->delta_matrix[i] = CALLOC (sizeof (int32_t), + priv->child_count); + } + + if (local->success_count && local->enoent_count) { + afr_self_heal_missing_entries (frame, this); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "proceeding to metadata check on %s", + local->loc.path); + afr_sh_missing_entries_done (frame, this); + } + + return 0; +} diff --git a/xlators/cluster/afr/src/afr-self-heal-common.h b/xlators/cluster/afr/src/afr-self-heal-common.h new file mode 100644 index 000000000..9dd597f07 --- /dev/null +++ b/xlators/cluster/afr/src/afr-self-heal-common.h @@ -0,0 +1,66 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef __AFR_SELF_HEAL_COMMON_H__ +#define __AFR_SELF_HEAL_COMMON_H__ + +#define FILE_HAS_HOLES(buf) (((buf)->st_size) > ((buf)->st_blocks * 512)) + +int +afr_sh_select_source (int sources[], int child_count); + +int +afr_sh_sink_count (int sources[], int child_count); + +int +afr_sh_source_count (int sources[], int child_count); + +int +afr_sh_supress_errenous_children (int sources[], int child_errno[], + int child_count); + +int +afr_sh_supress_empty_children (int sources[], dict_t *xattr[], + struct stat *buf, + int child_count, const char *key); + +void +afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this); + +void +afr_sh_build_pending_matrix (int32_t *pending_matrix[], dict_t *xattr[], + int child_count, const char *key); + +void +afr_sh_pending_to_delta (int32_t *pending_matrix[], int32_t *delta_matrix[], + int32_t success[], int child_count); + +int +afr_sh_mark_sources (int32_t *pending_matrix[], int sources[], + int child_count); + +int +afr_sh_delta_to_xattr (int32_t *delta_matrix[], dict_t *xattr[], + int child_count, const char *key); + +int +afr_sh_is_matrix_zero (int32_t *pending_matrix[], int child_count); + + +#endif /* __AFR_SELF_HEAL_COMMON_H__ */ diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c new file mode 100644 index 000000000..3a48da485 --- /dev/null +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -0,0 +1,1030 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "afr.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" +#include "byte-order.h" + +#include "afr-transaction.h" +#include "afr-self-heal.h" +#include "afr-self-heal-common.h" + + + +int +afr_sh_data_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + /* + TODO: cleanup sh->* + */ + + gf_log (this->name, GF_LOG_DEBUG, + "self heal of %s completed", + local->loc.path); + + sh->completion_cbk (frame, this); + + return 0; +} + + +int +afr_sh_data_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_self_heal_t *sh = NULL; + int call_count = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + LOCK (&frame->lock); + { + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + fd_unref (sh->healing_fd); + sh->healing_fd = NULL; + afr_sh_data_done (frame, this); + } + + return 0; +} + + +int +afr_sh_data_close (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_self_heal_t *sh = NULL; + int i = 0; + int call_count = 0; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + if (!sh->healing_fd) { + afr_sh_data_done (frame, this); + return 0; + } + + call_count = sh->active_sinks + 1; + local->call_count = call_count; + + + /* closed source */ + gf_log (this->name, GF_LOG_DEBUG, + "closing fd of %s on %s", + local->loc.path, priv->children[sh->source]->name); + + STACK_WIND_COOKIE (frame, afr_sh_data_flush_cbk, + (void *) (long) sh->source, + priv->children[sh->source], + priv->children[sh->source]->fops->flush, + sh->healing_fd); + call_count--; + + for (i = 0; i < priv->child_count; i++) { + if (sh->sources[i] || !local->child_up[i]) + continue; + + gf_log (this->name, GF_LOG_DEBUG, + "closing fd of %s on %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_sh_data_flush_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->flush, + sh->healing_fd); + if (!--call_count) + break; + } + + return 0; +} + + +int +afr_sh_data_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t * local = NULL; + int call_count = 0; + int child_index = (long) cookie; + + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "locking inode of %s on child %d failed: %s", + local->loc.path, child_index, + strerror (op_errno)); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "inode of %s on child %d locked", + local->loc.path, child_index); + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + afr_sh_data_close (frame, this); + } + + return 0; +} + + +int +afr_sh_data_unlock (call_frame_t *frame, xlator_t *this) +{ + struct flock flock; + int i = 0; + int call_count = 0; + + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + afr_self_heal_t * sh = NULL; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + call_count = local->child_count; + + local->call_count = call_count; + + flock.l_start = 0; + flock.l_len = 0; + flock.l_type = F_UNLCK; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + gf_log (this->name, GF_LOG_DEBUG, + "unlocking %s on subvolume %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_sh_data_unlck_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->inodelk, + &local->loc, F_SETLK, &flock); + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_sh_data_finish (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + gf_log (this->name, GF_LOG_DEBUG, + "finishing data selfheal of %s", local->loc.path); + + afr_sh_data_unlock (frame, this); + + return 0; +} + + +int +afr_sh_data_erase_pending_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xattr) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + LOCK (&frame->lock); + { + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + afr_sh_data_finish (frame, this); + + return 0; +} + + +int +afr_sh_data_erase_pending (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int i = 0; + dict_t **erase_xattr = NULL; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + + afr_sh_pending_to_delta (sh->pending_matrix, sh->delta_matrix, + sh->success, priv->child_count); + + erase_xattr = CALLOC (sizeof (*erase_xattr), priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + if (sh->xattr[i]) { + call_count++; + + erase_xattr[i] = get_new_dict(); + dict_ref (erase_xattr[i]); + } + } + + afr_sh_delta_to_xattr (sh->delta_matrix, erase_xattr, + priv->child_count, AFR_DATA_PENDING); + + local->call_count = call_count; + for (i = 0; i < priv->child_count; i++) { + if (!erase_xattr[i]) + continue; + + gf_log (this->name, GF_LOG_DEBUG, + "erasing pending flags from %s on %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_sh_data_erase_pending_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->loc, + GF_XATTROP_ADD_ARRAY, erase_xattr[i]); + if (!--call_count) + break; + } + + for (i = 0; i < priv->child_count; i++) { + if (erase_xattr[i]) { + dict_unref (erase_xattr[i]); + } + } + FREE (erase_xattr); + + return 0; +} + + +int +afr_sh_data_trim_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + afr_self_heal_t *sh = NULL; + int call_count = 0; + int child_index = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + child_index = (long) cookie; + + LOCK (&frame->lock); + { + if (op_ret == -1) + gf_log (this->name, GF_LOG_ERROR, + "ftruncate of %s on subvolume %s failed (%s)", + local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + else + gf_log (this->name, GF_LOG_DEBUG, + "ftruncate of %s on subvolume %s completed", + local->loc.path, + priv->children[child_index]->name); + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + afr_sh_data_erase_pending (frame, this); + } + + return 0; +} + + +int +afr_sh_data_trim_sinks (call_frame_t *frame, xlator_t *this) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + afr_self_heal_t *sh = NULL; + int *sources = NULL; + int call_count = 0; + int i = 0; + + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + sources = sh->sources; + call_count = sh->active_sinks; + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (sources[i] || !local->child_up[i]) + continue; + + STACK_WIND_COOKIE (frame, afr_sh_data_trim_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->ftruncate, + sh->healing_fd, sh->file_size); + + if (!--call_count) + break; + } + + return 0; +} + + +int +afr_sh_data_read_write_iter (call_frame_t *frame, xlator_t *this); + +int +afr_sh_data_write_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + afr_self_heal_t *sh = NULL; + + int child_index = (long) cookie; + int call_count = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + gf_log (this->name, GF_LOG_DEBUG, + "wrote %d bytes of data from %s to child %d, offset %"PRId64"", + op_ret, local->loc.path, child_index, sh->offset - op_ret); + + LOCK (&frame->lock); + { + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "write to %s failed on subvolume %s (%s)", + local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + sh->op_failed = 1; + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + afr_sh_data_read_write_iter (frame, this); + } + + return 0; +} + + +int +afr_sh_data_read_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + struct iovec *vector, int32_t count, struct stat *buf) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + afr_self_heal_t *sh = NULL; + + int child_index = (long) cookie; + int i = 0; + int call_count = 0; + + off_t offset; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + call_count = sh->active_sinks; + + local->call_count = call_count; + + gf_log (this->name, GF_LOG_DEBUG, + "read %d bytes of data from %s on child %d, offset %"PRId64"", + op_ret, local->loc.path, child_index, sh->offset); + + if (op_ret <= 0) { + afr_sh_data_trim_sinks (frame, this); + return 0; + } + + /* what if we read less than block size? */ + offset = sh->offset; + sh->offset += op_ret; + + frame->root->req_refs = frame->root->rsp_refs; + + if (sh->file_has_holes) { + if (iov_0filled (vector, count) == 0) { + /* the iter function depends on the + sh->offset already being updated + above + */ + afr_sh_data_read_write_iter (frame, this); + goto out; + } + } + + for (i = 0; i < priv->child_count; i++) { + if (sh->sources[i] || !local->child_up[i]) + continue; + + /* this is a sink, so write to it */ + STACK_WIND_COOKIE (frame, afr_sh_data_write_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->writev, + sh->healing_fd, vector, count, offset); + + if (!--call_count) + break; + } + +out: + return 0; +} + + +int +afr_sh_data_read_write (call_frame_t *frame, xlator_t *this) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + afr_self_heal_t *sh = NULL; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + STACK_WIND_COOKIE (frame, afr_sh_data_read_cbk, + (void *) (long) sh->source, + priv->children[sh->source], + priv->children[sh->source]->fops->readv, + sh->healing_fd, sh->block_size, + sh->offset); + + return 0; +} + + +int +afr_sh_data_read_write_iter (call_frame_t *frame, xlator_t *this) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + afr_self_heal_t *sh = NULL; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + if (sh->op_failed) { + afr_sh_data_finish (frame, this); + goto out; + } + + if (sh->offset >= sh->file_size) { + gf_log (this->name, GF_LOG_DEBUG, + "closing fd's of %s", + local->loc.path); + afr_sh_data_trim_sinks (frame, this); + + goto out; + } + + afr_sh_data_read_write (frame, this); + +out: + return 0; +} + + +int +afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int child_index = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + child_index = (long) cookie; + + /* TODO: some of the open's might fail. + In that case, modify cleanup fn to send flush on those + fd's which are already open */ + + LOCK (&frame->lock); + { + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "open of %s failed on child %s (%s)", + local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + sh->op_failed = 1; + } + + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + if (sh->op_failed) { + afr_sh_data_finish (frame, this); + return 0; + } + gf_log (this->name, GF_LOG_DEBUG, + "fd for %s opened, commencing sync", + local->loc.path); + + gf_log (this->name, GF_LOG_WARNING, + "sourcing file %s from %s to other sinks", + local->loc.path, priv->children[sh->source]->name); + + afr_sh_data_read_write (frame, this); + } + + return 0; +} + + +int +afr_sh_data_open (call_frame_t *frame, xlator_t *this) +{ + int i = 0; + int call_count = 0; + + int source = -1; + int *sources = NULL; + + fd_t *fd = NULL; + + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + afr_self_heal_t *sh = NULL; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + call_count = sh->active_sinks + 1; + local->call_count = call_count; + + fd = fd_create (local->loc.inode, frame->root->pid); + sh->healing_fd = fd; + + source = local->self_heal.source; + sources = local->self_heal.sources; + + sh->block_size = 65536; + sh->file_size = sh->buf[source].st_size; + + if (FILE_HAS_HOLES (&sh->buf[source])) + sh->file_has_holes = 1; + + /* open source */ + STACK_WIND_COOKIE (frame, afr_sh_data_open_cbk, + (void *) (long) source, + priv->children[source], + priv->children[source]->fops->open, + &local->loc, O_RDONLY|O_LARGEFILE, fd); + call_count--; + + /* open sinks */ + for (i = 0; i < priv->child_count; i++) { + if(sources[i] || !local->child_up[i]) + continue; + + STACK_WIND_COOKIE (frame, afr_sh_data_open_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->open, + &local->loc, + O_WRONLY|O_LARGEFILE, fd); + + if (!--call_count) + break; + } + + return 0; +} + + +int +afr_sh_data_sync_prepare (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int active_sinks = 0; + int source = 0; + int i = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + source = sh->source; + + for (i = 0; i < priv->child_count; i++) { + if (sh->sources[i] == 0 && local->child_up[i] == 1) { + active_sinks++; + sh->success[i] = 1; + } + } + sh->success[source] = 1; + + if (active_sinks == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "no active sinks for performing self-heal on file %s", + local->loc.path); + afr_sh_data_finish (frame, this); + return 0; + } + sh->active_sinks = active_sinks; + + gf_log (this->name, GF_LOG_DEBUG, + "syncing data of %s from subvolume %s to %d active sinks", + local->loc.path, priv->children[source]->name, active_sinks); + + afr_sh_data_open (frame, this); + + return 0; +} + + +int +afr_sh_data_fix (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int nsources = 0; + int source = 0; + int i = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + afr_sh_build_pending_matrix (sh->pending_matrix, sh->xattr, + priv->child_count, AFR_DATA_PENDING); + + afr_sh_print_pending_matrix (sh->pending_matrix, this); + + + afr_sh_mark_sources (sh->pending_matrix, sh->sources, + priv->child_count); + + afr_sh_supress_empty_children (sh->sources, sh->xattr, sh->buf, + priv->child_count, AFR_DATA_PENDING); + + afr_sh_supress_errenous_children (sh->sources, sh->child_errno, + priv->child_count); + + nsources = afr_sh_source_count (sh->sources, priv->child_count); + + if ((nsources == 0) + && (priv->favorite_child != -1) + && (sh->child_errno[priv->favorite_child] == 0)) { + + gf_log (this->name, GF_LOG_WARNING, + "Picking favorite child %s as authentic source to resolve conflicting data of %s", + priv->children[priv->favorite_child]->name, + local->loc.path); + + sh->sources[priv->favorite_child] = 1; + + nsources = afr_sh_source_count (sh->sources, + priv->child_count); + } + + if (nsources == 0) { + gf_log (this->name, GF_LOG_ERROR, + "Unable to resolve conflicting data of %s. " + "Please resolve manually by deleting the file %s " + "from all but the preferred subvolume. " + "Please consider 'option favorite-child <>'", + local->loc.path, local->loc.path); + + local->govinda_gOvinda = 1; + + afr_sh_data_finish (frame, this); + return 0; + } + + source = afr_sh_select_source (sh->sources, priv->child_count); + sh->source = source; + + /* detect changes not visible through pending flags -- JIC */ + for (i = 0; i < priv->child_count; i++) { + if (i == source || sh->child_errno[i]) + continue; + + if (SIZE_DIFFERS (&sh->buf[i], &sh->buf[source])) + sh->sources[i] = 0; + } + + afr_sh_data_sync_prepare (frame, this); + + return 0; +} + + +int +afr_sh_data_lookup_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *buf, dict_t *xattr) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + + int call_count = -1; + int child_index = (long) cookie; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + LOCK (&frame->lock); + { + if (op_ret != -1) { + sh->xattr[child_index] = dict_ref (xattr); + sh->buf[child_index] = *buf; + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + afr_sh_data_fix (frame, this); + } + + return 0; +} + + +int +afr_sh_data_lookup (call_frame_t *frame, xlator_t *this) +{ + afr_self_heal_t *sh = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + dict_t *xattr_req = NULL; + + int call_count = 0; + int i = 0; + int ret = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + call_count = local->child_count; + + local->call_count = call_count; + + xattr_req = dict_new(); + if (xattr_req) + ret = dict_set_uint64 (xattr_req, AFR_DATA_PENDING, + priv->child_count * sizeof(int32_t)); + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_sh_data_lookup_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->lookup, + &local->loc, xattr_req); + if (!--call_count) + break; + } + } + + if (xattr_req) + dict_unref (xattr_req); + + return 0; +} + + +int +afr_sh_data_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int call_count = 0; + int child_index = (long) cookie; + + /* TODO: what if lock fails? */ + + local = frame->local; + sh = &local->self_heal; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + sh->op_failed = 1; + + gf_log (this->name, + (op_errno == EAGAIN ? GF_LOG_DEBUG : GF_LOG_ERROR), + "locking of %s on child %d failed: %s", + local->loc.path, child_index, + strerror (op_errno)); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "inode of %s on child %d locked", + local->loc.path, child_index); + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + if (sh->op_failed) { + afr_sh_data_finish (frame, this); + return 0; + } + + afr_sh_data_lookup (frame, this); + } + + return 0; +} + + +int +afr_sh_data_lock (call_frame_t *frame, xlator_t *this) +{ + struct flock flock; + int i = 0; + int call_count = 0; + + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + afr_self_heal_t * sh = NULL; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + call_count = local->child_count; + + local->call_count = call_count; + + flock.l_start = 0; + flock.l_len = 0; + flock.l_type = F_WRLCK; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + gf_log (this->name, GF_LOG_DEBUG, + "locking %s on subvolume %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_sh_data_lock_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->inodelk, + &local->loc, F_SETLK, &flock); + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_self_heal_data (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = this->private; + + + local = frame->local; + sh = &local->self_heal; + + if (local->need_data_self_heal && priv->data_self_heal) { + afr_sh_data_lock (frame, this); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "not doing data self heal on %s", + local->loc.path); + afr_sh_data_done (frame, this); + } + + return 0; +} + diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c new file mode 100644 index 000000000..ec341922e --- /dev/null +++ b/xlators/cluster/afr/src/afr-self-heal-entry.c @@ -0,0 +1,2038 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "afr.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" +#include "byte-order.h" + +#include "afr-transaction.h" +#include "afr-self-heal.h" +#include "afr-self-heal-common.h" + + + +int +afr_sh_entry_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + /* + TODO: cleanup sh->* + */ + + gf_log (this->name, GF_LOG_DEBUG, + "self heal of %s completed", + local->loc.path); + + sh->completion_cbk (frame, this); + + return 0; +} + + +int +afr_sh_entry_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int call_count = 0; + int child_index = (long) cookie; + + /* TODO: what if lock fails? */ + + local = frame->local; + sh = &local->self_heal; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "unlocking inode of %s on child %d failed: %s", + local->loc.path, child_index, + strerror (op_errno)); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "unlocked inode of %s on child %d", + local->loc.path, child_index); + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + if (sh->healing_fd) + fd_unref (sh->healing_fd); + sh->healing_fd = NULL; + afr_sh_entry_done (frame, this); + } + + return 0; +} + + +int +afr_sh_entry_unlock (call_frame_t *frame, xlator_t *this) +{ + int i = 0; + int call_count = 0; + + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + afr_self_heal_t * sh = NULL; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + call_count = local->child_count; + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + gf_log (this->name, GF_LOG_DEBUG, + "unlocking %s on subvolume %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_sh_entry_unlck_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->entrylk, + &local->loc, NULL, + ENTRYLK_UNLOCK, ENTRYLK_WRLCK); + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_sh_entry_finish (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + gf_log (this->name, GF_LOG_DEBUG, + "finishing entry selfheal of %s", local->loc.path); + + afr_sh_entry_unlock (frame, this); + + return 0; +} + + +int +afr_sh_entry_erase_pending_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xattr) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + LOCK (&frame->lock); + { + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + afr_sh_entry_finish (frame, this); + + return 0; +} + + +int +afr_sh_entry_erase_pending (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int i = 0; + dict_t **erase_xattr = NULL; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + + afr_sh_pending_to_delta (sh->pending_matrix, sh->delta_matrix, + sh->success, priv->child_count); + + erase_xattr = CALLOC (sizeof (*erase_xattr), priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + if (sh->xattr[i]) { + call_count++; + + erase_xattr[i] = get_new_dict(); + dict_ref (erase_xattr[i]); + } + } + + afr_sh_delta_to_xattr (sh->delta_matrix, erase_xattr, + priv->child_count, AFR_ENTRY_PENDING); + + local->call_count = call_count; + for (i = 0; i < priv->child_count; i++) { + if (!erase_xattr[i]) + continue; + + gf_log (this->name, GF_LOG_DEBUG, + "erasing pending flags from %s on %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_sh_entry_erase_pending_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->loc, + GF_XATTROP_ADD_ARRAY, erase_xattr[i]); + if (!--call_count) + break; + } + + for (i = 0; i < priv->child_count; i++) { + if (erase_xattr[i]) { + dict_unref (erase_xattr[i]); + } + } + FREE (erase_xattr); + + return 0; +} + + + +static int +next_active_source (call_frame_t *frame, xlator_t *this, + int current_active_source) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int source = -1; + int next_active_source = -1; + int i = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + source = sh->source; + + if (source != -1) { + if (current_active_source != source) + next_active_source = source; + goto out; + } + + /* + the next active sink becomes the source for the + 'conservative decision' of merging all entries + */ + + for (i = 0; i < priv->child_count; i++) { + if ((sh->sources[i] == 0) + && (local->child_up[i] == 1) + && (i > current_active_source)) { + + next_active_source = i; + break; + } + } +out: + return next_active_source; +} + + + +static int +next_active_sink (call_frame_t *frame, xlator_t *this, + int current_active_sink) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int next_active_sink = -1; + int i = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + /* + the next active sink becomes the source for the + 'conservative decision' of merging all entries + */ + + for (i = 0; i < priv->child_count; i++) { + if ((sh->sources[i] == 0) + && (local->child_up[i] == 1) + && (i > current_active_sink)) { + + next_active_sink = i; + break; + } + } + + return next_active_sink; +} + + +int +build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name) +{ + int ret = -1; + + if (!child) { + goto out; + } + + if (strcmp (parent->path, "/") == 0) + asprintf ((char **)&child->path, "/%s", name); + else + asprintf ((char **)&child->path, "%s/%s", parent->path, name); + + if (!child->path) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + child->name = strrchr (child->path, '/'); + if (child->name) + child->name++; + + child->parent = inode_ref (parent->inode); + child->inode = inode_new (parent->inode->table); + + if (!child->inode) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ret = 0; +out: + if (ret == -1) + loc_wipe (child); + + return ret; +} + + +int +afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this); + +int +afr_sh_entry_expunge_subvol (call_frame_t *frame, xlator_t *this, + int active_src); + +int +afr_sh_entry_expunge_entry_done (call_frame_t *frame, xlator_t *this, + int active_src) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int call_count = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + LOCK (&frame->lock); + { + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + afr_sh_entry_expunge_subvol (frame, this, active_src); + + return 0; +} + + +int +afr_sh_entry_expunge_remove_cbk (call_frame_t *expunge_frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_private_t *priv = NULL; + afr_local_t *expunge_local = NULL; + afr_self_heal_t *expunge_sh = NULL; + int active_src = 0; + call_frame_t *frame = NULL; + + + priv = this->private; + expunge_local = expunge_frame->local; + expunge_sh = &expunge_local->self_heal; + frame = expunge_sh->sh_frame; + + active_src = (long) cookie; + + if (op_ret == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "removed %s on %s", + expunge_local->loc.path, + priv->children[active_src]->name); + } else { + gf_log (this->name, GF_LOG_ERROR, + "removing %s on %s failed (%s)", + expunge_local->loc.path, + priv->children[active_src]->name, + strerror (op_errno)); + } + + AFR_STACK_DESTROY (expunge_frame); + afr_sh_entry_expunge_entry_done (frame, this, active_src); + + return 0; +} + + +int +afr_sh_entry_expunge_rmdir (call_frame_t *expunge_frame, xlator_t *this, + int active_src) +{ + afr_private_t *priv = NULL; + afr_local_t *expunge_local = NULL; + + priv = this->private; + expunge_local = expunge_frame->local; + + gf_log (this->name, GF_LOG_WARNING, + "removing directory %s on %s", + expunge_local->loc.path, priv->children[active_src]->name); + + STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_remove_cbk, + (void *) (long) active_src, + priv->children[active_src], + priv->children[active_src]->fops->rmdir, + &expunge_local->loc); + + return 0; +} + + +int +afr_sh_entry_expunge_unlink (call_frame_t *expunge_frame, xlator_t *this, + int active_src) +{ + afr_private_t *priv = NULL; + afr_local_t *expunge_local = NULL; + + priv = this->private; + expunge_local = expunge_frame->local; + + gf_log (this->name, GF_LOG_WARNING, + "unlinking file %s on %s", + expunge_local->loc.path, priv->children[active_src]->name); + + STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_remove_cbk, + (void *) (long) active_src, + priv->children[active_src], + priv->children[active_src]->fops->unlink, + &expunge_local->loc); + + return 0; +} + + +int +afr_sh_entry_expunge_remove (call_frame_t *expunge_frame, xlator_t *this, + int active_src, struct stat *buf) +{ + afr_private_t *priv = NULL; + afr_local_t *expunge_local = NULL; + afr_self_heal_t *expunge_sh = NULL; + int source = 0; + call_frame_t *frame = NULL; + int type = 0; + + priv = this->private; + expunge_local = expunge_frame->local; + expunge_sh = &expunge_local->self_heal; + frame = expunge_sh->sh_frame; + source = expunge_sh->source; + + type = (buf->st_mode & S_IFMT); + + switch (type) { + case S_IFSOCK: + case S_IFREG: + case S_IFBLK: + case S_IFCHR: + case S_IFIFO: + case S_IFLNK: + afr_sh_entry_expunge_unlink (expunge_frame, this, active_src); + + break; + case S_IFDIR: + afr_sh_entry_expunge_rmdir (expunge_frame, this, active_src); + break; + default: + gf_log (this->name, GF_LOG_ERROR, + "%s has unknown file type on %s: 0%o", + expunge_local->loc.path, + priv->children[source]->name, type); + goto out; + break; + } + + return 0; +out: + AFR_STACK_DESTROY (expunge_frame); + afr_sh_entry_expunge_entry_done (frame, this, active_src); + + return 0; +} + + +int +afr_sh_entry_expunge_lookup_cbk (call_frame_t *expunge_frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *buf, dict_t *x) +{ + afr_private_t *priv = NULL; + afr_local_t *expunge_local = NULL; + afr_self_heal_t *expunge_sh = NULL; + call_frame_t *frame = NULL; + int active_src = 0; + + priv = this->private; + expunge_local = expunge_frame->local; + expunge_sh = &expunge_local->self_heal; + frame = expunge_sh->sh_frame; + active_src = (long) cookie; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "lookup of %s on %s failed (%s)", + expunge_local->loc.path, + priv->children[active_src]->name, + strerror (op_errno)); + goto out; + } + + afr_sh_entry_expunge_remove (expunge_frame, this, active_src, buf); + + return 0; +out: + AFR_STACK_DESTROY (expunge_frame); + afr_sh_entry_expunge_entry_done (frame, this, active_src); + + return 0; +} + + +int +afr_sh_entry_expunge_purge (call_frame_t *expunge_frame, xlator_t *this, + int active_src) +{ + afr_private_t *priv = NULL; + afr_local_t *expunge_local = NULL; + + priv = this->private; + expunge_local = expunge_frame->local; + + gf_log (this->name, GF_LOG_DEBUG, + "looking up %s on %s", + expunge_local->loc.path, priv->children[active_src]->name); + + STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_lookup_cbk, + (void *) (long) active_src, + priv->children[active_src], + priv->children[active_src]->fops->lookup, + &expunge_local->loc, 0); + + return 0; +} + + +int +afr_sh_entry_expunge_entry_cbk (call_frame_t *expunge_frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *buf, dict_t *x) +{ + afr_private_t *priv = NULL; + afr_local_t *expunge_local = NULL; + afr_self_heal_t *expunge_sh = NULL; + int source = 0; + call_frame_t *frame = NULL; + int active_src = 0; + + + priv = this->private; + expunge_local = expunge_frame->local; + expunge_sh = &expunge_local->self_heal; + frame = expunge_sh->sh_frame; + active_src = expunge_sh->active_source; + source = (long) cookie; + + if (op_ret == -1 && op_errno == ENOENT) { + + gf_log (this->name, GF_LOG_DEBUG, + "missing entry %s on %s", + expunge_local->loc.path, + priv->children[source]->name); + + afr_sh_entry_expunge_purge (expunge_frame, this, active_src); + + return 0; + } + + if (op_ret == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "%s exists under %s", + expunge_local->loc.path, + priv->children[source]->name); + } else { + gf_log (this->name, GF_LOG_ERROR, + "looking up %s under %s failed (%s)", + expunge_local->loc.path, + priv->children[source]->name, + strerror (op_errno)); + } + + AFR_STACK_DESTROY (expunge_frame); + afr_sh_entry_expunge_entry_done (frame, this, active_src); + + return 0; +} + + +int +afr_sh_entry_expunge_entry (call_frame_t *frame, xlator_t *this, + char *name) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int ret = -1; + call_frame_t *expunge_frame = NULL; + afr_local_t *expunge_local = NULL; + afr_self_heal_t *expunge_sh = NULL; + int active_src = 0; + int source = 0; + int op_errno = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + active_src = sh->active_source; + source = sh->source; + + if ((strcmp (name, ".") == 0) + || (strcmp (name, "..") == 0)) { + gf_log (this->name, GF_LOG_DEBUG, + "skipping inspection of %s under %s", + name, local->loc.path); + goto out; + } + + gf_log (this->name, GF_LOG_DEBUG, + "inspecting existance of %s under %s", + name, local->loc.path); + + expunge_frame = copy_frame (frame); + if (!expunge_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (expunge_local, afr_local_t, out); + + expunge_frame->local = expunge_local; + expunge_sh = &expunge_local->self_heal; + expunge_sh->sh_frame = frame; + expunge_sh->active_source = active_src; + + ret = build_child_loc (this, &expunge_local->loc, &local->loc, name); + if (ret != 0) { + goto out; + } + + gf_log (this->name, GF_LOG_DEBUG, + "looking up %s on %s", expunge_local->loc.path, + priv->children[source]->name); + + STACK_WIND_COOKIE (expunge_frame, + afr_sh_entry_expunge_entry_cbk, + (void *) (long) source, + priv->children[source], + priv->children[source]->fops->lookup, + &expunge_local->loc, 0); + + ret = 0; +out: + if (ret == -1) + afr_sh_entry_expunge_entry_done (frame, this, active_src); + + return 0; +} + + +int +afr_sh_entry_expunge_readdir_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + gf_dirent_t *entries) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + gf_dirent_t *entry = NULL; + off_t last_offset = 0; + int active_src = 0; + int entry_count = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + active_src = sh->active_source; + + if (op_ret <= 0) { + if (op_ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "readdir of %s on subvolume %s failed (%s)", + local->loc.path, + priv->children[active_src]->name, + strerror (op_errno)); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "readdir of %s on subvolume %s complete", + local->loc.path, + priv->children[active_src]->name); + } + + afr_sh_entry_expunge_all (frame, this); + return 0; + } + + list_for_each_entry (entry, &entries->list, list) { + last_offset = entry->d_off; + entry_count++; + } + + gf_log (this->name, GF_LOG_DEBUG, + "readdir'ed %d entries from %s", + entry_count, priv->children[active_src]->name); + + sh->offset = last_offset; + local->call_count = entry_count; + + list_for_each_entry (entry, &entries->list, list) { + afr_sh_entry_expunge_entry (frame, this, entry->d_name); + } + + return 0; +} + +int +afr_sh_entry_expunge_subvol (call_frame_t *frame, xlator_t *this, + int active_src) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + STACK_WIND (frame, afr_sh_entry_expunge_readdir_cbk, + priv->children[active_src], + priv->children[active_src]->fops->readdir, + sh->healing_fd, sh->block_size, sh->offset); + + return 0; +} + + +int +afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int active_src = -1; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + sh->offset = 0; + + if (sh->source == -1) { + gf_log (this->name, GF_LOG_DEBUG, + "no active sources for %s to expunge entries", + local->loc.path); + goto out; + } + + active_src = next_active_sink (frame, this, sh->active_source); + sh->active_source = active_src; + + if (sh->op_failed) { + goto out; + } + + if (active_src == -1) { + /* completed creating missing files on all subvolumes */ + goto out; + } + + gf_log (this->name, GF_LOG_DEBUG, + "expunging entries of %s on %s to other sinks", + local->loc.path, priv->children[active_src]->name); + + afr_sh_entry_expunge_subvol (frame, this, active_src); + + return 0; +out: + afr_sh_entry_erase_pending (frame, this); + return 0; + +} + + +int +afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this); + +int +afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this, + int active_src); + +int +afr_sh_entry_impunge_entry_done (call_frame_t *frame, xlator_t *this, + int active_src) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int call_count = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + LOCK (&frame->lock); + { + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + afr_sh_entry_impunge_subvol (frame, this, active_src); + + return 0; +} + + +int +afr_sh_entry_impunge_utimens_cbk (call_frame_t *impunge_frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, struct stat *stbuf) +{ + int call_count = 0; + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + call_frame_t *frame = NULL; + int active_src = 0; + int child_index = 0; + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + frame = impunge_sh->sh_frame; + child_index = (long) cookie; + + if (op_ret == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "utimes set for %s on %s", + impunge_local->loc.path, + priv->children[child_index]->name); + } else { + gf_log (this->name, GF_LOG_ERROR, + "setting utimes of %s on %s failed (%s)", + impunge_local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + } + + LOCK (&impunge_frame->lock); + { + call_count = --impunge_local->call_count; + } + UNLOCK (&impunge_frame->lock); + + if (call_count == 0) { + AFR_STACK_DESTROY (impunge_frame); + afr_sh_entry_impunge_entry_done (frame, this, active_src); + } + + return 0; +} + + +int +afr_sh_entry_impunge_chown_cbk (call_frame_t *impunge_frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, struct stat *stbuf) +{ + int call_count = 0; + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + call_frame_t *frame = NULL; + int active_src = 0; + int child_index = 0; + struct timespec ts[2]; + + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + frame = impunge_sh->sh_frame; + child_index = (long) cookie; + + if (op_ret == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "ownership of %s on %s changed", + impunge_local->loc.path, + priv->children[child_index]->name); + } else { + gf_log (this->name, GF_LOG_ERROR, + "setting ownership of %s on %s failed (%s)", + impunge_local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + goto out; + } + +#ifdef HAVE_STRUCT_STAT_ST_ATIM_TV_NSEC + ts[0] = impunge_local->cont.lookup.buf.st_atim; + ts[1] = impunge_local->cont.lookup.buf.st_mtim; +#elif HAVE_STRUCT_STAT_ST_ATIMESPEC_TV_NSEC + ts[0] = impunge_local->cont.lookup.buf.st_atimespec; + ts[1] = impunge_local->cont.lookup.buf.st_mtimespec; +#else + ts[0].tv_sec = impunge_local->cont.lookup.buf.st_atime; + ts[1].tv_sec = impunge_local->cont.lookup.buf.st_mtime; +#endif + STACK_WIND_COOKIE (impunge_frame, + afr_sh_entry_impunge_utimens_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->utimens, + &impunge_local->loc, ts); + + return 0; + +out: + LOCK (&impunge_frame->lock); + { + call_count = --impunge_local->call_count; + } + UNLOCK (&impunge_frame->lock); + + if (call_count == 0) { + AFR_STACK_DESTROY (impunge_frame); + afr_sh_entry_impunge_entry_done (frame, this, active_src); + } + + return 0; +} + + +int +afr_sh_entry_impunge_newfile_cbk (call_frame_t *impunge_frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *stbuf) +{ + int call_count = 0; + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + call_frame_t *frame = NULL; + int active_src = 0; + int child_index = 0; + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + frame = impunge_sh->sh_frame; + + child_index = (long) cookie; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "creation of %s on %s failed (%s)", + impunge_local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + goto out; + } + + gf_log (this->name, GF_LOG_DEBUG, + "setting ownership of %s on %s to %d/%d", + impunge_local->loc.path, + priv->children[child_index]->name, + impunge_local->cont.lookup.buf.st_uid, + impunge_local->cont.lookup.buf.st_gid); + + inode->st_mode = stbuf->st_mode; + + STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_chown_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->chown, + &impunge_local->loc, + impunge_local->cont.lookup.buf.st_uid, + impunge_local->cont.lookup.buf.st_gid); + return 0; + +out: + LOCK (&impunge_frame->lock); + { + call_count = --impunge_local->call_count; + } + UNLOCK (&impunge_frame->lock); + + if (call_count == 0) { + AFR_STACK_DESTROY (impunge_frame); + afr_sh_entry_impunge_entry_done (frame, this, active_src); + } + + return 0; +} + + +int +afr_sh_entry_impunge_mknod (call_frame_t *impunge_frame, xlator_t *this, + int child_index, struct stat *stbuf) +{ + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + + gf_log (this->name, GF_LOG_WARNING, + "creating file %s mode=0%o dev=0x%"GF_PRI_DEV" on %s", + impunge_local->loc.path, + stbuf->st_mode, stbuf->st_rdev, + priv->children[child_index]->name); + + STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->mknod, + &impunge_local->loc, + stbuf->st_mode, stbuf->st_rdev); + + return 0; +} + + + +int +afr_sh_entry_impunge_mkdir (call_frame_t *impunge_frame, xlator_t *this, + int child_index, struct stat *stbuf) +{ + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + + gf_log (this->name, GF_LOG_WARNING, + "creating directory %s mode=0%o on %s", + impunge_local->loc.path, + stbuf->st_mode, + priv->children[child_index]->name); + + STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->mkdir, + &impunge_local->loc, stbuf->st_mode); + + return 0; +} + + +int +afr_sh_entry_impunge_symlink (call_frame_t *impunge_frame, xlator_t *this, + int child_index, const char *linkname) +{ + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + + gf_log (this->name, GF_LOG_WARNING, + "creating symlink %s -> %s on %s", + impunge_local->loc.path, linkname, + priv->children[child_index]->name); + + STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->symlink, + linkname, &impunge_local->loc); + + return 0; +} + + +int +afr_sh_entry_impunge_readlink_cbk (call_frame_t *impunge_frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + const char *linkname) +{ + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + int child_index = -1; + call_frame_t *frame = NULL; + int call_count = -1; + int active_src = -1; + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + frame = impunge_sh->sh_frame; + active_src = impunge_sh->active_source; + + child_index = (long) cookie; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "readlink of %s on %s failed (%s)", + impunge_local->loc.path, + priv->children[active_src]->name, + strerror (op_errno)); + goto out; + } + + afr_sh_entry_impunge_symlink (impunge_frame, this, child_index, + linkname); + return 0; + +out: + LOCK (&impunge_frame->lock); + { + call_count = --impunge_local->call_count; + } + UNLOCK (&impunge_frame->lock); + + if (call_count == 0) { + AFR_STACK_DESTROY (impunge_frame); + afr_sh_entry_impunge_entry_done (frame, this, active_src); + } + + return 0; +} + + +int +afr_sh_entry_impunge_readlink (call_frame_t *impunge_frame, xlator_t *this, + int child_index, struct stat *stbuf) +{ + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + int active_src = -1; + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + active_src = impunge_sh->active_source; + + STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_readlink_cbk, + (void *) (long) child_index, + priv->children[active_src], + priv->children[active_src]->fops->readlink, + &impunge_local->loc, 4096); + + return 0; +} + + +int +afr_sh_entry_impunge_recreate_lookup_cbk (call_frame_t *impunge_frame, + void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *buf, + dict_t *xattr) +{ + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + int active_src = 0; + int type = 0; + int child_index = 0; + call_frame_t *frame = NULL; + int call_count = 0; + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + frame = impunge_sh->sh_frame; + + child_index = (long) cookie; + + active_src = impunge_sh->active_source; + + if (op_ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "looking up %s on %s (for %s) failed (%s)", + impunge_local->loc.path, + priv->children[active_src]->name, + priv->children[child_index]->name, + strerror (op_errno)); + goto out; + } + + impunge_local->cont.lookup.buf = *buf; + type = (buf->st_mode & S_IFMT); + + switch (type) { + case S_IFSOCK: + case S_IFREG: + case S_IFBLK: + case S_IFCHR: + case S_IFIFO: + afr_sh_entry_impunge_mknod (impunge_frame, this, + child_index, buf); + break; + case S_IFLNK: + afr_sh_entry_impunge_readlink (impunge_frame, this, + child_index, buf); + break; + case S_IFDIR: + afr_sh_entry_impunge_mkdir (impunge_frame, this, + child_index, buf); + break; + default: + gf_log (this->name, GF_LOG_ERROR, + "%s has unknown file type on %s: 0%o", + impunge_local->loc.path, + priv->children[active_src]->name, type); + goto out; + break; + } + + return 0; + +out: + LOCK (&impunge_frame->lock); + { + call_count = --impunge_local->call_count; + } + UNLOCK (&impunge_frame->lock); + + if (call_count == 0) { + AFR_STACK_DESTROY (impunge_frame); + afr_sh_entry_impunge_entry_done (frame, this, active_src); + } + + return 0; +} + + +int +afr_sh_entry_impunge_recreate (call_frame_t *impunge_frame, xlator_t *this, + int child_index) +{ + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + int active_src = 0; + + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + + active_src = impunge_sh->active_source; + + STACK_WIND_COOKIE (impunge_frame, + afr_sh_entry_impunge_recreate_lookup_cbk, + (void *) (long) child_index, + priv->children[active_src], + priv->children[active_src]->fops->lookup, + &impunge_local->loc, 0); + + return 0; +} + + +int +afr_sh_entry_impunge_entry_cbk (call_frame_t *impunge_frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *buf, dict_t *x) +{ + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + int call_count = 0; + int child_index = 0; + call_frame_t *frame = NULL; + int active_src = 0; + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + frame = impunge_sh->sh_frame; + child_index = (long) cookie; + active_src = impunge_sh->active_source; + + if (op_ret == -1 && op_errno == ENOENT) { + /* decrease call_count in recreate-callback */ + gf_log (this->name, GF_LOG_DEBUG, + "missing entry %s on %s", + impunge_local->loc.path, + priv->children[child_index]->name); + + afr_sh_entry_impunge_recreate (impunge_frame, this, + child_index); + return 0; + } + + if (op_ret == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "%s exists under %s", + impunge_local->loc.path, + priv->children[child_index]->name); + } else { + gf_log (this->name, GF_LOG_ERROR, + "looking up %s under %s failed (%s)", + impunge_local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + } + + LOCK (&impunge_frame->lock); + { + call_count = --impunge_local->call_count; + } + UNLOCK (&impunge_frame->lock); + + if (call_count == 0) { + AFR_STACK_DESTROY (impunge_frame); + afr_sh_entry_impunge_entry_done (frame, this, active_src); + } + + return 0; +} + + +int +afr_sh_entry_impunge_entry (call_frame_t *frame, xlator_t *this, + char *name) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int ret = -1; + call_frame_t *impunge_frame = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + int active_src = 0; + int i = 0; + int call_count = 0; + int op_errno = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + active_src = sh->active_source; + + if ((strcmp (name, ".") == 0) + || (strcmp (name, "..") == 0)) { + gf_log (this->name, GF_LOG_DEBUG, + "skipping inspection of %s under %s", + name, local->loc.path); + goto out; + } + + gf_log (this->name, GF_LOG_DEBUG, + "inspecting existance of %s under %s", + name, local->loc.path); + + impunge_frame = copy_frame (frame); + if (!impunge_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (impunge_local, afr_local_t, out); + + impunge_frame->local = impunge_local; + impunge_sh = &impunge_local->self_heal; + impunge_sh->sh_frame = frame; + impunge_sh->active_source = active_src; + + ret = build_child_loc (this, &impunge_local->loc, &local->loc, name); + if (ret != 0) { + goto out; + } + + for (i = 0; i < priv->child_count; i++) { + if (i == active_src) + continue; + if (local->child_up[i] == 0) + continue; + if (sh->sources[i] == 1) + continue; + call_count++; + } + + impunge_local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (i == active_src) + continue; + if (local->child_up[i] == 0) + continue; + if (sh->sources[i] == 1) + continue; + + gf_log (this->name, GF_LOG_DEBUG, + "looking up %s on %s", impunge_local->loc.path, + priv->children[i]->name); + + STACK_WIND_COOKIE (impunge_frame, + afr_sh_entry_impunge_entry_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->lookup, + &impunge_local->loc, 0); + + if (!--call_count) + break; + } + + ret = 0; +out: + if (ret == -1) + afr_sh_entry_impunge_entry_done (frame, this, active_src); + + return 0; +} + + +int +afr_sh_entry_impunge_readdir_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + gf_dirent_t *entries) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + gf_dirent_t *entry = NULL; + off_t last_offset = 0; + int active_src = 0; + int entry_count = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + active_src = sh->active_source; + + if (op_ret <= 0) { + if (op_ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "readdir of %s on subvolume %s failed (%s)", + local->loc.path, + priv->children[active_src]->name, + strerror (op_errno)); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "readdir of %s on subvolume %s complete", + local->loc.path, + priv->children[active_src]->name); + } + + afr_sh_entry_impunge_all (frame, this); + return 0; + } + + list_for_each_entry (entry, &entries->list, list) { + last_offset = entry->d_off; + entry_count++; + } + + gf_log (this->name, GF_LOG_DEBUG, + "readdir'ed %d entries from %s", + entry_count, priv->children[active_src]->name); + + sh->offset = last_offset; + local->call_count = entry_count; + + list_for_each_entry (entry, &entries->list, list) { + afr_sh_entry_impunge_entry (frame, this, entry->d_name); + } + + return 0; +} + + +int +afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this, + int active_src) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + STACK_WIND (frame, afr_sh_entry_impunge_readdir_cbk, + priv->children[active_src], + priv->children[active_src]->fops->readdir, + sh->healing_fd, sh->block_size, sh->offset); + + return 0; +} + + +int +afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int active_src = -1; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + sh->offset = 0; + + active_src = next_active_source (frame, this, sh->active_source); + sh->active_source = active_src; + + if (sh->op_failed) { + afr_sh_entry_finish (frame, this); + return 0; + } + + if (active_src == -1) { + /* completed creating missing files on all subvolumes */ + afr_sh_entry_expunge_all (frame, this); + return 0; + } + + gf_log (this->name, GF_LOG_DEBUG, + "impunging entries of %s on %s to other sinks", + local->loc.path, priv->children[active_src]->name); + + afr_sh_entry_impunge_subvol (frame, this, active_src); + + return 0; +} + + +int +afr_sh_entry_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int child_index = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + child_index = (long) cookie; + + /* TODO: some of the open's might fail. + In that case, modify cleanup fn to send flush on those + fd's which are already open */ + + LOCK (&frame->lock); + { + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "opendir of %s failed on child %s (%s)", + local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + sh->op_failed = 1; + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + if (sh->op_failed) { + afr_sh_entry_finish (frame, this); + return 0; + } + gf_log (this->name, GF_LOG_DEBUG, + "fd for %s opened, commencing sync", + local->loc.path); + + sh->active_source = -1; + afr_sh_entry_impunge_all (frame, this); + } + + return 0; +} + + +int +afr_sh_entry_open (call_frame_t *frame, xlator_t *this) +{ + int i = 0; + int call_count = 0; + + int source = -1; + int *sources = NULL; + + fd_t *fd = NULL; + + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + afr_self_heal_t *sh = NULL; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + source = local->self_heal.source; + sources = local->self_heal.sources; + + sh->block_size = 131072; + sh->offset = 0; + + call_count = sh->active_sinks; + if (source != -1) + call_count++; + + local->call_count = call_count; + + fd = fd_create (local->loc.inode, frame->root->pid); + sh->healing_fd = fd; + + if (source != -1) { + gf_log (this->name, GF_LOG_DEBUG, + "opening directory %s on subvolume %s (source)", + local->loc.path, priv->children[source]->name); + + /* open source */ + STACK_WIND_COOKIE (frame, afr_sh_entry_opendir_cbk, + (void *) (long) source, + priv->children[source], + priv->children[source]->fops->opendir, + &local->loc, fd); + call_count--; + } + + /* open sinks */ + for (i = 0; i < priv->child_count; i++) { + if (sources[i] || !local->child_up[i]) + continue; + + gf_log (this->name, GF_LOG_DEBUG, + "opening directory %s on subvolume %s (sink)", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_sh_entry_opendir_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->opendir, + &local->loc, fd); + + if (!--call_count) + break; + } + + return 0; +} + + +int +afr_sh_entry_sync_prepare (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int active_sinks = 0; + int source = 0; + int i = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + source = sh->source; + + for (i = 0; i < priv->child_count; i++) { + if (sh->sources[i] == 0 && local->child_up[i] == 1) { + active_sinks++; + sh->success[i] = 1; + } + } + if (source != -1) + sh->success[source] = 1; + + if (active_sinks == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "no active sinks for self-heal on dir %s", + local->loc.path); + afr_sh_entry_finish (frame, this); + return 0; + } + if (source == -1 && active_sinks < 2) { + gf_log (this->name, GF_LOG_WARNING, + "cannot sync with 0 sources and 1 sink on dir %s", + local->loc.path); + afr_sh_entry_finish (frame, this); + return 0; + } + sh->active_sinks = active_sinks; + + if (source != -1) + gf_log (this->name, GF_LOG_DEBUG, + "syncing %s from subvolume %s to %d active sinks", + local->loc.path, priv->children[source]->name, + active_sinks); + else + gf_log (this->name, GF_LOG_DEBUG, + "no active sources for %s found. " + "merging all entries as a conservative decision", + local->loc.path); + + afr_sh_entry_open (frame, this); + + return 0; +} + + +int +afr_sh_entry_fix (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int source = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + afr_sh_build_pending_matrix (sh->pending_matrix, sh->xattr, + priv->child_count, AFR_ENTRY_PENDING); + + afr_sh_print_pending_matrix (sh->pending_matrix, this); + + + afr_sh_mark_sources (sh->pending_matrix, sh->sources, + priv->child_count); + + afr_sh_supress_errenous_children (sh->sources, sh->child_errno, + priv->child_count); + + source = afr_sh_select_source (sh->sources, priv->child_count); + sh->source = source; + + afr_sh_entry_sync_prepare (frame, this); + + return 0; +} + + + +int +afr_sh_entry_lookup_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *buf, dict_t *xattr) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + + int call_count = -1; + int child_index = (long) cookie; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + LOCK (&frame->lock); + { + if (op_ret != -1) { + sh->xattr[child_index] = dict_ref (xattr); + sh->buf[child_index] = *buf; + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + afr_sh_entry_fix (frame, this); + } + + return 0; +} + + + +int +afr_sh_entry_lookup (call_frame_t *frame, xlator_t *this) +{ + afr_self_heal_t * sh = NULL; + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + dict_t *xattr_req = NULL; + int ret = 0; + int call_count = 0; + int i = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + call_count = local->child_count; + + local->call_count = call_count; + + xattr_req = dict_new(); + if (xattr_req) + ret = dict_set_uint64 (xattr_req, AFR_ENTRY_PENDING, + priv->child_count * sizeof(int32_t)); + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, + afr_sh_entry_lookup_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->lookup, + &local->loc, xattr_req); + if (!--call_count) + break; + } + } + + if (xattr_req) + dict_unref (xattr_req); + + return 0; +} + + + +int +afr_sh_entry_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int call_count = 0; + int child_index = (long) cookie; + + /* TODO: what if lock fails? */ + + local = frame->local; + sh = &local->self_heal; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + sh->op_failed = 1; + + gf_log (this->name, + (op_errno == EAGAIN ? GF_LOG_DEBUG : GF_LOG_ERROR), + "locking inode of %s on child %d failed: %s", + local->loc.path, child_index, + strerror (op_errno)); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "inode of %s on child %d locked", + local->loc.path, child_index); + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + if (sh->op_failed == 1) { + afr_sh_entry_finish (frame, this); + return 0; + } + + afr_sh_entry_lookup (frame, this); + } + + return 0; +} + + +int +afr_sh_entry_lock (call_frame_t *frame, xlator_t *this) +{ + int i = 0; + int call_count = 0; + + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + afr_self_heal_t * sh = NULL; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + call_count = local->child_count; + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + gf_log (this->name, GF_LOG_DEBUG, + "locking %s on subvolume %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_sh_entry_lock_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->entrylk, + &local->loc, NULL, + ENTRYLK_LOCK_NB, ENTRYLK_WRLCK); + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_self_heal_entry (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + if (local->need_entry_self_heal && priv->entry_self_heal) { + afr_sh_entry_lock (frame, this); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "proceeding to completion on %s", + local->loc.path); + afr_sh_entry_done (frame, this); + } + + return 0; +} + diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c new file mode 100644 index 000000000..e65a426db --- /dev/null +++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c @@ -0,0 +1,791 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "afr.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" +#include "byte-order.h" + +#include "afr-transaction.h" +#include "afr-self-heal.h" +#include "afr-self-heal-common.h" + + +int +afr_sh_metadata_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int i = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + +// memset (sh->child_errno, 0, sizeof (int) * priv->child_count); + memset (sh->buf, 0, sizeof (struct stat) * priv->child_count); + memset (sh->success, 0, sizeof (int) * priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + if (sh->xattr[i]) + dict_unref (sh->xattr[i]); + sh->xattr[i] = NULL; + } + + if (local->govinda_gOvinda) { + gf_log (this->name, GF_LOG_WARNING, + "aborting selfheal of %s", + local->loc.path); + sh->completion_cbk (frame, this); + } else { + if (S_ISREG (local->cont.lookup.buf.st_mode)) { + gf_log (this->name, GF_LOG_DEBUG, + "proceeding to data check on %s", + local->loc.path); + afr_self_heal_data (frame, this); + return 0; + } + + if (S_ISDIR (local->cont.lookup.buf.st_mode)) { + gf_log (this->name, GF_LOG_DEBUG, + "proceeding to entry check on %s", + local->loc.path); + afr_self_heal_entry (frame, this); + return 0; + } + gf_log (this->name, GF_LOG_DEBUG, + "completed self heal of %s", + local->loc.path); + + sh->completion_cbk (frame, this); + } + + return 0; +} + + +int +afr_sh_metadata_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local = NULL; + int call_count = 0; + + + local = frame->local; + + LOCK (&frame->lock); + { + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + afr_sh_metadata_done (frame, this); + + return 0; +} + + +int +afr_sh_metadata_finish (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int i = 0; + int call_count = 0; + struct flock flock = {0, }; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + call_count = local->child_count; + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + flock.l_start = 0; + flock.l_len = 0; + flock.l_type = F_UNLCK; + + if (local->child_up[i]) { + gf_log (this->name, GF_LOG_DEBUG, + "unlocking %s on subvolume %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND (frame, afr_sh_metadata_unlck_cbk, + priv->children[i], + priv->children[i]->fops->inodelk, + &local->loc, F_SETLK, &flock); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_sh_metadata_erase_pending_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xattr) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + LOCK (&frame->lock); + { + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + afr_sh_metadata_finish (frame, this); + + return 0; +} + + +int +afr_sh_metadata_erase_pending (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int i = 0; + dict_t **erase_xattr = NULL; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + + afr_sh_pending_to_delta (sh->pending_matrix, sh->delta_matrix, + sh->success, priv->child_count); + + erase_xattr = CALLOC (sizeof (*erase_xattr), priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + if (sh->xattr[i]) { + call_count++; + + erase_xattr[i] = get_new_dict(); + dict_ref (erase_xattr[i]); + } + } + + afr_sh_delta_to_xattr (sh->delta_matrix, erase_xattr, + priv->child_count, AFR_METADATA_PENDING); + + local->call_count = call_count; + + if (call_count == 0) { + gf_log (this->name, GF_LOG_WARNING, + "metadata of %s not healed on any subvolume", + local->loc.path); + + afr_sh_metadata_finish (frame, this); + } + + for (i = 0; i < priv->child_count; i++) { + if (!erase_xattr[i]) + continue; + + gf_log (this->name, GF_LOG_DEBUG, + "erasing pending flags from %s on %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_sh_metadata_erase_pending_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->loc, + GF_XATTROP_ADD_ARRAY, erase_xattr[i]); + if (!--call_count) + break; + } + + for (i = 0; i < priv->child_count; i++) { + if (erase_xattr[i]) { + dict_unref (erase_xattr[i]); + } + } + FREE (erase_xattr); + + return 0; +} + + +int +afr_sh_metadata_sync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int child_index = 0; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + child_index = (long) cookie; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "setting attributes failed for %s on %s (%s)", + local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + + sh->success[child_index] = 0; + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + afr_sh_metadata_erase_pending (frame, this); + + return 0; +} + + +int +afr_sh_metadata_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno); + + return 0; +} + + +int +afr_sh_metadata_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno); + + return 0; +} + + +int +afr_sh_metadata_sync (call_frame_t *frame, xlator_t *this, dict_t *xattr) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int source = 0; + int active_sinks = 0; + int call_count = 0; + int i = 0; + struct timespec ts[2]; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + source = sh->source; + active_sinks = sh->active_sinks; + + /* + * 4 calls per sink - chown, chmod, utimes, setxattr + */ + if (xattr) + call_count = active_sinks * 4; + else + call_count = active_sinks * 3; + + local->call_count = call_count; + +#ifdef HAVE_STRUCT_STAT_ST_ATIM_TV_NSEC + ts[0] = sh->buf[source].st_atim; + ts[1] = sh->buf[source].st_mtim; +#elif HAVE_STRUCT_STAT_ST_ATIMESPEC_TV_NSEC + ts[0] = sh->buf[source].st_atimespec; + ts[1] = sh->buf[source].st_mtimespec; +#else + ts[0].tv_sec = sh->buf[source].st_atime; + ts[1].tv_sec = sh->buf[source].st_mtime; +#endif + + for (i = 0; i < priv->child_count; i++) { + if (call_count == 0) { + break; + } + if (sh->sources[i] || !local->child_up[i]) + continue; + + gf_log (this->name, GF_LOG_DEBUG, + "syncing metadata of %s from %s to %s", + local->loc.path, priv->children[source]->name, + priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_sh_metadata_attr_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->chown, + &local->loc, + sh->buf[source].st_uid, + sh->buf[source].st_gid); + + STACK_WIND_COOKIE (frame, afr_sh_metadata_attr_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->chmod, + &local->loc, sh->buf[source].st_mode); + + STACK_WIND_COOKIE (frame, afr_sh_metadata_attr_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->utimens, + &local->loc, ts); + + call_count = call_count - 3; + + if (!xattr) + continue; + + STACK_WIND_COOKIE (frame, afr_sh_metadata_xattr_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->setxattr, + &local->loc, xattr, 0); + call_count--; + } + + return 0; +} + + +int +afr_sh_metadata_getxattr_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xattr) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int source = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + source = sh->source; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "getxattr of %s failed on subvolume %s (%s). proceeding without xattr", + local->loc.path, priv->children[source]->name, + strerror (op_errno)); + + afr_sh_metadata_sync (frame, this, NULL); + } else { + dict_del (xattr, AFR_DATA_PENDING); + dict_del (xattr, AFR_METADATA_PENDING); + dict_del (xattr, AFR_ENTRY_PENDING); + afr_sh_metadata_sync (frame, this, xattr); + } + + return 0; +} + + +int +afr_sh_metadata_sync_prepare (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int active_sinks = 0; + int source = 0; + int i = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + source = sh->source; + + for (i = 0; i < priv->child_count; i++) { + if (sh->sources[i] == 0 && local->child_up[i] == 1) { + active_sinks++; + sh->success[i] = 1; + } + } + sh->success[source] = 1; + + if (active_sinks == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "no active sinks for performing self-heal on file %s", + local->loc.path); + afr_sh_metadata_finish (frame, this); + return 0; + } + sh->active_sinks = active_sinks; + + gf_log (this->name, GF_LOG_DEBUG, + "syncing metadata of %s from subvolume %s to %d active sinks", + local->loc.path, priv->children[source]->name, active_sinks); + + STACK_WIND (frame, afr_sh_metadata_getxattr_cbk, + priv->children[source], + priv->children[source]->fops->getxattr, + &local->loc, NULL); + + return 0; +} + + +int +afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int nsources = 0; + int source = 0; + int i = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + afr_sh_build_pending_matrix (sh->pending_matrix, sh->xattr, + priv->child_count, AFR_METADATA_PENDING); + + afr_sh_print_pending_matrix (sh->pending_matrix, this); + + afr_sh_mark_sources (sh->pending_matrix, sh->sources, + priv->child_count); + + afr_sh_supress_errenous_children (sh->sources, sh->child_errno, + priv->child_count); + + nsources = afr_sh_source_count (sh->sources, priv->child_count); + + if ((nsources == 0) + && (priv->favorite_child != -1) + && (sh->child_errno[priv->favorite_child] == 0)) { + + gf_log (this->name, GF_LOG_WARNING, + "Picking favorite child %s as authentic source to resolve conflicting metadata of %s", + priv->children[priv->favorite_child]->name, + local->loc.path); + + sh->sources[priv->favorite_child] = 1; + + nsources = afr_sh_source_count (sh->sources, + priv->child_count); + } + + if (nsources == 0) { + gf_log (this->name, GF_LOG_ERROR, + "Unable to resolve conflicting metadata of %s. " + "Please resolve manually by fixing the " + "permissions/ownership of %s on your subvolumes. " + "You can also consider 'option favorite-child <>'", + local->loc.path, local->loc.path); + + local->govinda_gOvinda = 1; + + afr_sh_metadata_finish (frame, this); + return 0; + } + + source = afr_sh_select_source (sh->sources, priv->child_count); + sh->source = source; + + /* detect changes not visible through pending flags -- JIC */ + for (i = 0; i < priv->child_count; i++) { + if (i == source || sh->child_errno[i]) + continue; + + if (PERMISSION_DIFFERS (&sh->buf[i], &sh->buf[source])) + sh->sources[i] = 0; + + if (OWNERSHIP_DIFFERS (&sh->buf[i], &sh->buf[source])) + sh->sources[i] = 0; + } + + afr_sh_metadata_sync_prepare (frame, this); + + return 0; +} + + +int +afr_sh_metadata_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *buf, dict_t *xattr) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int child_index = 0; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + child_index = (long) cookie; + + LOCK (&frame->lock); + { + if (op_ret == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "path %s on subvolume %s is of mode 0%o", + local->loc.path, + priv->children[child_index]->name, + buf->st_mode); + + sh->buf[child_index] = *buf; + if (xattr) + sh->xattr[child_index] = dict_ref (xattr); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "path %s on subvolume %s => -1 (%s)", + local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + + sh->child_errno[child_index] = op_errno; + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + afr_sh_metadata_fix (frame, this); + + return 0; +} + + +int +afr_sh_metadata_lookup (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int i = 0; + int call_count = 0; + dict_t *xattr_req = NULL; + int ret = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + call_count = local->child_count; + local->call_count = call_count; + + xattr_req = dict_new(); + + if (xattr_req) + ret = dict_set_uint64 (xattr_req, AFR_METADATA_PENDING, + priv->child_count * sizeof(int32_t)); + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + gf_log (this->name, GF_LOG_DEBUG, + "looking up %s on %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_sh_metadata_lookup_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->lookup, + &local->loc, xattr_req); + if (!--call_count) + break; + } + } + + if (xattr_req) + dict_unref (xattr_req); + + return 0; +} + + +int +afr_sh_metadata_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int child_index = (long) cookie; + + /* TODO: what if lock fails? */ + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + sh->op_failed = 1; + + gf_log (this->name, + (op_errno == EAGAIN ? GF_LOG_DEBUG : GF_LOG_ERROR), + "locking of %s on child %d failed: %s", + local->loc.path, child_index, + strerror (op_errno)); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "inode of %s on child %d locked", + local->loc.path, child_index); + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + if (sh->op_failed) { + afr_sh_metadata_finish (frame, this); + return 0; + } + + afr_sh_metadata_lookup (frame, this); + } + + return 0; +} + + +int +afr_sh_metadata_lock (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int i = 0; + int call_count = 0; + struct flock flock = {0, }; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + call_count = local->child_count; + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + flock.l_start = 0; + flock.l_len = 0; + flock.l_type = F_WRLCK; + + if (local->child_up[i]) { + gf_log (this->name, GF_LOG_DEBUG, + "locking %s on subvolume %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_sh_metadata_lk_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->inodelk, + &local->loc, F_SETLK, &flock); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_self_heal_metadata (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = this->private; + + + local = frame->local; + sh = &local->self_heal; + + if (local->need_metadata_self_heal && priv->metadata_self_heal) { + afr_sh_metadata_lock (frame, this); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "proceeding to data check on %s", + local->loc.path); + afr_sh_metadata_done (frame, this); + } + + return 0; +} + diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h new file mode 100644 index 000000000..1c97a9bc1 --- /dev/null +++ b/xlators/cluster/afr/src/afr-self-heal.h @@ -0,0 +1,52 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef __AFR_SELF_HEAL_H__ +#define __AFR_SELF_HEAL_H__ + +#include <sys/stat.h> + +#define FILETYPE_DIFFERS(buf1,buf2) ((S_IFMT & ((struct stat *)buf1)->st_mode) != (S_IFMT & ((struct stat *)buf2)->st_mode)) +#define PERMISSION_DIFFERS(buf1,buf2) ((((struct stat *)buf1)->st_mode) != (((struct stat *)buf2)->st_mode)) +#define OWNERSHIP_DIFFERS(buf1,buf2) ((((struct stat *)buf1)->st_uid) != (((struct stat *)buf2)->st_uid) || (((struct stat *)buf1)->st_gid != (((struct stat *)buf2)->st_gid))) +#define SIZE_DIFFERS(buf1,buf2) ((((struct stat *)buf1)->st_size) != (((struct stat *)buf2)->st_size)) + + + +int +afr_sh_has_metadata_pending (dict_t *xattr, int child_count, xlator_t *this); +int +afr_sh_has_entry_pending (dict_t *xattr, int child_count, xlator_t *this); +int +afr_sh_has_data_pending (dict_t *xattr, int child_count, xlator_t *this); + +int +afr_self_heal_entry (call_frame_t *frame, xlator_t *this); + +int +afr_self_heal_data (call_frame_t *frame, xlator_t *this); + +int +afr_self_heal_metadata (call_frame_t *frame, xlator_t *this); + +int +afr_self_heal (call_frame_t *frame, xlator_t *this, + int (*completion_cbk) (call_frame_t *, xlator_t *)); + +#endif /* __AFR_SELF_HEAL_H__ */ diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c new file mode 100644 index 000000000..3df9f07e5 --- /dev/null +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -0,0 +1,957 @@ +/* + Copyright (c) 2007, 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#include "dict.h" +#include "byte-order.h" + +#include "afr.h" +#include "afr-transaction.h" + +#include <signal.h> + + +static void +__mark_all_pending (int32_t *pending, int child_count) +{ + int i; + + for (i = 0; i < child_count; i++) + pending[i] = hton32 (1); +} + + +static void +__mark_child_dead (int32_t *pending, int child_count, int child) +{ + pending[child] = 0; +} + + +static void +__mark_down_children (int32_t *pending, int child_count, unsigned char *child_up) +{ + int i; + + for (i = 0; i < child_count; i++) + if (!child_up[i]) + pending[i] = 0; +} + + +static void +__mark_all_success (int32_t *pending, int child_count) +{ + int i; + + for (i = 0; i < child_count; i++) + pending[i] = hton32 (-1); +} + + +static int +__is_first_write_on_fd (xlator_t *this, fd_t *fd) +{ + int op_ret = 0; + int _ret = -1; + + _ret = fd_ctx_get (fd, this, NULL); + if (_ret < 0) { + gf_log (this->name, GF_LOG_DEBUG, + "first writev() on fd=%p, writing changelog", + fd); + + _ret = fd_ctx_set (fd, this, 0xaf1); + op_ret = 1; + } + + return op_ret; +} + + +static int +__changelog_enabled (afr_private_t *priv, afr_transaction_type type) +{ + int ret = 0; + + switch (type) { + case AFR_DATA_TRANSACTION: + if (priv->data_change_log) + ret = 1; + + break; + + case AFR_METADATA_TRANSACTION: + if (priv->metadata_change_log) + ret = 1; + + break; + + case AFR_ENTRY_TRANSACTION: + case AFR_ENTRY_RENAME_TRANSACTION: + if (priv->entry_change_log) + ret = 1; + + break; + + case AFR_FLUSH_TRANSACTION: + ret = 1; + } + + return ret; +} + + +static int +__changelog_needed_pre_op (call_frame_t *frame, xlator_t *this) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + fd_t * fd = NULL; + + int op_ret = 0; + + priv = this->private; + local = frame->local; + + if (__changelog_enabled (priv, local->transaction.type)) { + switch (local->op) { + + case GF_FOP_WRITE: + case GF_FOP_FTRUNCATE: + /* + if it's a data transaction, we write the changelog + only on the first write on an fd + */ + + fd = local->fd; + if (!fd || __is_first_write_on_fd (this, fd)) + op_ret = 1; + + break; + + case GF_FOP_FLUSH: + /* only do post-op on flush() */ + + op_ret = 0; + break; + + default: + op_ret = 1; + } + } + + return op_ret; +} + + +static int +__changelog_needed_post_op (call_frame_t *frame, xlator_t *this) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + + int ret = 0; + afr_transaction_type type = -1; + + priv = this->private; + local = frame->local; + type = local->transaction.type; + + if (__changelog_enabled (priv, type) + && (local->op != GF_FOP_WRITE) + && (local->op != GF_FOP_FTRUNCATE)) + ret = 1; + + return ret; +} + + +static int +afr_lock_server_count (afr_private_t *priv, afr_transaction_type type) +{ + int ret = 0; + + switch (type) { + case AFR_FLUSH_TRANSACTION: + case AFR_DATA_TRANSACTION: + ret = priv->data_lock_server_count; + break; + + case AFR_METADATA_TRANSACTION: + ret = priv->metadata_lock_server_count; + break; + + case AFR_ENTRY_TRANSACTION: + case AFR_ENTRY_RENAME_TRANSACTION: + ret = priv->entry_lock_server_count; + break; + } + + return ret; +} + + +/* {{{ unlock */ + +int32_t +afr_unlock_common_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local; + int call_count = 0; + + local = frame->local; + + LOCK (&frame->lock); + { + call_count = --local->call_count; + } + UNLOCK (&frame->lock); + + if (call_count == 0) { + local->transaction.done (frame, this); + } + + return 0; +} + + +int +afr_unlock (call_frame_t *frame, xlator_t *this) +{ + struct flock flock; + + int i = 0; + int call_count = 0; + + afr_local_t *local = NULL; + afr_private_t * priv = this->private; + + local = frame->local; + + call_count = afr_locked_nodes_count (local->transaction.locked_nodes, + priv->child_count); + + if (call_count == 0) { + local->transaction.done (frame, this); + return 0; + } + + if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) + call_count *= 2; + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + flock.l_start = local->transaction.start; + flock.l_len = local->transaction.len; + flock.l_type = F_UNLCK; + + if (local->transaction.locked_nodes[i]) { + switch (local->transaction.type) { + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: + case AFR_FLUSH_TRANSACTION: + + if (local->fd) { + STACK_WIND (frame, afr_unlock_common_cbk, + priv->children[i], + priv->children[i]->fops->finodelk, + local->fd, F_SETLK, &flock); + } else { + STACK_WIND (frame, afr_unlock_common_cbk, + priv->children[i], + priv->children[i]->fops->inodelk, + &local->loc, F_SETLK, &flock); + } + + break; + + case AFR_ENTRY_RENAME_TRANSACTION: + + STACK_WIND (frame, afr_unlock_common_cbk, + priv->children[i], + priv->children[i]->fops->entrylk, + &local->transaction.new_parent_loc, + local->transaction.new_basename, + ENTRYLK_UNLOCK, ENTRYLK_WRLCK); + + call_count--; + + /* fall through */ + + case AFR_ENTRY_TRANSACTION: + if (local->fd) { + STACK_WIND (frame, afr_unlock_common_cbk, + priv->children[i], + priv->children[i]->fops->fentrylk, + local->fd, + local->transaction.basename, + ENTRYLK_UNLOCK, ENTRYLK_WRLCK); + } else { + STACK_WIND (frame, afr_unlock_common_cbk, + priv->children[i], + priv->children[i]->fops->entrylk, + &local->transaction.parent_loc, + local->transaction.basename, + ENTRYLK_UNLOCK, ENTRYLK_WRLCK); + + } + break; + } + + if (!--call_count) + break; + } + } + + return 0; +} + +/* }}} */ + + +/* {{{ pending */ + +int32_t +afr_changelog_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xattr) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + + int call_count = -1; + + priv = this->private; + local = frame->local; + + LOCK (&frame->lock); + { + call_count = --local->call_count; + } + UNLOCK (&frame->lock); + + if (call_count == 0) { + if (afr_lock_server_count (priv, local->transaction.type) == 0) { + local->transaction.done (frame, this); + } else { + afr_unlock (frame, this); + } + } + + return 0; +} + + +int +afr_changelog_post_op (call_frame_t *frame, xlator_t *this) +{ + afr_private_t * priv = this->private; + + int ret = 0; + int i = 0; + int call_count = 0; + + afr_local_t * local = NULL; + dict_t * xattr = dict_ref (get_new_dict ()); + + local = frame->local; + + __mark_all_success (local->pending_array, priv->child_count); + __mark_down_children (local->pending_array, priv->child_count, local->child_up); + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) { + call_count *= 2; + } + + local->call_count = call_count; + + if (call_count == 0) { + /* no child is up */ + dict_unref (xattr); + afr_unlock (frame, this); + return 0; + } + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + ret = dict_set_static_bin (xattr, local->transaction.pending, + local->pending_array, + priv->child_count * sizeof (int32_t)); + if (ret < 0) + gf_log (this->name, GF_LOG_ERROR, + "failed to set pending entry"); + + + switch (local->transaction.type) { + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: + case AFR_FLUSH_TRANSACTION: + { + if (local->fd) + STACK_WIND (frame, afr_changelog_post_op_cbk, + priv->children[i], + priv->children[i]->fops->fxattrop, + local->fd, + GF_XATTROP_ADD_ARRAY, xattr); + else + STACK_WIND (frame, afr_changelog_post_op_cbk, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->loc, + GF_XATTROP_ADD_ARRAY, xattr); + } + break; + + case AFR_ENTRY_RENAME_TRANSACTION: + { + STACK_WIND_COOKIE (frame, afr_changelog_post_op_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->transaction.new_parent_loc, + GF_XATTROP_ADD_ARRAY, xattr); + + call_count--; + } + + /* + set it again because previous stack_wind + might have already returned (think of case + where subvolume is posix) and would have + used the dict as placeholder for return + value + */ + ret = dict_set_static_bin (xattr, local->transaction.pending, + local->pending_array, + priv->child_count * sizeof (int32_t)); + if (ret < 0) + gf_log (this->name, GF_LOG_ERROR, + "failed to set pending entry"); + + /* fall through */ + + case AFR_ENTRY_TRANSACTION: + { + if (local->fd) + STACK_WIND (frame, afr_changelog_post_op_cbk, + priv->children[i], + priv->children[i]->fops->fxattrop, + local->fd, + GF_XATTROP_ADD_ARRAY, xattr); + else + STACK_WIND (frame, afr_changelog_post_op_cbk, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->transaction.parent_loc, + GF_XATTROP_ADD_ARRAY, xattr); + } + break; + } + + if (!--call_count) + break; + } + } + + dict_unref (xattr); + return 0; +} + + +int32_t +afr_changelog_pre_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xattr) +{ + afr_local_t * local = NULL; + afr_private_t * priv = this->private; + loc_t * loc = NULL; + + int call_count = -1; + int child_index = (long) cookie; + + local = frame->local; + loc = &local->loc; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + local->child_up[child_index] = 0; + + if (op_errno == ENOTSUP) { + gf_log (this->name, GF_LOG_ERROR, + "xattrop not supported by %s", + priv->children[child_index]->name); + local->op_ret = -1; + } else if (!child_went_down (op_ret, op_errno)) { + gf_log (this->name, GF_LOG_ERROR, + "xattrop failed on child %s: %s", + priv->children[child_index]->name, + strerror (op_errno)); + } + local->op_errno = op_errno; + } + + call_count = --local->call_count; + } + UNLOCK (&frame->lock); + + if (call_count == 0) { + if ((local->op_ret == -1) && + (local->op_errno == ENOTSUP)) { + local->transaction.resume (frame, this); + } else { + local->transaction.fop (frame, this); + } + } + + return 0; +} + + +int +afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) +{ + afr_private_t * priv = this->private; + + int i = 0; + int ret = 0; + int call_count = 0; + dict_t *xattr = NULL; + + afr_local_t *local = NULL; + + local = frame->local; + xattr = get_new_dict (); + dict_ref (xattr); + + call_count = afr_up_children_count (priv->child_count, + local->child_up); + + if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) { + call_count *= 2; + } + + if (call_count == 0) { + /* no child is up */ + dict_unref (xattr); + afr_unlock (frame, this); + return 0; + } + + local->call_count = call_count; + + __mark_all_pending (local->pending_array, priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + ret = dict_set_static_bin (xattr, + local->transaction.pending, + local->pending_array, + (priv->child_count * + sizeof (int32_t))); + if (ret < 0) + gf_log (this->name, GF_LOG_ERROR, + "failed to set pending entry"); + + + switch (local->transaction.type) { + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: + case AFR_FLUSH_TRANSACTION: + { + if (local->fd) + STACK_WIND_COOKIE (frame, + afr_changelog_pre_op_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->fxattrop, + local->fd, + GF_XATTROP_ADD_ARRAY, xattr); + else + STACK_WIND_COOKIE (frame, + afr_changelog_pre_op_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + &(local->loc), + GF_XATTROP_ADD_ARRAY, xattr); + } + break; + + case AFR_ENTRY_RENAME_TRANSACTION: + { + STACK_WIND_COOKIE (frame, + afr_changelog_pre_op_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->transaction.new_parent_loc, + GF_XATTROP_ADD_ARRAY, xattr); + + call_count--; + } + + + /* + set it again because previous stack_wind + might have already returned (think of case + where subvolume is posix) and would have + used the dict as placeholder for return + value + */ + + ret = dict_set_static_bin (xattr, local->transaction.pending, + local->pending_array, + priv->child_count * sizeof (int32_t)); + if (ret < 0) + gf_log (this->name, GF_LOG_ERROR, + "failed to set pending entry"); + + /* fall through */ + + case AFR_ENTRY_TRANSACTION: + { + if (local->fd) + STACK_WIND_COOKIE (frame, + afr_changelog_pre_op_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->fxattrop, + local->fd, + GF_XATTROP_ADD_ARRAY, xattr); + else + STACK_WIND_COOKIE (frame, + afr_changelog_pre_op_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->transaction.parent_loc, + GF_XATTROP_ADD_ARRAY, xattr); + } + + break; + } + + if (!--call_count) + break; + } + } + + dict_unref (xattr); + return 0; +} + +/* }}} */ + +/* {{{ lock */ + +static +int afr_lock_rec (call_frame_t *frame, xlator_t *this, int child_index); + +int32_t +afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + int done = 0; + int child_index = (long) cookie; + + int call_count = 0; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) { + /* wait for the other lock to return */ + call_count = --local->call_count; + } + + if (op_ret == -1) { + if (op_errno == ENOSYS) { + /* return ENOTSUP */ + gf_log (this->name, GF_LOG_ERROR, + "subvolume does not support locking. " + "please load features/posix-locks xlator on server"); + local->op_ret = op_ret; + done = 1; + } + + local->child_up[child_index] = 0; + local->op_errno = op_errno; + } + } + UNLOCK (&frame->lock); + + if (call_count == 0) { + if ((local->op_ret == -1) && + (local->op_errno == ENOSYS)) { + afr_unlock (frame, this); + } else { + local->transaction.locked_nodes[child_index] = 1; + local->transaction.lock_count++; + afr_lock_rec (frame, this, child_index + 1); + } + } + + return 0; +} + + +static loc_t * +lower_path (loc_t *l1, const char *b1, loc_t *l2, const char *b2) +{ + int ret = 0; + + ret = strcmp (l1->path, l2->path); + + if (ret == 0) + ret = strcmp (b1, b2); + + if (ret <= 0) + return l1; + else + return l2; +} + + +static +int afr_lock_rec (call_frame_t *frame, xlator_t *this, int child_index) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + struct flock flock; + + loc_t * lower = NULL; + loc_t * higher = NULL; + + const char *lower_name = NULL; + const char *higher_name = NULL; + + local = frame->local; + priv = this->private; + + flock.l_start = local->transaction.start; + flock.l_len = local->transaction.len; + flock.l_type = F_WRLCK; + + /* skip over children that are down */ + while ((child_index < priv->child_count) + && !local->child_up[child_index]) + child_index++; + + if ((child_index == priv->child_count) && + local->transaction.lock_count == 0) { + + gf_log (this->name, GF_LOG_DEBUG, + "unable to lock on even one child"); + + local->op_ret = -1; + local->op_errno = EAGAIN; + + local->transaction.done (frame, this); + + return 0; + + } + + if ((child_index == priv->child_count) + || (local->transaction.lock_count == + afr_lock_server_count (priv, local->transaction.type))) { + + /* we're done locking */ + + if (__changelog_needed_pre_op (frame, this)) { + afr_changelog_pre_op (frame, this); + } else { + local->transaction.fop (frame, this); + } + + return 0; + } + + switch (local->transaction.type) { + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: + case AFR_FLUSH_TRANSACTION: + + if (local->fd) { + STACK_WIND_COOKIE (frame, afr_lock_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->finodelk, + local->fd, F_SETLKW, &flock); + + } else { + STACK_WIND_COOKIE (frame, afr_lock_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->inodelk, + &local->loc, F_SETLKW, &flock); + } + + break; + + case AFR_ENTRY_RENAME_TRANSACTION: + { + local->call_count = 2; + + lower = lower_path (&local->transaction.parent_loc, + local->transaction.basename, + &local->transaction.new_parent_loc, + local->transaction.new_basename); + + lower_name = (lower == &local->transaction.parent_loc ? + local->transaction.basename : + local->transaction.new_basename); + + higher = (lower == &local->transaction.parent_loc ? + &local->transaction.new_parent_loc : + &local->transaction.parent_loc); + + higher_name = (higher == &local->transaction.parent_loc ? + local->transaction.basename : + local->transaction.new_basename); + + + /* TODO: these locks should be blocking */ + + STACK_WIND_COOKIE (frame, afr_lock_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->entrylk, + lower, lower_name, + ENTRYLK_LOCK, ENTRYLK_WRLCK); + + STACK_WIND_COOKIE (frame, afr_lock_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->entrylk, + higher, higher_name, + ENTRYLK_LOCK, ENTRYLK_WRLCK); + + break; + } + + case AFR_ENTRY_TRANSACTION: + if (local->fd) { + STACK_WIND_COOKIE (frame, afr_lock_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->fentrylk, + local->fd, + local->transaction.basename, + ENTRYLK_LOCK, ENTRYLK_WRLCK); + } else { + STACK_WIND_COOKIE (frame, afr_lock_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->entrylk, + &local->transaction.parent_loc, + local->transaction.basename, + ENTRYLK_LOCK, ENTRYLK_WRLCK); + } + + break; + } + + return 0; +} + + +int32_t afr_lock (call_frame_t *frame, xlator_t *this) +{ + return afr_lock_rec (frame, this, 0); +} + + +/* }}} */ + +int32_t +afr_transaction_resume (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + local = frame->local; + priv = this->private; + + if (__changelog_needed_post_op (frame, this)) { + afr_changelog_post_op (frame, this); + } else { + if (afr_lock_server_count (priv, local->transaction.type) == 0) { + local->transaction.done (frame, this); + } else { + afr_unlock (frame, this); + } + } + + return 0; +} + + +/** + * afr_transaction_child_died - inform that a child died during an fop + */ + +void +afr_transaction_child_died (call_frame_t *frame, xlator_t *this, int child_index) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + local = frame->local; + priv = this->private; + + __mark_child_dead (local->pending_array, priv->child_count, child_index); +} + + +int32_t +afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + local = frame->local; + priv = this->private; + + afr_transaction_local_init (local, priv); + + local->transaction.resume = afr_transaction_resume; + local->transaction.type = type; + + if (afr_lock_server_count (priv, local->transaction.type) == 0) { + if (__changelog_needed_pre_op (frame, this)) { + afr_changelog_pre_op (frame, this); + } else { + local->transaction.fop (frame, this); + } + } else { + afr_lock (frame, this); + } + + return 0; +} diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h new file mode 100644 index 000000000..49cdd219f --- /dev/null +++ b/xlators/cluster/afr/src/afr-transaction.h @@ -0,0 +1,36 @@ +/* + Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef __TRANSACTION_H__ +#define __TRANSACTION_H__ + +#define AFR_METADATA_PENDING "trusted.glusterfs.afr.metadata-pending" + +#define AFR_DATA_PENDING "trusted.glusterfs.afr.data-pending" + +#define AFR_ENTRY_PENDING "trusted.glusterfs.afr.entry-pending" + +void +afr_transaction_child_died (call_frame_t *frame, xlator_t *this, + int child_index); + +int32_t +afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type); + +#endif /* __TRANSACTION_H__ */ diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c new file mode 100644 index 000000000..e4c1a8479 --- /dev/null +++ b/xlators/cluster/afr/src/afr.c @@ -0,0 +1,2338 @@ +/* + Copyright (c) 2007, 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "afr.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" +#include "byte-order.h" + +#include "afr-inode-read.h" +#include "afr-inode-write.h" +#include "afr-dir-read.h" +#include "afr-dir-write.h" +#include "afr-transaction.h" + +#include "afr-self-heal.h" + + +/** + * afr_local_cleanup - cleanup everything in frame->local + */ + +void +afr_local_sh_cleanup (afr_local_t *local, xlator_t *this) +{ + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int i = 0; + + + sh = &local->self_heal; + priv = this->private; + + if (sh->buf) + FREE (sh->buf); + + if (sh->xattr) { + for (i = 0; i < priv->child_count; i++) { + if (sh->xattr[i]) { + dict_unref (sh->xattr[i]); + sh->xattr[i] = NULL; + } + } + FREE (sh->xattr); + } + + if (sh->child_errno) + FREE (sh->child_errno); + + if (sh->pending_matrix) { + for (i = 0; i < priv->child_count; i++) { + FREE (sh->pending_matrix[i]); + } + FREE (sh->pending_matrix); + } + + if (sh->delta_matrix) { + for (i = 0; i < priv->child_count; i++) { + FREE (sh->delta_matrix[i]); + } + FREE (sh->delta_matrix); + } + + if (sh->sources) + FREE (sh->sources); + + if (sh->success) + FREE (sh->success); + + if (sh->healing_fd) { + fd_unref (sh->healing_fd); + sh->healing_fd = NULL; + } + + loc_wipe (&sh->parent_loc); +} + + +void +afr_local_cleanup (afr_local_t *local, xlator_t *this) +{ + if (!local) + return; + + afr_local_sh_cleanup (local, this); + + FREE (local->child_errno); + FREE (local->pending_array); + + loc_wipe (&local->loc); + loc_wipe (&local->newloc); + + FREE (local->transaction.locked_nodes); + FREE (local->transaction.child_errno); + + FREE (local->transaction.basename); + FREE (local->transaction.new_basename); + + loc_wipe (&local->transaction.parent_loc); + loc_wipe (&local->transaction.new_parent_loc); + + if (local->fd) + fd_unref (local->fd); + + if (local->xattr_req) + dict_unref (local->xattr_req); + + FREE (local->child_up); + + { /* lookup */ + if (local->cont.lookup.xattr) + dict_unref (local->cont.lookup.xattr); + } + + { /* getxattr */ + if (local->cont.getxattr.name) + FREE (local->cont.getxattr.name); + } + + { /* lk */ + if (local->cont.lk.locked_nodes) + FREE (local->cont.lk.locked_nodes); + } + + { /* checksum */ + if (local->cont.checksum.file_checksum) + FREE (local->cont.checksum.file_checksum); + if (local->cont.checksum.dir_checksum) + FREE (local->cont.checksum.dir_checksum); + } + + { /* create */ + if (local->cont.create.fd) + fd_unref (local->cont.create.fd); + } + + { /* writev */ + FREE (local->cont.writev.vector); + } + + { /* setxattr */ + if (local->cont.setxattr.dict) + dict_unref (local->cont.setxattr.dict); + } + + { /* removexattr */ + FREE (local->cont.removexattr.name); + } + + { /* symlink */ + FREE (local->cont.symlink.linkpath); + } +} + + +int +afr_frame_return (call_frame_t *frame) +{ + afr_local_t *local = NULL; + int call_count = 0; + + local = frame->local; + + LOCK (&frame->lock); + { + call_count = --local->call_count; + } + UNLOCK (&frame->lock); + + return call_count; +} + +/** + * first_up_child - return the index of the first child that is up + */ + +int +afr_first_up_child (afr_private_t *priv) +{ + xlator_t ** children = NULL; + int ret = -1; + int i = 0; + + LOCK (&priv->lock); + { + children = priv->children; + for (i = 0; i < priv->child_count; i++) { + if (priv->child_up[i]) { + ret = i; + break; + } + } + } + UNLOCK (&priv->lock); + + return ret; +} + + +/** + * up_children_count - return the number of children that are up + */ + +int +afr_up_children_count (int child_count, unsigned char *child_up) +{ + int i = 0; + int ret = 0; + + for (i = 0; i < child_count; i++) + if (child_up[i]) + ret++; + return ret; +} + + +int +afr_locked_nodes_count (unsigned char *locked_nodes, int child_count) +{ + int ret = 0; + int i; + + for (i = 0; i < child_count; i++) + if (locked_nodes[i]) + ret++; + + return ret; +} + + +ino64_t +afr_itransform (ino64_t ino, int child_count, int child_index) +{ + ino64_t scaled_ino = -1; + + if (ino == ((uint64_t) -1)) { + scaled_ino = ((uint64_t) -1); + goto out; + } + + scaled_ino = (ino * child_count) + child_index; + +out: + return scaled_ino; +} + + +int +afr_deitransform_orig (ino64_t ino, int child_count) +{ + int index = -1; + + index = ino % child_count; + + return index; +} + + +int +afr_deitransform (ino64_t ino, int child_count) +{ + return 0; +} + + +int +afr_self_heal_cbk (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + int ret = -1; + + local = frame->local; + + if (local->govinda_gOvinda) { + ret = inode_ctx_put (local->cont.lookup.inode, this, 1); + + if (ret < 0) { + local->op_ret = -1; + local->op_errno = -ret; + } + } else { + inode_ctx_del (local->cont.lookup.inode, this, NULL); + } + + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, + local->cont.lookup.inode, + &local->cont.lookup.buf, + local->cont.lookup.xattr); + + return 0; +} + + +int +afr_lookup_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *buf, dict_t *xattr) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + struct stat * lookup_buf = NULL; + int call_count = -1; + int child_index = -1; + int prev_child_index = -1; + uint32_t open_fd_count = 0; + int ret = 0; + + child_index = (long) cookie; + priv = this->private; + + LOCK (&frame->lock); + { + local = frame->local; + + lookup_buf = &local->cont.lookup.buf; + + if (op_ret == -1) { + if (op_errno == ENOENT) + local->enoent_count++; + + if (op_errno != ENOTCONN) + local->op_errno = op_errno; + + goto unlock; + } + + if (afr_sh_has_metadata_pending (xattr, child_index, this)) + local->need_metadata_self_heal = 1; + + if (afr_sh_has_entry_pending (xattr, child_index, this)) + local->need_entry_self_heal = 1; + + if (afr_sh_has_data_pending (xattr, child_index, this)) + local->need_data_self_heal = 1; + + ret = dict_get_uint32 (xattr, GLUSTERFS_OPEN_FD_COUNT, + &open_fd_count); + local->open_fd_count += open_fd_count; + + /* in case of revalidate, we need to send stat of the + * child whose stat was sent during the first lookup. + * (so that time stamp does not vary with revalidate. + * in case it is down, stat of the fist success will + * be replied */ + + /* inode number should be preserved across revalidates */ + + if (local->success_count == 0) { + local->op_ret = op_ret; + + local->cont.lookup.inode = inode; + local->cont.lookup.xattr = dict_ref (xattr); + + *lookup_buf = *buf; + lookup_buf->st_ino = afr_itransform (buf->st_ino, + priv->child_count, + child_index); + } else { + if (FILETYPE_DIFFERS (buf, lookup_buf)) { + /* mismatching filetypes with same name + -- Govinda !! GOvinda !!! + */ + local->govinda_gOvinda = 1; + } + + if (PERMISSION_DIFFERS (buf, lookup_buf)) { + /* mismatching permissions */ + local->need_metadata_self_heal = 1; + } + + if (OWNERSHIP_DIFFERS (buf, lookup_buf)) { + /* mismatching permissions */ + local->need_metadata_self_heal = 1; + } + + if (SIZE_DIFFERS (buf, lookup_buf) + && S_ISREG (buf->st_mode)) { + local->need_data_self_heal = 1; + } + + prev_child_index = afr_deitransform_orig (lookup_buf->st_ino, + priv->child_count); + if (child_index < prev_child_index) { + *lookup_buf = *buf; + lookup_buf->st_ino = afr_itransform (buf->st_ino, + priv->child_count, + child_index); + } + } + + local->success_count++; + } +unlock: + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + if (local->op_ret == 0) { + /* KLUDGE: assuming DHT will not itransform in + revalidate */ + if (local->cont.lookup.inode->ino) + lookup_buf->st_ino = + local->cont.lookup.inode->ino; + } + + if (local->success_count && local->enoent_count) { + local->need_metadata_self_heal = 1; + local->need_data_self_heal = 1; + local->need_entry_self_heal = 1; + } + + if (local->success_count) { + /* check for govinda_gOvinda case in previous lookup */ + if (!inode_ctx_get (local->cont.lookup.inode, + this, NULL)) + local->need_data_self_heal = 1; + } + + if ((local->need_metadata_self_heal + || local->need_data_self_heal + || local->need_entry_self_heal) + && (!local->open_fd_count)) { + + if (!local->cont.lookup.inode->st_mode) { + /* fix for RT #602 */ + local->cont.lookup.inode->st_mode = + lookup_buf->st_mode; + } + + afr_self_heal (frame, this, afr_self_heal_cbk); + } else { + AFR_STACK_UNWIND (frame, local->op_ret, + local->op_errno, + local->cont.lookup.inode, + &local->cont.lookup.buf, + local->cont.lookup.xattr); + } + } + + return 0; +} + + +int +afr_lookup (call_frame_t *frame, xlator_t *this, + loc_t *loc, dict_t *xattr_req) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int ret = -1; + int i = 0; + int32_t op_errno = 0; + + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + local->op_ret = -1; + + frame->local = local; + + loc_copy (&local->loc, loc); + + local->reval_child_index = 0; + + local->call_count = priv->child_count; + + local->child_up = memdup (priv->child_up, priv->child_count); + local->child_count = afr_up_children_count (priv->child_count, + local->child_up); + + /* By default assume ENOTCONN. On success it will be set to 0. */ + local->op_errno = ENOTCONN; + + if ((xattr_req == NULL) + && (priv->metadata_self_heal + || priv->data_self_heal + || priv->entry_self_heal)) + local->xattr_req = dict_new (); + else + local->xattr_req = dict_ref (xattr_req); + + if (priv->metadata_self_heal) { + ret = dict_set_uint64 (local->xattr_req, AFR_METADATA_PENDING, + priv->child_count * sizeof(int32_t)); + } + + if (priv->data_self_heal) { + ret = dict_set_uint64 (local->xattr_req, AFR_DATA_PENDING, + priv->child_count * sizeof(int32_t)); + } + + if (priv->entry_self_heal) { + ret = dict_set_uint64 (local->xattr_req, AFR_ENTRY_PENDING, + priv->child_count * sizeof(int32_t)); + } + + ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_OPEN_FD_COUNT, 0); + + for (i = 0; i < priv->child_count; i++) { + STACK_WIND_COOKIE (frame, afr_lookup_cbk, (void *) (long) i, + priv->children[i], + priv->children[i]->fops->lookup, + loc, local->xattr_req); + } + + ret = 0; +out: + if (ret == -1) + AFR_STACK_UNWIND (frame, -1, ENOMEM, NULL, NULL, NULL); + + return 0; +} + + +/* {{{ open */ + +int +afr_open_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_local_t * local = frame->local; + + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, + local->fd); + return 0; +} + + +int +afr_open_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + fd_t *fd) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + + priv = this->private; + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + } + + if (op_ret >= 0) { + local->op_ret = op_ret; + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + if ((local->cont.open.flags & O_TRUNC) + && (local->op_ret >= 0)) { + STACK_WIND (frame, afr_open_ftruncate_cbk, + this, this->fops->ftruncate, + fd, 0); + } else { + AFR_STACK_UNWIND (frame, local->op_ret, + local->op_errno, local->fd); + } + } + + return 0; +} + + +int +afr_open (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t flags, fd_t *fd) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + + int i = 0; + int ret = -1; + + int32_t call_count = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + int32_t wind_flags = flags & (~O_TRUNC); + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + VALIDATE_OR_GOTO (loc, out); + + priv = this->private; + + ret = inode_ctx_get (loc->inode, this, NULL); + if (ret == 0) { + /* if ctx is set it means self-heal failed */ + + gf_log (this->name, GF_LOG_WARNING, + "returning EIO, file has to be manually corrected " + "in backend"); + op_errno = EIO; + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + frame->local = local; + call_count = local->call_count; + + local->cont.open.flags = flags; + local->fd = fd_ref (fd); + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_open_cbk, (void *) (long) i, + priv->children[i], + priv->children[i]->fops->open, + loc, wind_flags, fd); + + if (!--call_count) + break; + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, fd); + } + + return 0; +} + +/* }}} */ + +/* {{{ flush */ + +int +afr_flush_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t * local = NULL; + + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == 0) + local->op_ret = 0; + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_flush_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int i = 0; + int call_count = -1; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_flush_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->flush, + local->fd); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_flush_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); + + return 0; +} + + +int +afr_simple_flush_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local = NULL; + + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == 0) + local->op_ret = 0; + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); + + return 0; +} + + +static int +__is_fd_ctx_set (xlator_t *this, fd_t *fd) +{ + int _ret = 0; + int op_ret = 0; + + _ret = fd_ctx_get (fd, this, NULL); + if (_ret == 0) + op_ret = 1; + + return op_ret; +} + + +int +afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + + int ret = -1; + int i = 0; + int call_count = 0; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + frame->local = local; + + if (__is_fd_ctx_set (this, fd)) { + local->op = GF_FOP_FLUSH; + local->transaction.fop = afr_flush_wind; + local->transaction.done = afr_flush_done; + + local->fd = fd_ref (fd); + + local->transaction.start = 0; + local->transaction.len = 0; + + local->transaction.pending = AFR_DATA_PENDING; + + afr_transaction (frame, this, AFR_FLUSH_TRANSACTION); + } else { + /* + * if fd's ctx is not set, then there is no need + * to erase changelog. So just send the flush + */ + + call_count = local->call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_simple_flush_cbk, + priv->children[i], + priv->children[i]->fops->flush, + fd); + + if (!--call_count) + break; + } + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + +/* }}} */ + +/* {{{ fsync */ + +int +afr_fsync_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local = NULL; + + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == 0) + local->op_ret = 0; + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); + + return 0; +} + + +int +afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, + int32_t datasync) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + int ret = -1; + + int i = 0; + int32_t call_count = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + call_count = local->call_count; + frame->local = local; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_fsync_cbk, + priv->children[i], + priv->children[i]->fops->fsync, + fd, datasync); + if (!--call_count) + break; + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + return 0; +} + +/* }}} */ + +/* {{{ fsync */ + +int32_t +afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local = NULL; + + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == 0) + local->op_ret = 0; + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); + + return 0; +} + + +int32_t +afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, + int32_t datasync) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + int ret = -1; + + int i = 0; + int32_t call_count = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + call_count = local->call_count; + frame->local = local; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_fsync_cbk, + priv->children[i], + priv->children[i]->fops->fsyncdir, + fd, datasync); + if (!--call_count) + break; + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + return 0; +} + +/* }}} */ + +/* {{{ xattrop */ + +int32_t +afr_xattrop_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *xattr) +{ + afr_local_t *local = NULL; + + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == 0) + local->op_ret = 0; + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, xattr); + + return 0; +} + + +int32_t +afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, + gf_xattrop_flags_t optype, dict_t *xattr) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + int ret = -1; + + int i = 0; + int32_t call_count = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + call_count = local->call_count; + frame->local = local; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_xattrop_cbk, + priv->children[i], + priv->children[i]->fops->xattrop, + loc, optype, xattr); + if (!--call_count) + break; + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + return 0; +} + +/* }}} */ + +/* {{{ fxattrop */ + +int32_t +afr_fxattrop_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *xattr) +{ + afr_local_t *local = NULL; + + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == 0) + local->op_ret = 0; + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, xattr); + + return 0; +} + + +int32_t +afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, + gf_xattrop_flags_t optype, dict_t *xattr) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + int ret = -1; + + int i = 0; + int32_t call_count = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + call_count = local->call_count; + frame->local = local; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_fxattrop_cbk, + priv->children[i], + priv->children[i]->fops->fxattrop, + fd, optype, xattr); + if (!--call_count) + break; + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + return 0; +} + +/* }}} */ + + +int32_t +afr_inodelk_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) + +{ + afr_local_t *local = NULL; + + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == 0) + local->op_ret = 0; + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); + + return 0; +} + + +int32_t +afr_inodelk (call_frame_t *frame, xlator_t *this, loc_t *loc, + int32_t cmd, struct flock *flock) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + int ret = -1; + + int i = 0; + int32_t call_count = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + call_count = local->call_count; + frame->local = local; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_inodelk_cbk, + priv->children[i], + priv->children[i]->fops->inodelk, + loc, cmd, flock); + + if (!--call_count) + break; + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + return 0; +} + + +int32_t +afr_finodelk_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) + +{ + afr_local_t *local = NULL; + + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == 0) + local->op_ret = 0; + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); + + return 0; +} + + +int32_t +afr_finodelk (call_frame_t *frame, xlator_t *this, fd_t *fd, + int32_t cmd, struct flock *flock) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + int ret = -1; + + int i = 0; + int32_t call_count = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + call_count = local->call_count; + frame->local = local; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_finodelk_cbk, + priv->children[i], + priv->children[i]->fops->finodelk, + fd, cmd, flock); + + if (!--call_count) + break; + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + return 0; +} + + +int32_t +afr_entrylk_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) + +{ + afr_local_t *local = NULL; + + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == 0) + local->op_ret = 0; + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); + + return 0; +} + + +int32_t +afr_entrylk (call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *basename, entrylk_cmd cmd, entrylk_type type) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + int ret = -1; + + int i = 0; + int32_t call_count = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + call_count = local->call_count; + frame->local = local; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_entrylk_cbk, + priv->children[i], + priv->children[i]->fops->entrylk, + loc, basename, cmd, type); + + if (!--call_count) + break; + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + return 0; +} + + + +int32_t +afr_fentrylk_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) + +{ + afr_local_t *local = NULL; + + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == 0) + local->op_ret = 0; + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); + + return 0; +} + + +int32_t +afr_fentrylk (call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *basename, entrylk_cmd cmd, entrylk_type type) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + int ret = -1; + + int i = 0; + int32_t call_count = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + call_count = local->call_count; + frame->local = local; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_fentrylk_cbk, + priv->children[i], + priv->children[i]->fops->fentrylk, + fd, basename, cmd, type); + + if (!--call_count) + break; + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + return 0; +} + + +int32_t +afr_checksum_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + uint8_t *file_checksum, uint8_t *dir_checksum) + +{ + afr_local_t *local = NULL; + + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == 0 && (local->op_ret != 0)) { + local->op_ret = 0; + + local->cont.checksum.file_checksum = MALLOC (ZR_FILENAME_MAX); + memcpy (local->cont.checksum.file_checksum, file_checksum, + ZR_FILENAME_MAX); + + local->cont.checksum.dir_checksum = MALLOC (ZR_FILENAME_MAX); + memcpy (local->cont.checksum.dir_checksum, dir_checksum, + ZR_FILENAME_MAX); + + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, + local->cont.checksum.file_checksum, + local->cont.checksum.dir_checksum); + + return 0; +} + + +int32_t +afr_checksum (call_frame_t *frame, xlator_t *this, loc_t *loc, + int32_t flag) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + int ret = -1; + + int i = 0; + int32_t call_count = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + call_count = local->call_count; + frame->local = local; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_checksum_cbk, + priv->children[i], + priv->children[i]->fops->checksum, + loc, flag); + + if (!--call_count) + break; + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + return 0; +} + + +int32_t +afr_statfs_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + struct statvfs *statvfs) +{ + afr_local_t *local = NULL; + + int call_count = 0; + + LOCK (&frame->lock); + { + local = frame->local; + + if (op_ret == 0) { + local->op_ret = op_ret; + + if (local->cont.statfs.buf_set) { + if (statvfs->f_bavail < local->cont.statfs.buf.f_bavail) + local->cont.statfs.buf = *statvfs; + } else { + local->cont.statfs.buf = *statvfs; + local->cont.statfs.buf_set = 1; + } + } + + if (op_ret == -1) + local->op_errno = op_errno; + + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, + &local->cont.statfs.buf); + + return 0; +} + + +int32_t +afr_statfs (call_frame_t *frame, xlator_t *this, + loc_t *loc) +{ + afr_private_t * priv = NULL; + int child_count = 0; + afr_local_t * local = NULL; + int i = 0; + + int ret = -1; + int call_count = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + VALIDATE_OR_GOTO (loc, out); + + priv = this->private; + child_count = priv->child_count; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + frame->local = local; + call_count = local->call_count; + + for (i = 0; i < child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_statfs_cbk, + priv->children[i], + priv->children[i]->fops->statfs, + loc); + if (!--call_count) + break; + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + return 0; +} + + +int32_t +afr_lk_unlock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct flock *lock) +{ + afr_local_t * local = NULL; + + int call_count = -1; + + local = frame->local; + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, + lock); + + return 0; +} + + +int32_t +afr_lk_unlock (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int i; + int call_count = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_locked_nodes_count (local->cont.lk.locked_nodes, + priv->child_count); + + if (call_count == 0) { + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, + &local->cont.lk.flock); + return 0; + } + + local->call_count = call_count; + + local->cont.lk.flock.l_type = F_UNLCK; + + for (i = 0; i < priv->child_count; i++) { + if (local->cont.lk.locked_nodes[i]) { + STACK_WIND (frame, afr_lk_unlock_cbk, + priv->children[i], + priv->children[i]->fops->lk, + local->fd, F_SETLK, + &local->cont.lk.flock); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int32_t +afr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct flock *lock) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int child_index = -1; + + local = frame->local; + priv = this->private; + + child_index = (long) cookie; + + call_count = --local->call_count; + + if (!child_went_down (op_ret, op_errno) && (op_ret == -1)) { + local->op_ret = -1; + local->op_errno = op_errno; + + afr_lk_unlock (frame, this); + return 0; + } + + if (op_ret == 0) { + local->op_ret = 0; + local->op_errno = 0; + local->cont.lk.flock = *lock; + local->cont.lk.locked_nodes[child_index] = 1; + } + + child_index++; + + if (child_index < priv->child_count) { + STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->lk, + local->fd, local->cont.lk.cmd, + &local->cont.lk.flock); + } else if (local->op_ret == -1) { + /* all nodes have gone down */ + + AFR_STACK_UNWIND (frame, -1, ENOTCONN, &local->cont.lk.flock); + } else { + /* locking has succeeded on all nodes that are up */ + + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, + &local->cont.lk.flock); + } + + return 0; +} + + +int +afr_lk (call_frame_t *frame, xlator_t *this, + fd_t *fd, int32_t cmd, + struct flock *flock) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + int i = 0; + + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_INIT (local, priv); + + frame->local = local; + + local->cont.lk.locked_nodes = CALLOC (priv->child_count, + sizeof (*local->cont.lk.locked_nodes)); + + if (!local->cont.lk.locked_nodes) { + gf_log (this->name, GF_LOG_ERROR, "out of memory :("); + op_errno = ENOMEM; + goto out; + } + + local->fd = fd_ref (fd); + local->cont.lk.cmd = cmd; + local->cont.lk.flock = *flock; + + STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) 0, + priv->children[i], + priv->children[i]->fops->lk, + fd, cmd, flock); + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + return 0; +} + + +/** + * find_child_index - find the child's index in the array of subvolumes + * @this: AFR + * @child: child + */ + +static int +find_child_index (xlator_t *this, xlator_t *child) +{ + afr_private_t *priv = NULL; + + int i = -1; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if ((xlator_t *) child == priv->children[i]) + break; + } + + return i; +} + + +int32_t +notify (xlator_t *this, int32_t event, + void *data, ...) +{ + afr_private_t * priv = NULL; + unsigned char * child_up = NULL; + + int i = -1; + int up_children = 0; + + priv = this->private; + + if (!priv) + return 0; + + child_up = priv->child_up; + + switch (event) { + case GF_EVENT_CHILD_UP: + i = find_child_index (this, data); + + child_up[i] = 1; + + /* + if all the children were down, and one child came up, + send notify to parent + */ + + for (i = 0; i < priv->child_count; i++) + if (child_up[i]) + up_children++; + + if (up_children == 1) + default_notify (this, event, data); + + break; + + case GF_EVENT_CHILD_DOWN: + i = find_child_index (this, data); + + child_up[i] = 0; + + /* + if all children are down, and this was the last to go down, + send notify to parent + */ + + for (i = 0; i < priv->child_count; i++) + if (child_up[i]) + up_children++; + + if (up_children == 0) + default_notify (this, event, data); + + break; + + default: + default_notify (this, event, data); + } + + return 0; +} + + +static const char *favorite_child_warning_str = "You have specified subvolume '%s' " + "as the 'favorite child'. This means that if a discrepancy in the content " + "or attributes (ownership, permission, etc.) of a file is detected among " + "the subvolumes, the file on '%s' will be considered the definitive " + "version and its contents will OVERWRITE the contents of the file on other " + "subvolumes. All versions of the file except that on '%s' " + "WILL BE LOST."; + +static const char *no_lock_servers_warning_str = "You have set lock-server-count = 0. " + "This means correctness is NO LONGER GUARANTEED in all cases. If two or more " + "applications write to the same region of a file, there is a possibility that " + "its copies will be INCONSISTENT. Set it to a value greater than 0 unless you " + "are ABSOLUTELY SURE of what you are doing and WILL NOT HOLD GlusterFS " + "RESPOSIBLE for inconsistent data. If you are in doubt, set it to a value " + "greater than 0."; + +int32_t +init (xlator_t *this) +{ + afr_private_t * priv = NULL; + int child_count = 0; + xlator_list_t * trav = NULL; + int i = 0; + int ret = -1; + int op_errno = 0; + + char * read_subvol = NULL; + char * fav_child = NULL; + char * self_heal = NULL; + char * change_log = NULL; + + int32_t lock_server_count = 1; + + int fav_ret = -1; + int read_ret = -1; + int dict_ret = -1; + + if (!this->children) { + gf_log (this->name, GF_LOG_ERROR, + "AFR needs more than one child defined"); + return -1; + } + + if (!this->parents) { + gf_log (this->name, GF_LOG_WARNING, + "dangling volume. check volfile "); + } + + ALLOC_OR_GOTO (this->private, afr_private_t, out); + + priv = this->private; + + read_ret = dict_get_str (this->options, "read-subvolume", &read_subvol); + priv->read_child = -1; + + fav_ret = dict_get_str (this->options, "favorite-child", &fav_child); + priv->favorite_child = -1; + + /* Default values */ + + priv->data_self_heal = 1; + priv->metadata_self_heal = 1; + priv->entry_self_heal = 1; + + dict_ret = dict_get_str (this->options, "data-self-heal", &self_heal); + if (dict_ret == 0) { + ret = gf_string2boolean (self_heal, &priv->data_self_heal); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "invalid 'option data-self-heal %s' " + "defaulting to data-self-heal as 'on'", + self_heal); + priv->data_self_heal = 1; + } + } + + dict_ret = dict_get_str (this->options, "metadata-self-heal", + &self_heal); + if (dict_ret == 0) { + ret = gf_string2boolean (self_heal, &priv->metadata_self_heal); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "invalid 'option metadata-self-heal %s' " + "defaulting to metadata-self-heal as 'on'", + self_heal); + priv->metadata_self_heal = 1; + } + } + + dict_ret = dict_get_str (this->options, "entry-self-heal", &self_heal); + if (dict_ret == 0) { + ret = gf_string2boolean (self_heal, &priv->entry_self_heal); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "invalid 'option entry-self-heal %s' " + "defaulting to entry-self-heal as 'on'", + self_heal); + priv->entry_self_heal = 1; + } + } + + /* Change log options */ + + priv->data_change_log = 1; + priv->metadata_change_log = 0; + priv->entry_change_log = 1; + + dict_ret = dict_get_str (this->options, "data-change-log", + &change_log); + if (dict_ret == 0) { + ret = gf_string2boolean (change_log, &priv->data_change_log); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "invalid 'option data-change-log %s'. " + "defaulting to data-change-log as 'on'", + change_log); + priv->data_change_log = 1; + } + } + + dict_ret = dict_get_str (this->options, "metadata-change-log", + &change_log); + if (dict_ret == 0) { + ret = gf_string2boolean (change_log, + &priv->metadata_change_log); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "invalid 'option metadata-change-log %s'. " + "defaulting to metadata-change-log as 'off'", + change_log); + priv->metadata_change_log = 0; + } + } + + dict_ret = dict_get_str (this->options, "entry-change-log", + &change_log); + if (dict_ret == 0) { + ret = gf_string2boolean (change_log, &priv->entry_change_log); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "invalid 'option entry-change-log %s'. " + "defaulting to entry-change-log as 'on'", + change_log); + priv->entry_change_log = 1; + } + } + + /* Locking options */ + + priv->data_lock_server_count = 1; + priv->metadata_lock_server_count = 0; + priv->entry_lock_server_count = 1; + + dict_ret = dict_get_int32 (this->options, "data-lock-server-count", + &lock_server_count); + if (dict_ret == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "setting data lock server count to %d", + lock_server_count); + + if (lock_server_count == 0) + gf_log (this->name, GF_LOG_WARNING, + no_lock_servers_warning_str); + + priv->data_lock_server_count = lock_server_count; + } + + + dict_ret = dict_get_int32 (this->options, + "metadata-lock-server-count", + &lock_server_count); + if (dict_ret == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "setting metadata lock server count to %d", + lock_server_count); + priv->metadata_lock_server_count = lock_server_count; + } + + + dict_ret = dict_get_int32 (this->options, "entry-lock-server-count", + &lock_server_count); + if (dict_ret == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "setting entry lock server count to %d", + lock_server_count); + + priv->entry_lock_server_count = lock_server_count; + } + + + trav = this->children; + while (trav) { + if (!read_ret && !strcmp (read_subvol, trav->xlator->name)) { + gf_log (this->name, GF_LOG_DEBUG, + "subvolume '%s' specified as read child", + trav->xlator->name); + + priv->read_child = child_count; + } + + if (fav_ret == 0 && !strcmp (fav_child, trav->xlator->name)) { + gf_log (this->name, GF_LOG_WARNING, + favorite_child_warning_str, trav->xlator->name, + trav->xlator->name, trav->xlator->name); + priv->favorite_child = child_count; + } + + child_count++; + trav = trav->next; + } + + /* XXX: return inode numbers from 1st subvolume till + afr supports read-subvolume based on inode's ctx + (and not itransform) for this reason afr_deitransform() + returns 0 always + */ + priv->read_child = 0; + + priv->wait_count = 1; + + priv->child_count = child_count; + LOCK_INIT (&priv->lock); + + priv->child_up = CALLOC (sizeof (unsigned char), child_count); + if (!priv->child_up) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + op_errno = ENOMEM; + goto out; + } + + priv->children = CALLOC (sizeof (xlator_t *), child_count); + if (!priv->children) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + op_errno = ENOMEM; + goto out; + } + + trav = this->children; + i = 0; + while (i < child_count) { + priv->children[i] = trav->xlator; + + trav = trav->next; + i++; + } + + ret = 0; +out: + return ret; +} + + +int +fini (xlator_t *this) +{ + return 0; +} + + +struct xlator_fops fops = { + .lookup = afr_lookup, + .open = afr_open, + .lk = afr_lk, + .flush = afr_flush, + .statfs = afr_statfs, + .fsync = afr_fsync, + .fsyncdir = afr_fsyncdir, + .xattrop = afr_xattrop, + .fxattrop = afr_fxattrop, + .inodelk = afr_inodelk, + .finodelk = afr_finodelk, + .entrylk = afr_entrylk, + .fentrylk = afr_fentrylk, + .checksum = afr_checksum, + + /* inode read */ + .access = afr_access, + .stat = afr_stat, + .fstat = afr_fstat, + .readlink = afr_readlink, + .getxattr = afr_getxattr, + .readv = afr_readv, + + /* inode write */ + .chmod = afr_chmod, + .chown = afr_chown, + .fchmod = afr_fchmod, + .fchown = afr_fchown, + .writev = afr_writev, + .truncate = afr_truncate, + .ftruncate = afr_ftruncate, + .utimens = afr_utimens, + .setxattr = afr_setxattr, + .removexattr = afr_removexattr, + + /* dir read */ + .opendir = afr_opendir, + .readdir = afr_readdir, + .getdents = afr_getdents, + + /* dir write */ + .create = afr_create, + .mknod = afr_mknod, + .mkdir = afr_mkdir, + .unlink = afr_unlink, + .rmdir = afr_rmdir, + .link = afr_link, + .symlink = afr_symlink, + .rename = afr_rename, + .setdents = afr_setdents, +}; + + +struct xlator_mops mops = { +}; + + +struct xlator_cbks cbks = { +}; + +struct volume_options options[] = { + { .key = {"read-subvolume" }, + .type = GF_OPTION_TYPE_XLATOR + }, + { .key = {"favorite-child"}, + .type = GF_OPTION_TYPE_XLATOR + }, + { .key = {"data-self-heal"}, + .type = GF_OPTION_TYPE_BOOL + }, + { .key = {"metadata-self-heal"}, + .type = GF_OPTION_TYPE_BOOL + }, + { .key = {"entry-self-heal"}, + .type = GF_OPTION_TYPE_BOOL + }, + { .key = {"data-change-log"}, + .type = GF_OPTION_TYPE_BOOL + }, + { .key = {"metadata-change-log"}, + .type = GF_OPTION_TYPE_BOOL + }, + { .key = {"entry-change-log"}, + .type = GF_OPTION_TYPE_BOOL + }, + { .key = {"data-lock-server-count"}, + .type = GF_OPTION_TYPE_INT, + .min = 0 + }, + { .key = {"metadata-lock-server-count"}, + .type = GF_OPTION_TYPE_INT, + .min = 0 + }, + { .key = {"entry-lock-server-count"}, + .type = GF_OPTION_TYPE_INT, + .min = 0 + }, + { .key = {NULL} }, +}; diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h new file mode 100644 index 000000000..4cf6cdf9d --- /dev/null +++ b/xlators/cluster/afr/src/afr.h @@ -0,0 +1,523 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + + +#ifndef __AFR_H__ +#define __AFR_H__ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "scheduler.h" +#include "call-stub.h" +#include "compat-errno.h" + + +typedef struct _afr_private { + gf_lock_t lock; /* to guard access to child_count, etc */ + unsigned int child_count; /* total number of children */ + + xlator_t **children; + + unsigned char *child_up; + + gf_boolean_t data_self_heal; /* on/off */ + gf_boolean_t metadata_self_heal; /* on/off */ + gf_boolean_t entry_self_heal; /* on/off */ + + + gf_boolean_t data_change_log; /* on/off */ + gf_boolean_t metadata_change_log; /* on/off */ + gf_boolean_t entry_change_log; /* on/off */ + + unsigned int read_child; /* read-subvolume */ + unsigned int favorite_child; /* subvolume to be preferred in resolving + split-brain cases */ + + unsigned int data_lock_server_count; + unsigned int metadata_lock_server_count; + unsigned int entry_lock_server_count; + + unsigned int wait_count; /* # of servers to wait for success */ +} afr_private_t; + +typedef struct { + /* array of stat's, one for each child */ + struct stat *buf; + + /* array of xattr's, one for each child */ + dict_t **xattr; + + /* array of errno's, one for each child */ + int *child_errno; + + int32_t **pending_matrix; + int32_t **delta_matrix; + + int *sources; + int source; + int active_source; + int active_sinks; + int *success; + + fd_t *healing_fd; + int op_failed; + + int file_has_holes; + blksize_t block_size; + off_t file_size; + off_t offset; + + loc_t parent_loc; + int (*completion_cbk) (call_frame_t *frame, xlator_t *this); + call_frame_t *sh_frame; +} afr_self_heal_t; + + +typedef enum { + AFR_DATA_TRANSACTION, /* truncate, write, ... */ + AFR_METADATA_TRANSACTION, /* chmod, chown, ... */ + AFR_ENTRY_TRANSACTION, /* create, rmdir, ... */ + AFR_ENTRY_RENAME_TRANSACTION, /* rename */ + AFR_FLUSH_TRANSACTION, /* flush */ +} afr_transaction_type; + +typedef struct _afr_local { + unsigned int call_count; + unsigned int success_count; + unsigned int enoent_count; + + unsigned int need_metadata_self_heal; + unsigned int need_entry_self_heal; + unsigned int need_data_self_heal; + unsigned int govinda_gOvinda; + + unsigned int reval_child_index; + int32_t op_ret; + int32_t op_errno; + + int32_t *pending_array; + + loc_t loc; + loc_t newloc; + + fd_t *fd; + + glusterfs_fop_t fop; + + unsigned char *child_up; + int child_count; + + int32_t *child_errno; + + dict_t *xattr_req; + int open_fd_count; + /* + This struct contains the arguments for the "continuation" + (scheme-like) of fops + */ + + int op; + struct { + struct { + unsigned char buf_set; + struct statvfs buf; + } statfs; + + struct { + inode_t *inode; + struct stat buf; + dict_t *xattr; + } lookup; + + struct { + int32_t flags; + } open; + + struct { + int32_t cmd; + struct flock flock; + unsigned char *locked_nodes; + } lk; + + struct { + uint8_t *file_checksum; + uint8_t *dir_checksum; + } checksum; + + /* inode read */ + + struct { + int32_t mask; + int last_tried; /* index of the child we tried previously */ + } access; + + struct { + int last_tried; + ino_t ino; + } stat; + + struct { + int last_tried; + ino_t ino; + } fstat; + + struct { + size_t size; + int last_tried; + } readlink; + + struct { + const char *name; + int last_tried; + } getxattr; + + struct { + size_t size; + off_t offset; + int last_tried; + } readv; + + /* dir read */ + + struct { + int success_count; + int32_t op_ret; + int32_t op_errno; + } opendir; + + struct { + int32_t op_ret; + int32_t op_errno; + size_t size; + off_t offset; + + int last_tried; + } readdir; + + struct { + int32_t op_ret; + int32_t op_errno; + + size_t size; + off_t offset; + int32_t flag; + + int last_tried; + } getdents; + + /* inode write */ + + struct { + ino_t ino; + mode_t mode; + struct stat buf; + } chmod; + + struct { + ino_t ino; + mode_t mode; + struct stat buf; + } fchmod; + + struct { + ino_t ino; + uid_t uid; + gid_t gid; + struct stat buf; + } chown; + + struct { + ino_t ino; + uid_t uid; + gid_t gid; + struct stat buf; + } fchown; + + struct { + ino_t ino; + struct stat buf; + + int32_t op_ret; + + struct iovec *vector; + dict_t *refs; + int32_t count; + off_t offset; + } writev; + + struct { + ino_t ino; + off_t offset; + struct stat buf; + } truncate; + + struct { + ino_t ino; + off_t offset; + struct stat buf; + } ftruncate; + + struct { + ino_t ino; + struct timespec tv[2]; + struct stat buf; + } utimens; + + struct { + dict_t *dict; + int32_t flags; + } setxattr; + + struct { + const char *name; + } removexattr; + + /* dir write */ + + struct { + ino_t ino; + fd_t *fd; + int32_t flags; + mode_t mode; + inode_t *inode; + struct stat buf; + } create; + + struct { + ino_t ino; + dev_t dev; + mode_t mode; + inode_t *inode; + struct stat buf; + } mknod; + + struct { + ino_t ino; + int32_t mode; + inode_t *inode; + struct stat buf; + } mkdir; + + struct { + int32_t op_ret; + int32_t op_errno; + } unlink; + + struct { + int32_t op_ret; + int32_t op_errno; + } rmdir; + + struct { + ino_t ino; + struct stat buf; + } rename; + + struct { + ino_t ino; + inode_t *inode; + struct stat buf; + } link; + + struct { + ino_t ino; + inode_t *inode; + struct stat buf; + char *linkpath; + } symlink; + + struct { + int32_t flags; + dir_entry_t *entries; + int32_t count; + } setdents; + } cont; + + struct { + off_t start, len; + + unsigned char *locked_nodes; + int lock_count; + + const char *basename; + const char *new_basename; + + char *pending; + + loc_t parent_loc; + loc_t new_parent_loc; + + afr_transaction_type type; + + int success_count; + int erase_pending; + int failure_count; + + int last_tried; + int32_t *child_errno; + + call_frame_t *main_frame; + + int (*fop) (call_frame_t *frame, xlator_t *this); + + int (*done) (call_frame_t *frame, xlator_t *this); + + int (*resume) (call_frame_t *frame, xlator_t *this); + + int (*unwind) (call_frame_t *frame, xlator_t *this); + } transaction; + + afr_self_heal_t self_heal; +} afr_local_t; + +/* try alloc and if it fails, goto label */ +#define ALLOC_OR_GOTO(var, type, label) do { \ + var = CALLOC (sizeof (type), 1); \ + if (!var) { \ + gf_log (this->name, GF_LOG_ERROR, \ + "out of memory :("); \ + op_errno = ENOMEM; \ + goto label; \ + } \ + } while (0); + + +/* did a call fail due to a child failing? */ +#define child_went_down(op_ret, op_errno) (((op_ret) < 0) && \ + ((op_errno == ENOTCONN) || \ + (op_errno == EBADFD))) + +/* have we tried all children? */ +#define all_tried(i, count) ((i) == (count) - 1) + +void +afr_build_parent_loc (loc_t *parent, loc_t *child); + +int +afr_up_children_count (int child_count, unsigned char *child_up); + +int +afr_locked_nodes_count (unsigned char *locked_nodes, int child_count); + +int +afr_first_up_child (afr_private_t *priv); + +ino64_t +afr_itransform (ino64_t ino, int child_count, int child_index); + +int +afr_deitransform (ino64_t ino, int child_count); + +void +afr_local_cleanup (afr_local_t *local, xlator_t *this); + +int +afr_frame_return (call_frame_t *frame); + +#define AFR_STACK_UNWIND(frame, params ...) \ + do { \ + afr_local_t *__local = NULL; \ + xlator_t *__this = NULL; \ + __local = frame->local; \ + __this = frame->this; \ + frame->local = NULL; \ + STACK_UNWIND (frame, params); \ + afr_local_cleanup (__local, __this); \ + free (__local); \ +} while (0); + +#define AFR_STACK_DESTROY(frame) \ + do { \ + afr_local_t *__local = NULL; \ + xlator_t *__this = NULL; \ + __local = frame->local; \ + __this = frame->this; \ + frame->local = NULL; \ + STACK_DESTROY (frame->root); \ + afr_local_cleanup (__local, __this); \ + free (__local); \ +} while (0); + +/* allocate and return a string that is the basename of argument */ +static inline char * +AFR_BASENAME (const char *str) +{ + char *__tmp_str = NULL; + char *__basename_str = NULL; + __tmp_str = strdup (str); + __basename_str = strdup (basename (__tmp_str)); + FREE (__tmp_str); + return __basename_str; +} + +/* initialize local_t */ +static inline int +AFR_LOCAL_INIT (afr_local_t *local, afr_private_t *priv) +{ + local->child_up = CALLOC (sizeof (*local->child_up), + priv->child_count); + if (!local->child_up) { + return -ENOMEM; + } + + memcpy (local->child_up, priv->child_up, + sizeof (*local->child_up) * priv->child_count); + + + local->call_count = afr_up_children_count (priv->child_count, local->child_up); + if (local->call_count == 0) + return -ENOTCONN; + + local->transaction.erase_pending = 1; + + local->op_ret = -1; + local->op_errno = EUCLEAN; + + return 0; +} + + +static inline int +afr_transaction_local_init (afr_local_t *local, afr_private_t *priv) +{ + local->child_errno = CALLOC (sizeof (*local->child_errno), + priv->child_count); + if (!local->child_errno) { + return -ENOMEM; + } + + local->pending_array = CALLOC (sizeof (*local->pending_array), + priv->child_count); + if (!local->pending_array) { + return -ENOMEM; + } + + local->transaction.locked_nodes = CALLOC (sizeof (*local->transaction.locked_nodes), + priv->child_count); + + local->transaction.child_errno = CALLOC (sizeof (*local->transaction.child_errno), + priv->child_count); + + return 0; +} + +#endif /* __AFR_H__ */ diff --git a/xlators/cluster/dht/Makefile.am b/xlators/cluster/dht/Makefile.am new file mode 100644 index 000000000..f963effea --- /dev/null +++ b/xlators/cluster/dht/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = src
\ No newline at end of file diff --git a/xlators/cluster/dht/src/Makefile.am b/xlators/cluster/dht/src/Makefile.am new file mode 100644 index 000000000..b7d07d137 --- /dev/null +++ b/xlators/cluster/dht/src/Makefile.am @@ -0,0 +1,30 @@ + +xlator_LTLIBRARIES = dht.la nufa.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster + + +dht_common_source = dht-layout.c dht-helper.c dht-linkfile.c \ + dht-selfheal.c dht-rename.c dht-hashfn.c dht-hashfn-tea.c + +dht_la_SOURCES = $(dht_common_source) dht.c + +nufa_la_SOURCES = $(dht_common_source) nufa.c + +dht_la_LDFLAGS = -module -avoidversion +dht_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +nufa_la_LDFLAGS = -module -avoidversion +nufa_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = dht-common.h dht-common.c + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) + +CLEANFILES = + +uninstall-local: + rm -f $(DESTDIR)$(xlatordir)/distribute.so + +install-data-hook: + ln -sf dht.so $(DESTDIR)$(xlatordir)/distribute.so
\ No newline at end of file diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c new file mode 100644 index 000000000..5e4979e31 --- /dev/null +++ b/xlators/cluster/dht/src/dht-common.c @@ -0,0 +1,3470 @@ +/* + Copyright (c) 2009 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +/* TODO: add NS locking */ + +#include "glusterfs.h" +#include "xlator.h" +#include "dht-common.h" +#include "defaults.h" + + +/* TODO: + - use volumename in xattr instead of "dht" + - use NS locks + - handle all cases in self heal layout reconstruction + - complete linkfile selfheal +*/ + +int +dht_lookup_selfheal_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, + int op_ret, int op_errno) +{ + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + int ret = 0; + + local = frame->local; + ret = op_ret; + + if (ret == 0) { + layout = local->selfheal.layout; + ret = inode_ctx_put (local->inode, this, (uint64_t)(long)layout); + + if (ret == 0) + local->selfheal.layout = NULL; + + if (local->st_ino) { + local->stbuf.st_ino = local->st_ino; + } else { + gf_log (this->name, GF_LOG_WARNING, + "could not find hashed subvolume for %s", + local->loc.path); + } + } + + DHT_STACK_UNWIND (frame, ret, local->op_errno, local->inode, + &local->stbuf, local->xattr); + + return 0; +} + + +int +dht_lookup_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + inode_t *inode, struct stat *stbuf, dict_t *xattr) +{ + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + int this_call_cnt = 0; + call_frame_t *prev = NULL; + dht_layout_t *layout = NULL; + int ret = 0; + int is_dir = 0; + + conf = this->private; + local = frame->local; + prev = cookie; + + layout = local->layout; + + LOCK (&frame->lock); + { + /* TODO: assert equal mode on stbuf->st_mode and + local->stbuf->st_mode + + else mkdir/chmod/chown and fix + */ + /* TODO: assert equal hash type in xattr, local->xattr */ + + /* TODO: always ensure same subvolume is in layout->list[0] */ + + ret = dht_layout_merge (this, layout, prev->this, + op_ret, op_errno, xattr); + + if (op_ret == -1) { + local->op_errno = ENOENT; + gf_log (this->name, GF_LOG_WARNING, + "lookup of %s on %s returned error (%s)", + local->loc.path, prev->this->name, + strerror (op_errno)); + + goto unlock; + } + + is_dir = check_is_dir (inode, stbuf, xattr); + if (!is_dir) + goto unlock; + + local->op_ret = 0; + if (local->xattr == NULL) + local->xattr = dict_ref (xattr); + if (local->inode == NULL) + local->inode = inode_ref (inode); + + dht_stat_merge (this, &local->stbuf, stbuf, prev->this); + + if (prev->this == local->hashed_subvol) + local->st_ino = local->stbuf.st_ino; + + } +unlock: + UNLOCK (&frame->lock); + + + this_call_cnt = dht_frame_return (frame); + + if (is_last_call (this_call_cnt)) { + if (local->op_ret == 0) { + ret = dht_layout_normalize (this, &local->loc, layout); + + local->layout = NULL; + + if (ret != 0) { + layout->gen = conf->gen; + + gf_log (this->name, GF_LOG_WARNING, + "fixing assignment on %s", + local->loc.path); + goto selfheal; + } + + inode_ctx_put (local->inode, this, (uint64_t)(long)layout); + + if (local->st_ino) { + local->stbuf.st_ino = local->st_ino; + } else { + gf_log (this->name, GF_LOG_WARNING, + "could not find hashed subvolume for %s", + local->loc.path); + } + } + + DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno, + local->inode, &local->stbuf, local->xattr); + } + + return 0; + +selfheal: + ret = dht_selfheal_directory (frame, dht_lookup_selfheal_cbk, + &local->loc, layout); + + return 0; +} + +int +dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + inode_t *inode, struct stat *stbuf, dict_t *xattr) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + call_frame_t *prev = NULL; + dht_layout_t *layout = NULL; + int ret = -1; + int is_dir = 0; + int is_linkfile = 0; + + local = frame->local; + prev = cookie; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + + if (op_errno != ENOTCONN && op_errno != ENOENT) { + gf_log (this->name, GF_LOG_WARNING, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + } + + goto unlock; + } + + if (S_IFMT & (stbuf->st_mode ^ local->inode->st_mode)) { + gf_log (this->name, GF_LOG_WARNING, + "mismatching filetypes 0%o v/s 0%o for %s", + (stbuf->st_mode & S_IFMT), + (local->inode->st_mode & S_IFMT), + local->loc.path); + + local->op_ret = -1; + local->op_errno = EINVAL; + + goto unlock; + } + + layout = dht_layout_get (this, inode); + + is_dir = check_is_dir (inode, stbuf, xattr); + is_linkfile = check_is_linkfile (inode, stbuf, xattr); + + if (is_linkfile) { + gf_log (this->name, GF_LOG_WARNING, + "linkfile found in revalidate for %s", + local->loc.path); + local->layout_mismatch = 1; + + goto unlock; + } + + if (is_dir) { + ret = dht_layout_dir_mismatch (this, layout, + prev->this, &local->loc, + xattr); + if (ret != 0) { + gf_log (this->name, GF_LOG_WARNING, + "mismatching layouts for %s", + local->loc.path); + + local->layout_mismatch = 1; + + goto unlock; + } + } + + dht_stat_merge (this, &local->stbuf, stbuf, prev->this); + + local->op_ret = 0; + local->stbuf.st_ino = local->st_ino; + + if (!local->xattr) + local->xattr = dict_ref (xattr); + } +unlock: + UNLOCK (&frame->lock); + + this_call_cnt = dht_frame_return (frame); + + if (is_last_call (this_call_cnt)) { + if (!S_ISDIR (local->stbuf.st_mode) + && (local->hashed_subvol != local->cached_subvol) + && (local->stbuf.st_nlink == 1)) + local->stbuf.st_mode |= S_ISVTX; + + if (local->layout_mismatch) { + local->op_ret = -1; + local->op_errno = ESTALE; + } + + DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno, + local->inode, &local->stbuf, local->xattr); + } + + return 0; +} + + +int +dht_lookup_linkfile_create_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *stbuf) +{ + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + xlator_t *cached_subvol = NULL; + + local = frame->local; + cached_subvol = local->cached_subvol; + + layout = dht_layout_for_subvol (this, local->cached_subvol); + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "no pre-set layout for subvolume %s", + cached_subvol ? cached_subvol->name : "<nil>"); + local->op_ret = -1; + local->op_errno = EINVAL; + goto unwind; + } + + inode_ctx_put (local->inode, this, (uint64_t)(long)layout); + local->op_ret = 0; + if (local->stbuf.st_nlink == 1) + local->stbuf.st_mode |= S_ISVTX; + +unwind: + DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno, + local->inode, &local->stbuf, local->xattr); + return 0; +} + + +int +dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *buf, dict_t *xattr) +{ + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + int this_call_cnt = 0; + call_frame_t *prev = NULL; + int is_linkfile = 0; + int is_dir = 0; + xlator_t *subvol = NULL; + loc_t *loc = NULL; + xlator_t *link_subvol = NULL; + xlator_t *hashed_subvol = NULL; + xlator_t *cached_subvol = NULL; + + conf = this->private; + + local = frame->local; + loc = &local->loc; + + prev = cookie; + subvol = prev->this; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + if (op_errno != ENOENT) + local->op_errno = op_errno; + goto unlock; + } + + is_linkfile = check_is_linkfile (inode, buf, xattr); + is_dir = check_is_dir (inode, buf, xattr); + + if (is_linkfile) { + link_subvol = dht_linkfile_subvol (this, inode, buf, + xattr); + gf_log (this->name, GF_LOG_DEBUG, + "found on %s linkfile %s (-> %s)", + subvol->name, loc->path, + link_subvol ? link_subvol->name : "''"); + goto unlock; + } else { + gf_log (this->name, GF_LOG_DEBUG, + "found on %s file %s", + subvol->name, loc->path); + } + + if (!local->cached_subvol) { + /* found one file */ + dht_stat_merge (this, &local->stbuf, buf, subvol); + local->xattr = dict_ref (xattr); + local->cached_subvol = subvol; + } else { + gf_log (this->name, GF_LOG_WARNING, + "multiple subvolumes (%s and %s atleast) have " + "file %s", local->cached_subvol->name, + subvol->name, local->loc.path); + } + } +unlock: + UNLOCK (&frame->lock); + + if (is_linkfile) { + gf_log (this->name, GF_LOG_WARNING, + "deleting stale linkfile %s on %s", + loc->path, subvol->name); + dht_linkfile_unlink (frame, this, subvol, loc); + } + + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) { + hashed_subvol = local->hashed_subvol; + cached_subvol = local->cached_subvol; + + if (!cached_subvol) { + DHT_STACK_UNWIND (frame, -1, ENOENT, NULL, NULL, NULL); + return 0; + } + + gf_log (this->name, GF_LOG_WARNING, + "linking file %s existing on %s to %s (hash)", + loc->path, cached_subvol->name, hashed_subvol->name); + + dht_linkfile_create (frame, dht_lookup_linkfile_create_cbk, + cached_subvol, hashed_subvol, loc); + } + + return 0; +} + + +int +dht_lookup_everywhere (call_frame_t *frame, xlator_t *this, loc_t *loc) +{ + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + int i = 0; + int call_cnt = 0; + + conf = this->private; + local = frame->local; + + call_cnt = conf->subvolume_cnt; + local->call_cnt = call_cnt; + + if (!local->inode) + local->inode = inode_ref (loc->inode); + + for (i = 0; i < call_cnt; i++) { + STACK_WIND (frame, dht_lookup_everywhere_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->lookup, + loc, local->xattr_req); + } + + return 0; +} + + +int +dht_lookup_linkfile_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, + inode_t *inode, struct stat *stbuf, dict_t *xattr) +{ + call_frame_t *prev = NULL; + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + xlator_t *subvol = NULL; + loc_t *loc = NULL; + + prev = cookie; + subvol = prev->this; + + local = frame->local; + loc = &local->loc; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "lookup of %s on %s (following linkfile) failed (%s)", + local->loc.path, subvol->name, strerror (op_errno)); + + dht_lookup_everywhere (frame, this, loc); + return 0; + } + + /* TODO: assert type is non-dir and non-linkfile */ + + if (stbuf->st_nlink == 1) + stbuf->st_mode |= S_ISVTX; + dht_itransform (this, prev->this, stbuf->st_ino, &stbuf->st_ino); + + layout = dht_layout_for_subvol (this, prev->this); + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "no pre-set layout for subvolume %s", + prev->this->name); + op_ret = -1; + op_errno = EINVAL; + goto out; + } + + inode_ctx_put (inode, this, (uint64_t)(long)layout); + +out: + DHT_STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf, xattr); + + return 0; +} + + +int +dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + inode_t *inode, struct stat *stbuf, dict_t *xattr) +{ + dht_layout_t *layout = NULL; + char is_linkfile = 0; + char is_dir = 0; + xlator_t *subvol = NULL; + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + loc_t *loc = NULL; + int i = 0; + call_frame_t *prev = NULL; + int call_cnt = 0; + + + conf = this->private; + + prev = cookie; + local = frame->local; + loc = &local->loc; + + if (ENTRY_MISSING (op_ret, op_errno)) { + if (conf->search_unhashed) { + local->op_errno = ENOENT; + dht_lookup_everywhere (frame, this, loc); + return 0; + } + } + + if (op_ret == 0) { + is_dir = check_is_dir (inode, stbuf, xattr); + if (is_dir) { + local->inode = inode_ref (inode); + local->xattr = dict_ref (xattr); + } + } + + if (is_dir || (op_ret == -1 && op_errno == ENOTCONN)) { + call_cnt = conf->subvolume_cnt; + local->call_cnt = call_cnt; + + local->layout = dht_layout_new (this, conf->subvolume_cnt); + if (!local->layout) { + op_ret = -1; + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto out; + } + + for (i = 0; i < call_cnt; i++) { + STACK_WIND (frame, dht_lookup_dir_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->lookup, + &local->loc, local->xattr_req); + } + return 0; + } + + if (op_ret == -1) + goto out; + + is_linkfile = check_is_linkfile (inode, stbuf, xattr); + is_dir = check_is_dir (inode, stbuf, xattr); + + if (!is_dir && !is_linkfile) { + /* non-directory and not a linkfile */ + + dht_itransform (this, prev->this, stbuf->st_ino, + &stbuf->st_ino); + + layout = dht_layout_for_subvol (this, prev->this); + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "no pre-set layout for subvolume %s", + prev->this->name); + op_ret = -1; + op_errno = EINVAL; + goto out; + } + + inode_ctx_put (inode, this, (uint64_t)(long)layout); + goto out; + } + + if (is_linkfile) { + subvol = dht_linkfile_subvol (this, inode, stbuf, xattr); + + if (!subvol) { + gf_log (this->name, GF_LOG_WARNING, + "linkfile not having link subvolume. path=%s", + loc->path); + dht_lookup_everywhere (frame, this, loc); + return 0; + } + + STACK_WIND (frame, dht_lookup_linkfile_cbk, + subvol, subvol->fops->lookup, + &local->loc, local->xattr_req); + } + + return 0; + +out: + DHT_STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf, xattr); + return 0; +} + + +int +dht_lookup (call_frame_t *frame, xlator_t *this, + loc_t *loc, dict_t *xattr_req) +{ + xlator_t *subvol = NULL; + xlator_t *hashed_subvol = NULL; + xlator_t *cached_subvol = NULL; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int ret = -1; + int op_errno = -1; + dht_layout_t *layout = NULL; + int i = 0; + int call_cnt = 0; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + conf = this->private; + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + ret = loc_dup (loc, &local->loc); + if (ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "copying location failed for path=%s", + loc->path); + goto err; + } + + if (xattr_req) { + local->xattr_req = dict_ref (xattr_req); + } else { + local->xattr_req = dict_new (); + } + + hashed_subvol = dht_subvol_get_hashed (this, loc); + cached_subvol = dht_subvol_get_cached (this, loc->inode); + + local->cached_subvol = cached_subvol; + local->hashed_subvol = hashed_subvol; + + if (is_revalidate (loc)) { + layout = dht_layout_get (this, loc->inode); + + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "revalidate without cache. path=%s", + loc->path); + op_errno = EINVAL; + goto err; + } + + if (layout->gen && (layout->gen < conf->gen)) { + gf_log (this->name, GF_LOG_WARNING, + "incomplete layout failure for path=%s", + loc->path); + op_errno = EAGAIN; + goto err; + } + + local->inode = inode_ref (loc->inode); + local->st_ino = loc->inode->ino; + + local->call_cnt = layout->cnt; + call_cnt = local->call_cnt; + + /* NOTE: we don't require 'trusted.glusterfs.dht.linkto' attribute, + * revalidates directly go to the cached-subvolume. + */ + ret = dict_set_uint32 (local->xattr_req, + "trusted.glusterfs.dht", 4 * 4); + + for (i = 0; i < layout->cnt; i++) { + subvol = layout->list[i].xlator; + + STACK_WIND (frame, dht_revalidate_cbk, + subvol, subvol->fops->lookup, + loc, local->xattr_req); + + if (!--call_cnt) + break; + } + } else { + /* TODO: remove the hard-coding */ + ret = dict_set_uint32 (local->xattr_req, + "trusted.glusterfs.dht", 4 * 4); + + ret = dict_set_uint32 (local->xattr_req, + "trusted.glusterfs.dht.linkto", 256); + + if (!hashed_subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no subvolume in layout for path=%s, " + "checking on all the subvols to see if " + "it is a directory", loc->path); + call_cnt = conf->subvolume_cnt; + local->call_cnt = call_cnt; + + local->layout = dht_layout_new (this, conf->subvolume_cnt); + if (!local->layout) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + for (i = 0; i < call_cnt; i++) { + STACK_WIND (frame, dht_lookup_dir_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->lookup, + &local->loc, local->xattr_req); + } + return 0; + } + + STACK_WIND (frame, dht_lookup_cbk, + hashed_subvol, hashed_subvol->fops->lookup, + loc, local->xattr_req); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + + +int +dht_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct stat *stbuf) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + call_frame_t *prev = NULL; + + + local = frame->local; + prev = cookie; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + gf_log (this->name, GF_LOG_ERROR, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + goto unlock; + } + + dht_stat_merge (this, &local->stbuf, stbuf, prev->this); + + if (local->inode) + local->stbuf.st_ino = local->inode->ino; + local->op_ret = 0; + } +unlock: + UNLOCK (&frame->lock); + + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) + DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno, + &local->stbuf); + + return 0; +} + + +int +dht_stat (call_frame_t *frame, xlator_t *this, + loc_t *loc) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + int i = 0; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + layout = dht_layout_get (this, loc->inode); + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "no layout for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->inode = inode_ref (loc->inode); + local->call_cnt = layout->cnt; + + for (i = 0; i < layout->cnt; i++) { + subvol = layout->list[i].xlator; + + STACK_WIND (frame, dht_attr_cbk, + subvol, subvol->fops->stat, + loc); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_fstat (call_frame_t *frame, xlator_t *this, + fd_t *fd) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + int i = 0; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + layout = dht_layout_get (this, fd->inode); + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "no layout for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "local allocation failed :("); + goto err; + } + + local->inode = inode_ref (fd->inode); + local->call_cnt = layout->cnt;; + + for (i = 0; i < layout->cnt; i++) { + subvol = layout->list[i].xlator; + STACK_WIND (frame, dht_attr_cbk, + subvol, subvol->fops->fstat, + fd); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_chmod (call_frame_t *frame, xlator_t *this, + loc_t *loc, mode_t mode) +{ + dht_layout_t *layout = NULL; + dht_local_t *local = NULL; + int op_errno = -1; + int i = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + layout = dht_layout_get (this, loc->inode); + + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "no layout for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + if (!layout_is_sane (layout)) { + gf_log (this->name, GF_LOG_ERROR, + "layout is not sane for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->inode = inode_ref (loc->inode); + local->call_cnt = layout->cnt; + + for (i = 0; i < layout->cnt; i++) { + STACK_WIND (frame, dht_attr_cbk, + layout->list[i].xlator, + layout->list[i].xlator->fops->chmod, + loc, mode); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_chown (call_frame_t *frame, xlator_t *this, + loc_t *loc, uid_t uid, gid_t gid) +{ + dht_layout_t *layout = NULL; + dht_local_t *local = NULL; + int op_errno = -1; + int i = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + layout = dht_layout_get (this, loc->inode); + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "no layout for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + if (!layout_is_sane (layout)) { + gf_log (this->name, GF_LOG_ERROR, + "layout is not sane for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->inode = inode_ref (loc->inode); + local->call_cnt = layout->cnt; + + for (i = 0; i < layout->cnt; i++) { + STACK_WIND (frame, dht_attr_cbk, + layout->list[i].xlator, + layout->list[i].xlator->fops->chown, + loc, uid, gid); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_fchmod (call_frame_t *frame, xlator_t *this, + fd_t *fd, mode_t mode) +{ + dht_layout_t *layout = NULL; + dht_local_t *local = NULL; + int op_errno = -1; + int i = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + + layout = dht_layout_get (this, fd->inode); + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "no layout for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + if (!layout_is_sane (layout)) { + gf_log (this->name, GF_LOG_ERROR, + "layout is not sane for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->inode = inode_ref (fd->inode); + local->call_cnt = layout->cnt; + + for (i = 0; i < layout->cnt; i++) { + STACK_WIND (frame, dht_attr_cbk, + layout->list[i].xlator, + layout->list[i].xlator->fops->fchmod, + fd, mode); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_fchown (call_frame_t *frame, xlator_t *this, + fd_t *fd, uid_t uid, gid_t gid) +{ + dht_layout_t *layout = NULL; + dht_local_t *local = NULL; + int op_errno = -1; + int i = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + layout = dht_layout_get (this, fd->inode); + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "no layout for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + if (!layout_is_sane (layout)) { + gf_log (this->name, GF_LOG_ERROR, + "layout is not sane for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->inode = inode_ref (fd->inode); + local->call_cnt = layout->cnt; + + for (i = 0; i < layout->cnt; i++) { + STACK_WIND (frame, dht_attr_cbk, + layout->list[i].xlator, + layout->list[i].xlator->fops->fchown, + fd, uid, gid); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_utimens (call_frame_t *frame, xlator_t *this, + loc_t *loc, struct timespec tv[2]) +{ + dht_layout_t *layout = NULL; + dht_local_t *local = NULL; + int op_errno = -1; + int i = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + layout = dht_layout_get (this, loc->inode); + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "no layout for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + if (!layout_is_sane (layout)) { + gf_log (this->name, GF_LOG_ERROR, + "layout is not sane for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->inode = inode_ref (loc->inode); + local->call_cnt = layout->cnt; + + for (i = 0; i < layout->cnt; i++) { + STACK_WIND (frame, dht_attr_cbk, + layout->list[i].xlator, + layout->list[i].xlator->fops->utimens, + loc, tv); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_truncate (call_frame_t *frame, xlator_t *this, + loc_t *loc, off_t offset) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = dht_subvol_get_cached (this, loc->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->inode = inode_ref (loc->inode); + local->call_cnt = 1; + + STACK_WIND (frame, dht_attr_cbk, + subvol, subvol->fops->truncate, + loc, offset); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_ftruncate (call_frame_t *frame, xlator_t *this, + fd_t *fd, off_t offset) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + subvol = dht_subvol_get_cached (this, fd->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->inode = inode_ref (fd->inode); + local->call_cnt = 1; + + STACK_WIND (frame, dht_attr_cbk, + subvol, subvol->fops->ftruncate, + fd, offset); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_err_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + call_frame_t *prev = NULL; + + + local = frame->local; + prev = cookie; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + gf_log (this->name, GF_LOG_ERROR, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + goto unlock; + } + + local->op_ret = 0; + } +unlock: + UNLOCK (&frame->lock); + + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) + DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno); + + return 0; +} + + +int +dht_access (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t mask) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = dht_subvol_get_cached (this, loc->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->call_cnt = 1; + + STACK_WIND (frame, dht_err_cbk, + subvol, subvol->fops->access, + loc, mask); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno); + + return 0; +} + + +int +dht_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, const char *path) +{ + DHT_STACK_UNWIND (frame, op_ret, op_errno, path); + + return 0; +} + + +int +dht_readlink (call_frame_t *frame, xlator_t *this, + loc_t *loc, size_t size) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = dht_subvol_get_cached (this, loc->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, dht_readlink_cbk, + subvol, subvol->fops->readlink, + loc, size); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr) +{ + DHT_STACK_UNWIND (frame, op_ret, op_errno, xattr); + + return 0; +} + + +int +dht_getxattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *key) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = dht_subvol_get_cached (this, loc->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, dht_getxattr_cbk, + subvol, subvol->fops->getxattr, + loc, key); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_setxattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, dict_t *xattr, int flags) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = dht_subvol_get_cached (this, loc->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->call_cnt = 1; + + STACK_WIND (frame, dht_err_cbk, + subvol, subvol->fops->setxattr, + loc, xattr, flags); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_removexattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *key) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = dht_subvol_get_cached (this, loc->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->call_cnt = 1; + + STACK_WIND (frame, dht_err_cbk, + subvol, subvol->fops->removexattr, + loc, key); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, fd_t *fd) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + call_frame_t *prev = NULL; + + + local = frame->local; + prev = cookie; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + gf_log (this->name, GF_LOG_ERROR, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + goto unlock; + } + + local->op_ret = 0; + } +unlock: + UNLOCK (&frame->lock); + + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) + DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno, + local->fd); + + return 0; +} + + +int +dht_open (call_frame_t *frame, xlator_t *this, + loc_t *loc, int flags, fd_t *fd) +{ + xlator_t *subvol = NULL; + int ret = -1; + int op_errno = -1; + dht_local_t *local = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + subvol = dht_subvol_get_cached (this, fd->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->fd = fd_ref (fd); + ret = loc_dup (loc, &local->loc); + if (ret == -1) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->call_cnt = 1; + + STACK_WIND (frame, dht_fd_cbk, + subvol, subvol->fops->open, + loc, flags, fd); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + struct iovec *vector, int count, struct stat *stbuf) +{ + DHT_STACK_UNWIND (frame, op_ret, op_errno, vector, count, stbuf); + + return 0; +} + + +int +dht_readv (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t off) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + subvol = dht_subvol_get_cached (this, fd->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, dht_readv_cbk, + subvol, subvol->fops->readv, + fd, size, off); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL, 0, NULL); + + return 0; +} + + +int +dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct stat *stbuf) +{ + DHT_STACK_UNWIND (frame, op_ret, op_errno, stbuf); + + return 0; +} + + +int +dht_writev (call_frame_t *frame, xlator_t *this, + fd_t *fd, struct iovec *vector, int count, off_t off) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + subvol = dht_subvol_get_cached (this, fd->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, dht_writev_cbk, + subvol, subvol->fops->writev, + fd, vector, count, off); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL, 0); + + return 0; +} + + +int +dht_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + subvol = dht_subvol_get_cached (this, fd->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->fd = fd_ref (fd); + local->call_cnt = 1; + + STACK_WIND (frame, dht_err_cbk, + subvol, subvol->fops->flush, fd); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno); + + return 0; +} + + +int +dht_fsync (call_frame_t *frame, xlator_t *this, + fd_t *fd, int datasync) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + subvol = dht_subvol_get_cached (this, fd->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocatoin failed :("); + goto err; + } + local->call_cnt = 1; + + STACK_WIND (frame, dht_err_cbk, + subvol, subvol->fops->fsync, + fd, datasync); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno); + + return 0; +} + + +int +dht_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct flock *flock) +{ + DHT_STACK_UNWIND (frame, op_ret, op_errno, flock); + + return 0; +} + + +int +dht_lk (call_frame_t *frame, xlator_t *this, + fd_t *fd, int cmd, struct flock *flock) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + subvol = dht_subvol_get_cached (this, fd->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, dht_lk_cbk, + subvol, subvol->fops->lk, + fd, cmd, flock); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + +/* gf_lk no longer exists +int +dht_gf_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct flock *flock) +{ + DHT_STACK_UNWIND (frame, op_ret, op_errno, flock); + + return 0; +} + + +int +dht_gf_lk (call_frame_t *frame, xlator_t *this, + loc_t *loc, int cmd, struct flock *flock) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + subvol = dht_subvol_get_cached (this, fd->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, dht_gf_lk_cbk, + subvol, subvol->fops->gf_lk, + fd, cmd, flock); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} +*/ + +int +dht_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct statvfs *statvfs) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + goto unlock; + } + local->op_ret = 0; + + /* TODO: normalize sizes */ + local->statvfs.f_bsize = statvfs->f_bsize; + local->statvfs.f_frsize = statvfs->f_frsize; + + local->statvfs.f_blocks += statvfs->f_blocks; + local->statvfs.f_bfree += statvfs->f_bfree; + local->statvfs.f_bavail += statvfs->f_bavail; + local->statvfs.f_files += statvfs->f_files; + local->statvfs.f_ffree += statvfs->f_ffree; + local->statvfs.f_favail += statvfs->f_favail; + local->statvfs.f_fsid = statvfs->f_fsid; + local->statvfs.f_flag = statvfs->f_flag; + local->statvfs.f_namemax = statvfs->f_namemax; + + } +unlock: + UNLOCK (&frame->lock); + + + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) + DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno, + &local->statvfs); + + return 0; +} + + +int +dht_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc) +{ + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int op_errno = -1; + int i = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + conf = this->private; + + local = dht_local_init (frame); + local->call_cnt = conf->subvolume_cnt; + + for (i = 0; i < conf->subvolume_cnt; i++) { + STACK_WIND (frame, dht_statfs_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->statfs, loc); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd) +{ + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int ret = -1; + int op_errno = -1; + int i = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + conf = this->private; + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->fd = fd_ref (fd); + ret = loc_dup (loc, &local->loc); + if (ret == -1) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->call_cnt = conf->subvolume_cnt; + + for (i = 0; i < conf->subvolume_cnt; i++) { + STACK_WIND (frame, dht_fd_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->opendir, + loc, fd); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, gf_dirent_t *orig_entries) +{ + dht_local_t *local = NULL; + gf_dirent_t entries; + gf_dirent_t *orig_entry = NULL; + gf_dirent_t *entry = NULL; + call_frame_t *prev = NULL; + xlator_t *subvol = NULL; + xlator_t *next = NULL; + dht_layout_t *layout = NULL; + int count = 0; + + + INIT_LIST_HEAD (&entries.list); + prev = cookie; + local = frame->local; + + if (op_ret < 0) + goto done; + + layout = dht_layout_get (this, local->fd->inode); + + list_for_each_entry (orig_entry, &orig_entries->list, list) { + subvol = dht_layout_search (this, layout, orig_entry->d_name); + + if (!subvol || subvol == prev->this) { + entry = gf_dirent_for_name (orig_entry->d_name); + if (!entry) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto unwind; + } + + dht_itransform (this, subvol, orig_entry->d_ino, + &entry->d_ino); + dht_itransform (this, subvol, orig_entry->d_off, + &entry->d_off); + + entry->d_type = orig_entry->d_type; + entry->d_len = orig_entry->d_len; + + list_add_tail (&entry->list, &entries.list); + count++; + } + } + op_ret = count; + +done: + if (count == 0) { + next = dht_subvol_next (this, prev->this); + if (!next) { + goto unwind; + } + + STACK_WIND (frame, dht_readdir_cbk, + next, next->fops->readdir, + local->fd, local->size, 0); + return 0; + } + +unwind: + if (op_ret < 0) + op_ret = 0; + + DHT_STACK_UNWIND (frame, op_ret, op_errno, &entries); + + gf_dirent_free (&entries); + + return 0; +} + + +int +dht_readdir (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t yoff) +{ + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int op_errno = -1; + xlator_t *xvol = NULL; + off_t xoff = 0; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + conf = this->private; + + local = dht_local_init (frame); + if (!local) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + op_errno = ENOMEM; + goto err; + } + + local->fd = fd_ref (fd); + local->size = size; + + dht_deitransform (this, yoff, &xvol, (uint64_t *)&xoff); + + /* TODO: do proper readdir */ + STACK_WIND (frame, dht_readdir_cbk, + xvol, xvol->fops->readdir, + fd, size, xoff); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == -1) + local->op_errno = op_errno; + + if (op_ret == 0) + local->op_ret = 0; + } + UNLOCK (&frame->lock); + + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) + DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno); + + return 0; +} + + +int +dht_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync) +{ + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int op_errno = -1; + int i = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + conf = this->private; + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->fd = fd_ref (fd); + local->call_cnt = conf->subvolume_cnt; + + for (i = 0; i < conf->subvolume_cnt; i++) { + STACK_WIND (frame, dht_fsyncdir_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->fsyncdir, + fd, datasync); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno); + + return 0; +} + + +int +dht_newfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + inode_t *inode, struct stat *stbuf) +{ + call_frame_t *prev = NULL; + dht_layout_t *layout = NULL; + int ret = -1; + + + if (op_ret == -1) + goto out; + + prev = cookie; + + dht_itransform (this, prev->this, stbuf->st_ino, &stbuf->st_ino); + layout = dht_layout_for_subvol (this, prev->this); + + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "no pre-set layout for subvolume %s", + prev->this->name); + op_ret = -1; + op_errno = EINVAL; + goto out; + } + + ret = inode_ctx_put (inode, this, (uint64_t)(long)layout); + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "could not set inode context"); + op_ret = -1; + op_errno = EINVAL; + goto out; + } + +out: + DHT_STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf); + return 0; +} + + +int +dht_mknod (call_frame_t *frame, xlator_t *this, + loc_t *loc, mode_t mode, dev_t rdev) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + + subvol = dht_subvol_get_hashed (this, loc); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no subvolume in layout for path=%s", + loc->path); + op_errno = ENOENT; + goto err; + } + + gf_log (this->name, GF_LOG_DEBUG, + "creating %s on %s", loc->path, subvol->name); + + STACK_WIND (frame, dht_newfile_cbk, + subvol, subvol->fops->mknod, + loc, mode, rdev); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + + +int +dht_symlink (call_frame_t *frame, xlator_t *this, + const char *linkname, loc_t *loc) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + + subvol = dht_subvol_get_hashed (this, loc); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no subvolume in layout for path=%s", + loc->path); + op_errno = ENOENT; + goto err; + } + + gf_log (this->name, GF_LOG_DEBUG, + "creating %s on %s", loc->path, subvol->name); + + STACK_WIND (frame, dht_newfile_cbk, + subvol, subvol->fops->symlink, + linkname, loc); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + + +int +dht_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc) +{ + xlator_t *cached_subvol = NULL; + xlator_t *hashed_subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + + cached_subvol = dht_subvol_get_cached (this, loc->inode); + if (!cached_subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + hashed_subvol = dht_subvol_get_hashed (this, loc); + if (!hashed_subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no subvolume in layout for path=%s", + loc->path); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->call_cnt = 1; + if (hashed_subvol != cached_subvol) + local->call_cnt++; + + STACK_WIND (frame, dht_err_cbk, + cached_subvol, cached_subvol->fops->unlink, loc); + + if (hashed_subvol != cached_subvol) + STACK_WIND (frame, dht_err_cbk, + hashed_subvol, hashed_subvol->fops->unlink, loc); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno); + + return 0; +} + + +int +dht_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + inode_t *inode, struct stat *stbuf) +{ + call_frame_t *prev = NULL; + dht_layout_t *layout = NULL; + dht_local_t *local = NULL; + + prev = cookie; + local = frame->local; + + if (op_ret == -1) + goto out; + + layout = dht_layout_for_subvol (this, prev->this); + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "no pre-set layout for subvolume %s", + prev->this->name); + op_ret = -1; + op_errno = EINVAL; + goto out; + } + + stbuf->st_ino = local->loc.inode->ino; + +out: + DHT_STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf); + + return 0; +} + + +int +dht_link_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + inode_t *inode, struct stat *stbuf) +{ + dht_local_t *local = NULL; + xlator_t *srcvol = NULL; + + + if (op_ret == -1) + goto err; + + local = frame->local; + srcvol = local->linkfile.srcvol; + + STACK_WIND (frame, dht_link_cbk, + srcvol, srcvol->fops->link, + &local->loc, &local->loc2); + + return 0; + +err: + DHT_STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf); + + return 0; +} + + +int +dht_link (call_frame_t *frame, xlator_t *this, + loc_t *oldloc, loc_t *newloc) +{ + xlator_t *cached_subvol = NULL; + xlator_t *hashed_subvol = NULL; + int op_errno = -1; + int ret = -1; + dht_local_t *local = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (oldloc, err); + VALIDATE_OR_GOTO (newloc, err); + + cached_subvol = dht_subvol_get_cached (this, oldloc->inode); + if (!cached_subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for path=%s", oldloc->path); + op_errno = EINVAL; + goto err; + } + + hashed_subvol = dht_subvol_get_hashed (this, newloc); + if (!hashed_subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no subvolume in layout for path=%s", + newloc->path); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + ret = loc_copy (&local->loc, oldloc); + if (ret == -1) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + ret = loc_copy (&local->loc2, newloc); + if (ret == -1) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + if (hashed_subvol != cached_subvol) { + dht_linkfile_create (frame, dht_link_linkfile_cbk, + cached_subvol, hashed_subvol, newloc); + } else { + STACK_WIND (frame, dht_link_cbk, + cached_subvol, cached_subvol->fops->link, + oldloc, newloc); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + + +int +dht_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + fd_t *fd, inode_t *inode, struct stat *stbuf) +{ + call_frame_t *prev = NULL; + dht_layout_t *layout = NULL; + int ret = -1; + + + if (op_ret == -1) + goto out; + + prev = cookie; + + dht_itransform (this, prev->this, stbuf->st_ino, &stbuf->st_ino); + layout = dht_layout_for_subvol (this, prev->this); + + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "no pre-set layout for subvolume %s", + prev->this->name); + op_ret = -1; + op_errno = EINVAL; + goto out; + } + + ret = inode_ctx_put (inode, this, (uint64_t)(long)layout); + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "could not set inode context"); + op_ret = -1; + op_errno = EINVAL; + goto out; + } + +out: + DHT_STACK_UNWIND (frame, op_ret, op_errno, fd, inode, stbuf); + return 0; +} + + +int +dht_create (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t flags, mode_t mode, fd_t *fd) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + + subvol = dht_subvol_get_hashed (this, loc); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no subvolume in layout for path=%s", + loc->path); + op_errno = ENOENT; + goto err; + } + + gf_log (this->name, GF_LOG_DEBUG, + "creating %s on %s", loc->path, subvol->name); + + STACK_WIND (frame, dht_create_cbk, + subvol, subvol->fops->create, + loc, flags, mode, fd); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} + + +int +dht_mkdir_selfheal_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + + + local = frame->local; + layout = local->selfheal.layout; + + if (op_ret == 0) { + inode_ctx_put (local->inode, this, (uint64_t)(long)layout); + local->selfheal.layout = NULL; + local->stbuf.st_ino = local->st_ino; + } + + DHT_STACK_UNWIND (frame, op_ret, op_errno, + local->inode, &local->stbuf); + + return 0; +} + + +int +dht_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, struct stat *stbuf) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + int ret = -1; + call_frame_t *prev = NULL; + dht_layout_t *layout = NULL; + + local = frame->local; + prev = cookie; + layout = local->layout; + + LOCK (&frame->lock); + { + ret = dht_layout_merge (this, layout, prev->this, + op_ret, op_errno, NULL); + + if (op_ret == -1) { + local->op_errno = op_errno; + goto unlock; + } + dht_stat_merge (this, &local->stbuf, stbuf, prev->this); + } +unlock: + UNLOCK (&frame->lock); + + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) { + local->layout = NULL; + dht_selfheal_directory (frame, dht_mkdir_selfheal_cbk, + &local->loc, layout); + } + + return 0; +} + +int +dht_mkdir_hashed_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, + inode_t *inode, struct stat *stbuf) +{ + dht_local_t *local = NULL; + int ret = -1; + call_frame_t *prev = NULL; + dht_layout_t *layout = NULL; + dht_conf_t *conf = NULL; + int i = 0; + xlator_t *hashed_subvol = NULL; + + local = frame->local; + prev = cookie; + layout = local->layout; + conf = this->private; + hashed_subvol = local->hashed_subvol; + + ret = dht_layout_merge (this, layout, prev->this, + op_ret, op_errno, NULL); + + if (op_ret == -1) { + local->op_errno = op_errno; + goto err; + } + local->op_ret = 0; + + dht_stat_merge (this, &local->stbuf, stbuf, prev->this); + + local->st_ino = local->stbuf.st_ino; + + local->call_cnt = conf->subvolume_cnt - 1; + + if (local->call_cnt == 0) { + local->layout = NULL; + dht_selfheal_directory (frame, dht_mkdir_selfheal_cbk, + &local->loc, layout); + } + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvolumes[i] == hashed_subvol) + continue; + STACK_WIND (frame, dht_mkdir_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->mkdir, + &local->loc, local->mode); + } + return 0; +err: + DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + return 0; +} + +int +dht_mkdir (call_frame_t *frame, xlator_t *this, + loc_t *loc, mode_t mode) +{ + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int op_errno = -1; + int ret = -1; + xlator_t *hashed_subvol = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + conf = this->private; + + local = dht_local_init (frame); + if (!local) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + op_errno = ENOMEM; + goto err; + } + + hashed_subvol = dht_subvol_get_hashed (this, loc); + + if (hashed_subvol == NULL) { + gf_log (this->name, GF_LOG_ERROR, + "hashed subvol not found"); + op_errno = EINVAL; + goto err; + } + + local->hashed_subvol = hashed_subvol; + local->inode = inode_ref (loc->inode); + ret = loc_copy (&local->loc, loc); + local->mode = mode; + + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + op_errno = ENOMEM; + goto err; + } + + local->layout = dht_layout_new (this, conf->subvolume_cnt); + if (!local->layout) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + op_errno = ENOMEM; + goto err; + } + + STACK_WIND (frame, dht_mkdir_hashed_cbk, + hashed_subvol, + hashed_subvol->fops->mkdir, + loc, mode); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + + +int +dht_rmdir_selfheal_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno) +{ + dht_local_t *local = NULL; + + local = frame->local; + local->layout = NULL; + + DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno); + + return 0; +} + + +int +dht_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno) +{ + uint64_t tmp_layout = 0; + dht_local_t *local = NULL; + int this_call_cnt = 0; + call_frame_t *prev = NULL; + dht_layout_t *layout = NULL; + + local = frame->local; + prev = cookie; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + local->op_ret = -1; + + if (op_errno != ENOENT) + local->need_selfheal = 1; + + gf_log (this->name, GF_LOG_ERROR, + "rmdir on %s for %s failed (%s)", + prev->this->name, local->loc.path, + strerror (op_errno)); + goto unlock; + } + } +unlock: + UNLOCK (&frame->lock); + + + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) { + if (local->need_selfheal) { + inode_ctx_get (local->loc.inode, this, + &tmp_layout); + layout = (dht_layout_t *)(long)tmp_layout; + + /* TODO: neater interface needed below */ + local->stbuf.st_mode = local->loc.inode->st_mode; + + dht_selfheal_restore (frame, dht_rmdir_selfheal_cbk, + &local->loc, layout); + } else { + DHT_STACK_UNWIND (frame, local->op_ret, + local->op_errno); + } + } + + return 0; +} + + +int +dht_rmdir_do (call_frame_t *frame, xlator_t *this) +{ + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int i = 0; + + conf = this->private; + local = frame->local; + + if (local->op_ret == -1) + goto err; + + local->call_cnt = conf->subvolume_cnt; + + for (i = 0; i < conf->subvolume_cnt; i++) { + STACK_WIND (frame, dht_rmdir_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->rmdir, + &local->loc); + } + + return 0; + +err: + DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno); + return 0; +} + + +int +dht_rmdir_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, gf_dirent_t *entries) +{ + dht_local_t *local = NULL; + int this_call_cnt = -1; + call_frame_t *prev = NULL; + + local = frame->local; + prev = cookie; + + if (op_ret > 2) { + gf_log (this->name, GF_LOG_DEBUG, + "readdir on %s for %s returned %d entries", + prev->this->name, local->loc.path, op_ret); + local->op_ret = -1; + local->op_errno = ENOTEMPTY; + } + + this_call_cnt = dht_frame_return (frame); + + if (is_last_call (this_call_cnt)) { + dht_rmdir_do (frame, this); + } + + return 0; +} + + +int +dht_rmdir_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, fd_t *fd) +{ + dht_local_t *local = NULL; + int this_call_cnt = -1; + call_frame_t *prev = NULL; + + + local = frame->local; + prev = cookie; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "opendir on %s for %s failed (%s)", + prev->this->name, local->loc.path, + strerror (op_errno)); + goto err; + } + + STACK_WIND (frame, dht_rmdir_readdir_cbk, + prev->this, prev->this->fops->readdir, + local->fd, 4096, 0); + + return 0; + +err: + this_call_cnt = dht_frame_return (frame); + + if (is_last_call (this_call_cnt)) { + dht_rmdir_do (frame, this); + } + + return 0; +} + + +int +dht_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc) +{ + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int op_errno = -1; + int i = -1; + int ret = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + conf = this->private; + + local = dht_local_init (frame); + if (!local) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + op_errno = ENOMEM; + goto err; + } + + local->call_cnt = conf->subvolume_cnt; + local->op_ret = 0; + + ret = loc_copy (&local->loc, loc); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + op_errno = ENOMEM; + goto err; + } + + local->fd = fd_create (local->loc.inode, frame->root->pid); + if (!local->fd) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + op_errno = ENOMEM; + goto err; + } + + for (i = 0; i < conf->subvolume_cnt; i++) { + STACK_WIND (frame, dht_rmdir_opendir_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->opendir, + loc, local->fd); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno); + + return 0; +} + + +static int32_t +dht_xattrop_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *dict) +{ + DHT_STACK_UNWIND (frame, op_ret, op_errno, dict); + return 0; +} + +int32_t +dht_xattrop (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + gf_xattrop_flags_t flags, + dict_t *dict) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = dht_subvol_get_cached (this, loc->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->inode = inode_ref (loc->inode); + local->call_cnt = 1; + + STACK_WIND (frame, + dht_xattrop_cbk, + subvol, subvol->fops->xattrop, + loc, flags, dict); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + +static int32_t +dht_fxattrop_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *dict) +{ + DHT_STACK_UNWIND (frame, op_ret, op_errno, dict); + return 0; +} + +int32_t +dht_fxattrop (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + gf_xattrop_flags_t flags, + dict_t *dict) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + subvol = dht_subvol_get_cached (this, fd->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + dht_fxattrop_cbk, + subvol, subvol->fops->fxattrop, + fd, flags, dict); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +static int32_t +dht_inodelk_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) + +{ + DHT_STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + + +int32_t +dht_inodelk (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t cmd, struct flock *lock) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = dht_subvol_get_cached (this, loc->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->inode = inode_ref (loc->inode); + local->call_cnt = 1; + + STACK_WIND (frame, + dht_inodelk_cbk, + subvol, subvol->fops->inodelk, + loc, cmd, lock); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno); + + return 0; +} + + +static int32_t +dht_finodelk_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) + +{ + DHT_STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + + +int32_t +dht_finodelk (call_frame_t *frame, xlator_t *this, + fd_t *fd, int32_t cmd, struct flock *lock) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + subvol = dht_subvol_get_cached (this, fd->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + + STACK_WIND (frame, + dht_finodelk_cbk, + subvol, subvol->fops->finodelk, + fd, cmd, lock); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno); + + return 0; +} + + +static int32_t +dht_entrylk_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) + +{ + DHT_STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + +int32_t +dht_entrylk (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *basename, + entrylk_cmd cmd, entrylk_type type) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = dht_subvol_get_cached (this, loc->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->inode = inode_ref (loc->inode); + local->call_cnt = 1; + + STACK_WIND (frame, dht_entrylk_cbk, + subvol, subvol->fops->entrylk, + loc, basename, cmd, type); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno); + + return 0; +} + +static int32_t +dht_fentrylk_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) + +{ + DHT_STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + +int32_t +dht_fentrylk (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *basename, + entrylk_cmd cmd, entrylk_type type) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + subvol = dht_subvol_get_cached (this, fd->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, dht_fentrylk_cbk, + subvol, subvol->fops->fentrylk, + fd, basename, cmd, type); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno); + + return 0; +} + + +int +dht_forget (xlator_t *this, inode_t *inode) +{ + uint64_t tmp_layout = 0; + dht_layout_t *layout = NULL; + + inode_ctx_get (inode, this, &tmp_layout); + + if (!layout) + return 0; + layout = (dht_layout_t *)(long)tmp_layout; + if (!layout->preset) + FREE (layout); + + return 0; +} + + + +static int +dht_init_subvolumes (xlator_t *this, dht_conf_t *conf) +{ + xlator_list_t *subvols = NULL; + int cnt = 0; + + + for (subvols = this->children; subvols; subvols = subvols->next) + cnt++; + + conf->subvolumes = CALLOC (cnt, sizeof (xlator_t *)); + if (!conf->subvolumes) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + return -1; + } + conf->subvolume_cnt = cnt; + + cnt = 0; + for (subvols = this->children; subvols; subvols = subvols->next) + conf->subvolumes[cnt++] = subvols->xlator; + + conf->subvolume_status = CALLOC (cnt, sizeof (char)); + if (!conf->subvolume_status) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + return -1; + } + + return 0; +} + + +int +dht_notify (xlator_t *this, int event, void *data, ...) +{ + xlator_t *subvol = NULL; + int cnt = -1; + int i = -1; + dht_conf_t *conf = NULL; + int ret = -1; + + + conf = this->private; + + switch (event) { + case GF_EVENT_CHILD_UP: + subvol = data; + + conf->gen++; + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (subvol == conf->subvolumes[i]) { + cnt = i; + break; + } + } + + if (cnt == -1) { + gf_log (this->name, GF_LOG_ERROR, + "got GF_EVENT_CHILD_UP bad subvolume %s", + subvol->name); + break; + } + + LOCK (&conf->subvolume_lock); + { + conf->subvolume_status[cnt] = 1; + } + UNLOCK (&conf->subvolume_lock); + + break; + + case GF_EVENT_CHILD_DOWN: + subvol = data; + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (subvol == conf->subvolumes[i]) { + cnt = i; + break; + } + } + + if (cnt == -1) { + gf_log (this->name, GF_LOG_ERROR, + "got GF_EVENT_CHILD_DOWN bad subvolume %s", + subvol->name); + break; + } + + LOCK (&conf->subvolume_lock); + { + conf->subvolume_status[cnt] = 0; + } + UNLOCK (&conf->subvolume_lock); + + break; + } + + ret = default_notify (this, event, data); + + return ret; +} + diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h new file mode 100644 index 000000000..17017381b --- /dev/null +++ b/xlators/cluster/dht/src/dht-common.h @@ -0,0 +1,212 @@ +/* + Copyright (c) 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#ifndef _DHT_H +#define _DHT_H + + +typedef int (*dht_selfheal_dir_cbk_t) (call_frame_t *frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno); + + +struct dht_layout { + int cnt; + int preset; + int gen; + int type; + struct { + int err; /* 0 = normal + -1 = dir exists and no xattr + >0 = dir lookup failed with errno + */ + uint32_t start; + uint32_t stop; + xlator_t *xlator; + } list[0]; +}; +typedef struct dht_layout dht_layout_t; + + +struct dht_local { + int call_cnt; + loc_t loc; + loc_t loc2; + int op_ret; + int op_errno; + int layout_mismatch; + struct stat stbuf; + struct statvfs statvfs; + fd_t *fd; + inode_t *inode; + dict_t *xattr; + dict_t *xattr_req; + dht_layout_t *layout; + size_t size; + ino_t st_ino; + xlator_t *src_hashed, *src_cached; + xlator_t *dst_hashed, *dst_cached; + xlator_t *cached_subvol; + xlator_t *hashed_subvol; + char need_selfheal; + struct { + fop_mknod_cbk_t linkfile_cbk; + struct stat stbuf; + loc_t loc; + inode_t *inode; + dict_t *xattr; + xlator_t *srcvol; + } linkfile; + struct { + uint32_t hole_cnt; + uint32_t overlaps_cnt; + uint32_t missing; + uint32_t down; + uint32_t misc; + dht_selfheal_dir_cbk_t dir_cbk; + dht_layout_t *layout; + } selfheal; + + /* needed by nufa */ + int32_t flags; + mode_t mode; + dev_t rdev; +}; +typedef struct dht_local dht_local_t; + + +struct dht_conf { + gf_lock_t subvolume_lock; + int subvolume_cnt; + xlator_t **subvolumes; + xlator_t *local_volume; /* Needed by NUFA */ + char *subvolume_status; + dht_layout_t **file_layouts; + dht_layout_t **dir_layouts; + dht_layout_t *default_dir_layout; + gf_boolean_t search_unhashed; + int gen; +}; +typedef struct dht_conf dht_conf_t; + + +struct dht_disk_layout { + uint32_t cnt; + uint32_t type; + struct { + uint32_t start; + uint32_t stop; + } list[1]; +}; +typedef struct dht_disk_layout dht_disk_layout_t; + +#define ENTRY_MISSING(op_ret, op_errno) (op_ret == -1 && op_errno == ENOENT) + +#define is_fs_root(loc) (strcmp (loc->path, "/") == 0) + +#define is_revalidate(loc) (inode_ctx_get (loc->inode, this, NULL) == 0) + +#define is_last_call(cnt) (cnt == 0) + +#define DHT_LINKFILE_MODE (S_ISVTX) +#define check_is_linkfile(i,s,x) ((s->st_mode & ~S_IFMT) == DHT_LINKFILE_MODE) + +#define check_is_dir(i,s,x) (S_ISDIR(s->st_mode)) + +#define layout_is_sane(layout) ((layout) && (layout->cnt > 0)) + +#define DHT_STACK_UNWIND(frame, params ...) do { \ + dht_local_t *__local = NULL; \ + __local = frame->local; \ + frame->local = NULL; \ + STACK_UNWIND (frame, params); \ + dht_local_wipe (__local); \ + } while (0) + +#define DHT_STACK_DESTROY(frame) do { \ + dht_local_t *__local = NULL; \ + __local = frame->local; \ + frame->local = NULL; \ + STACK_DESTROY (frame->root); \ + dht_local_wipe (__local); \ + } while (0) + +dht_layout_t *dht_layout_new (xlator_t *this, int cnt); +dht_layout_t *dht_layout_get (xlator_t *this, inode_t *inode); +dht_layout_t *dht_layout_for_subvol (xlator_t *this, xlator_t *subvol); +xlator_t *dht_layout_search (xlator_t *this, dht_layout_t *layout, + const char *name); +int dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout); +int dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout, + uint32_t *holes_p, uint32_t *overlaps_p, + uint32_t *missing_p, uint32_t *down_p, + uint32_t *misc_p); +int dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, + xlator_t *subvol, loc_t *loc, dict_t *xattr); + +xlator_t *dht_linkfile_subvol (xlator_t *this, inode_t *inode, + struct stat *buf, dict_t *xattr); +int dht_linkfile_unlink (call_frame_t *frame, xlator_t *this, + xlator_t *subvol, loc_t *loc); + +int dht_layouts_init (xlator_t *this, dht_conf_t *conf); +int dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, + int op_ret, int op_errno, dict_t *xattr); + +int dht_disk_layout_extract (xlator_t *this, dht_layout_t *layout, + int pos, int32_t **disk_layout_p); +int dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout, + int pos, int32_t *disk_layout); + + +int dht_frame_return (call_frame_t *frame); + +int dht_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y); +int dht_deitransform (xlator_t *this, uint64_t y, xlator_t **subvol, + uint64_t *x); + +void dht_local_wipe (dht_local_t *local); +dht_local_t *dht_local_init (call_frame_t *frame); +int dht_stat_merge (xlator_t *this, struct stat *to, struct stat *from, + xlator_t *subvol); + +xlator_t *dht_subvol_get_hashed (xlator_t *this, loc_t *loc); +xlator_t *dht_subvol_get_cached (xlator_t *this, inode_t *inode); +xlator_t *dht_subvol_next (xlator_t *this, xlator_t *prev); +int dht_subvol_cnt (xlator_t *this, xlator_t *subvol); + +int dht_hash_compute (int type, const char *name, uint32_t *hash_p); + +int dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk, + xlator_t *tovol, xlator_t *fromvol, loc_t *loc); +int +dht_selfheal_directory (call_frame_t *frame, dht_selfheal_dir_cbk_t cbk, + loc_t *loc, dht_layout_t *layout); +int +dht_selfheal_restore (call_frame_t *frame, dht_selfheal_dir_cbk_t cbk, + loc_t *loc, dht_layout_t *layout); + +int dht_rename (call_frame_t *frame, xlator_t *this, + loc_t *oldloc, loc_t *newloc); +#endif /* _DHT_H */ diff --git a/xlators/cluster/dht/src/dht-hashfn-tea.c b/xlators/cluster/dht/src/dht-hashfn-tea.c new file mode 100644 index 000000000..8437b4955 --- /dev/null +++ b/xlators/cluster/dht/src/dht-hashfn-tea.c @@ -0,0 +1,146 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + + +#include <stdint.h> +#include <stdio.h> +#include <string.h> + + +#define DELTA 0x9E3779B9 +#define FULLROUNDS 10 /* 32 is overkill, 16 is strong crypto */ +#define PARTROUNDS 6 /* 6 gets complete mixing */ + + +static int +tearound (int rounds, uint32_t *array, uint32_t *h0, uint32_t *h1) +{ + uint32_t sum = 0; + int n = 0; + uint32_t b0 = 0; + uint32_t b1 = 0; + + b0 = *h0; + b1 = *h1; + + n = rounds; + + do { + sum += DELTA; + b0 += ((b1 << 4) + array[0]) + ^ (b1 + sum) + ^ ((b1 >> 5) + array[1]); + b1 += ((b0 << 4) + array[2]) + ^ (b0 + sum) + ^ ((b0 >> 5) + array[3]); + } while (--n); + + *h0 += b0; + *h1 += b1; + + return 0; +} + + +uint32_t +__pad (int len) +{ + uint32_t pad = 0; + + pad = (uint32_t) len | ((uint32_t) len << 8); + pad |= pad << 16; + + return pad; +} + + +uint32_t +dht_hashfn_tea (const char *msg, int len) +{ + uint32_t h0 = 0x9464a485; + uint32_t h1 = 0x542e1a94; + uint32_t array[4]; + uint32_t pad = 0; + int i = 0; + int j = 0; + int full_quads = 0; + int full_words = 0; + int full_bytes = 0; + uint32_t *intmsg = NULL; + int word = 0; + + + intmsg = (uint32_t *) msg; + pad = __pad (len); + + full_bytes = len; + full_words = len / 4; + full_quads = len / 16; + + for (i = 0; i < full_quads; i++) { + for (j = 0; j < 4; j++) { + word = *intmsg; + array[j] = word; + intmsg++; + full_words--; + full_bytes -= 4; + } + tearound (PARTROUNDS, &array[0], &h0, &h1); + } + + if ((len % 16) == 0) { + goto done; + } + + for (j = 0; j < 4; j++) { + if (full_words) { + word = *intmsg; + array[j] = word; + intmsg++; + full_words--; + full_bytes -= 4; + } else { + array[j] = pad; + while (full_bytes) { + array[j] <<= 8; + array[j] |= msg[len - full_bytes]; + full_bytes--; + } + } + } + tearound (FULLROUNDS, &array[0], &h0, &h1); + +done: + return h0 ^ h1; +} + + +#if 0 +int +main (int argc, char *argv[]) +{ + int i = 0; + int hashval = 0; + + for (i = 1; i < argc; i++) { + hashval = tea (argv[i], strlen (argv[i])); + printf ("%s: %x\n", argv[i], hashval); + } +} +#endif diff --git a/xlators/cluster/dht/src/dht-hashfn.c b/xlators/cluster/dht/src/dht-hashfn.c new file mode 100644 index 000000000..9e321a43c --- /dev/null +++ b/xlators/cluster/dht/src/dht-hashfn.c @@ -0,0 +1,88 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + + +#include "glusterfs.h" +#include "xlator.h" +#include "dht-common.h" + + +uint32_t dht_hashfn_tea (const char *name, int len); + + +typedef enum { + DHT_HASH_TYPE_TEA, +} dht_hashfn_type_t; + + +int +dht_hash_compute_internal (int type, const char *name, uint32_t *hash_p) +{ + int ret = 0; + uint32_t hash = 0; + + switch (type) { + case DHT_HASH_TYPE_TEA: + hash = dht_hashfn_tea (name, strlen (name)); + break; + default: + ret = -1; + break; + } + + if (ret == 0) { + *hash_p = hash; + } + + return ret; +} + + +#define MAKE_RSYNC_FRIENDLY_NAME(rsync_frndly_name, name) do { \ + rsync_frndly_name = (char *) name; \ + if (name[0] == '.') { \ + char *dot = 0; \ + int namelen = 0; \ + \ + dot = strrchr (name, '.'); \ + if (dot && dot > (name + 1) && *(dot + 1)) { \ + namelen = (dot - name); \ + rsync_frndly_name = alloca (namelen); \ + strncpy (rsync_frndly_name, name + 1, \ + namelen); \ + rsync_frndly_name[namelen - 1] = 0; \ + } \ + } \ + } while (0); + + +int +dht_hash_compute (int type, const char *name, uint32_t *hash_p) +{ + char *rsync_friendly_name = NULL; + + MAKE_RSYNC_FRIENDLY_NAME (rsync_friendly_name, name); + + return dht_hash_compute_internal (type, rsync_friendly_name, hash_p); +} diff --git a/xlators/cluster/dht/src/dht-helper.c b/xlators/cluster/dht/src/dht-helper.c new file mode 100644 index 000000000..52d072002 --- /dev/null +++ b/xlators/cluster/dht/src/dht-helper.c @@ -0,0 +1,326 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + + +#include "glusterfs.h" +#include "xlator.h" +#include "dht-common.h" + + +int +dht_frame_return (call_frame_t *frame) +{ + dht_local_t *local = NULL; + int this_call_cnt = -1; + + if (!frame) + return -1; + + local = frame->local; + + LOCK (&frame->lock); + { + this_call_cnt = --local->call_cnt; + } + UNLOCK (&frame->lock); + + return this_call_cnt; +} + + +int +dht_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y_p) +{ + dht_conf_t *conf = NULL; + int cnt = 0; + int max = 0; + uint64_t y = 0; + + + if (x == ((uint64_t) -1)) { + y = (uint64_t) -1; + goto out; + } + + conf = this->private; + + max = conf->subvolume_cnt; + cnt = dht_subvol_cnt (this, subvol); + + y = ((x * max) + cnt); + +out: + if (y_p) + *y_p = y; + + return 0; +} + + +int +dht_deitransform (xlator_t *this, uint64_t y, xlator_t **subvol_p, + uint64_t *x_p) +{ + dht_conf_t *conf = NULL; + int cnt = 0; + int max = 0; + uint64_t x = 0; + xlator_t *subvol = 0; + + + conf = this->private; + max = conf->subvolume_cnt; + + cnt = y % max; + x = y / max; + + subvol = conf->subvolumes[cnt]; + + if (subvol_p) + *subvol_p = subvol; + + if (x_p) + *x_p = x; + + return 0; +} + + +void +dht_local_wipe (dht_local_t *local) +{ + if (!local) + return; + + loc_wipe (&local->loc); + loc_wipe (&local->loc2); + + if (local->xattr) + dict_unref (local->xattr); + + if (local->inode) + inode_unref (local->inode); + + if (local->layout) + FREE (local->layout); + + loc_wipe (&local->linkfile.loc); + + if (local->linkfile.xattr) + dict_unref (local->linkfile.xattr); + + if (local->linkfile.inode) + inode_unref (local->linkfile.inode); + + if (local->fd) { + fd_unref (local->fd); + local->fd = NULL; + } + + if (local->xattr_req) + dict_unref (local->xattr_req); + + FREE (local); +} + + +dht_local_t * +dht_local_init (call_frame_t *frame) +{ + dht_local_t *local = NULL; + + /* TODO: use mem-pool */ + local = CALLOC (1, sizeof (*local)); + + if (!local) + return NULL; + + local->op_ret = -1; + local->op_errno = EUCLEAN; + + frame->local = local; + + return local; +} + + +char * +basestr (const char *str) +{ + char *basestr = NULL; + + basestr = strrchr (str, '/'); + if (basestr) + basestr ++; + + return basestr; +} + +xlator_t * +dht_first_up_child (xlator_t *this) +{ + dht_conf_t *conf = NULL; + xlator_t *child = NULL; + int i = 0; + + conf = this->private; + + LOCK (&conf->subvolume_lock); + { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvolume_status[i]) { + child = conf->subvolumes[i]; + break; + } + } + } + UNLOCK (&conf->subvolume_lock); + + return child; +} + +xlator_t * +dht_subvol_get_hashed (xlator_t *this, loc_t *loc) +{ + dht_layout_t *layout = NULL; + xlator_t *subvol = NULL; + + if (is_fs_root (loc)) { + subvol = dht_first_up_child (this); + goto out; + } + + layout = dht_layout_get (this, loc->parent); + + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "layout missing path=%s parent=%"PRId64, + loc->path, loc->parent->ino); + goto out; + } + + subvol = dht_layout_search (this, layout, loc->name); + + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "could not find subvolume for path=%s", + loc->path); + goto out; + } + +out: + return subvol; +} + + +xlator_t * +dht_subvol_get_cached (xlator_t *this, inode_t *inode) +{ + dht_layout_t *layout = NULL; + xlator_t *subvol = NULL; + + + layout = dht_layout_get (this, inode); + + if (!layout) { + goto out; + } + + subvol = layout->list[0].xlator; + +out: + return subvol; +} + + +xlator_t * +dht_subvol_next (xlator_t *this, xlator_t *prev) +{ + dht_conf_t *conf = NULL; + int i = 0; + xlator_t *next = NULL; + + conf = this->private; + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvolumes[i] == prev) { + if ((i + 1) < conf->subvolume_cnt) + next = conf->subvolumes[i + 1]; + break; + } + } + + return next; +} + + +int +dht_subvol_cnt (xlator_t *this, xlator_t *subvol) +{ + int i = 0; + int ret = -1; + dht_conf_t *conf = NULL; + + + conf = this->private; + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (subvol == conf->subvolumes[i]) { + ret = i; + break; + } + } + + return ret; +} + + +#define set_if_greater(a, b) do { \ + if ((a) < (b)) \ + (a) = (b); \ + } while (0) + +int +dht_stat_merge (xlator_t *this, struct stat *to, + struct stat *from, xlator_t *subvol) +{ + to->st_dev = from->st_dev; + + dht_itransform (this, subvol, from->st_ino, &to->st_ino); + + to->st_mode = from->st_mode; + to->st_nlink = from->st_nlink; + to->st_uid = from->st_uid; + to->st_gid = from->st_gid; + to->st_rdev = from->st_rdev; + to->st_size += from->st_size; + to->st_blksize = from->st_blksize; + to->st_blocks += from->st_blocks; + + set_if_greater (to->st_atime, from->st_atime); + set_if_greater (to->st_mtime, from->st_mtime); + set_if_greater (to->st_ctime, from->st_ctime); + + return 0; +} diff --git a/xlators/cluster/dht/src/dht-layout.c b/xlators/cluster/dht/src/dht-layout.c new file mode 100644 index 000000000..08b4a2746 --- /dev/null +++ b/xlators/cluster/dht/src/dht-layout.c @@ -0,0 +1,543 @@ +/* + Copyright (c) 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + + +#include "glusterfs.h" +#include "xlator.h" +#include "dht-common.h" +#include "byte-order.h" + +#define layout_base_size (sizeof (dht_layout_t)) + +#define layout_entry_size (sizeof ((dht_layout_t *)NULL)->list[0]) + +#define layout_size(cnt) (layout_base_size + (cnt * layout_entry_size)) + + +dht_layout_t * +dht_layout_new (xlator_t *this, int cnt) +{ + dht_layout_t *layout = NULL; + + + layout = CALLOC (1, layout_size (cnt)); + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto out; + } + + layout->cnt = cnt; + +out: + return layout; +} + + +dht_layout_t * +dht_layout_get (xlator_t *this, inode_t *inode) +{ + uint64_t layout = 0; + int ret = -1; + + ret = inode_ctx_get (inode, this, &layout); + + return (dht_layout_t *)(long)layout; +} + + +xlator_t * +dht_layout_search (xlator_t *this, dht_layout_t *layout, const char *name) +{ + uint32_t hash = 0; + xlator_t *subvol = NULL; + int i = 0; + int ret = 0; + + + ret = dht_hash_compute (layout->type, name, &hash); + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "hash computation failed for type=%d name=%s", + layout->type, name); + goto out; + } + + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].start <= hash + && layout->list[i].stop >= hash) { + subvol = layout->list[i].xlator; + break; + } + } + + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no subvolume for hash (value) = %u", hash); + } + +out: + return subvol; +} + + +dht_layout_t * +dht_layout_for_subvol (xlator_t *this, xlator_t *subvol) +{ + dht_conf_t *conf = NULL; + dht_layout_t *layout = NULL; + int i = 0; + + + conf = this->private; + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvolumes[i] == subvol) { + layout = conf->file_layouts[i]; + break; + } + } + + return layout; +} + + +int +dht_layouts_init (xlator_t *this, dht_conf_t *conf) +{ + dht_layout_t *layout = NULL; + int i = 0; + int ret = -1; + + + conf->file_layouts = CALLOC (conf->subvolume_cnt, + sizeof (dht_layout_t *)); + if (!conf->file_layouts) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto out; + } + + for (i = 0; i < conf->subvolume_cnt; i++) { + layout = dht_layout_new (this, 1); + + if (!layout) { + goto out; + } + + layout->preset = 1; + + layout->list[0].xlator = conf->subvolumes[i]; + + conf->file_layouts[i] = layout; + } + + ret = 0; +out: + return ret; +} + + +int +dht_disk_layout_extract (xlator_t *this, dht_layout_t *layout, + int pos, int32_t **disk_layout_p) +{ + int ret = -1; + int32_t *disk_layout = NULL; + + disk_layout = CALLOC (5, sizeof (int)); + if (!disk_layout) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto out; + } + + disk_layout[0] = hton32 (1); + disk_layout[1] = hton32 (layout->type); + disk_layout[2] = hton32 (layout->list[pos].start); + disk_layout[3] = hton32 (layout->list[pos].stop); + + if (disk_layout_p) + *disk_layout_p = disk_layout; + ret = 0; + +out: + return ret; +} + + +int +dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout, + int pos, int32_t *disk_layout) +{ + int cnt = 0; + int type = 0; + int start_off = 0; + int stop_off = 0; + + + /* TODO: assert disk_layout_ptr is of required length */ + + cnt = ntoh32 (disk_layout[0]); + if (cnt != 1) { + gf_log (this->name, GF_LOG_ERROR, + "disk layout has invalid count %d", cnt); + return -1; + } + + /* TODO: assert type is compatible */ + type = ntoh32 (disk_layout[1]); + start_off = ntoh32 (disk_layout[2]); + stop_off = ntoh32 (disk_layout[3]); + + layout->list[pos].start = start_off; + layout->list[pos].stop = stop_off; + + gf_log (this->name, GF_LOG_DEBUG, + "merged to layout: %u - %u (type %d) from %s", + start_off, stop_off, type, + layout->list[pos].xlator->name); + + return 0; +} + + +int +dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, + int op_ret, int op_errno, dict_t *xattr) +{ + int i = 0; + int ret = -1; + int err = -1; + int32_t *disk_layout = NULL; + + + if (op_ret != 0) { + err = op_errno; + } + + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].xlator == NULL) { + layout->list[i].err = err; + layout->list[i].xlator = subvol; + break; + } + } + + if (op_ret != 0) { + ret = 0; + goto out; + } + + if (xattr) { + /* during lookup and not mkdir */ + ret = dict_get_ptr (xattr, "trusted.glusterfs.dht", + VOID(&disk_layout)); + } + + if (ret != 0) { + layout->list[i].err = -1; + gf_log (this->name, GF_LOG_DEBUG, + "missing disk layout on %s. err = %d", + subvol->name, err); + ret = 0; + goto out; + } + + ret = dht_disk_layout_merge (this, layout, i, disk_layout); + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "layout merge from subvolume %s failed", + subvol->name); + goto out; + } + layout->list[i].err = 0; + +out: + return ret; +} + + +void +dht_layout_entry_swap (dht_layout_t *layout, int i, int j) +{ + uint32_t start_swap = 0; + uint32_t stop_swap = 0; + xlator_t *xlator_swap = 0; + int err_swap = 0; + + + start_swap = layout->list[i].start; + stop_swap = layout->list[i].stop; + xlator_swap = layout->list[i].xlator; + err_swap = layout->list[i].err; + + layout->list[i].start = layout->list[j].start; + layout->list[i].stop = layout->list[j].stop; + layout->list[i].xlator = layout->list[j].xlator; + layout->list[i].err = layout->list[j].err; + + layout->list[j].start = start_swap; + layout->list[j].stop = stop_swap; + layout->list[j].xlator = xlator_swap; + layout->list[j].err = err_swap; +} + + +int64_t +dht_layout_entry_cmp (dht_layout_t *layout, int i, int j) +{ + int64_t diff = 0; + + if (layout->list[i].err || layout->list[j].err) + diff = layout->list[i].err - layout->list[j].err; + else + diff = (int64_t) layout->list[i].start + - (int64_t) layout->list[j].start; + + return diff; +} + + +int +dht_layout_sort (dht_layout_t *layout) +{ + int i = 0; + int j = 0; + int64_t ret = 0; + + /* TODO: O(n^2) -- bad bad */ + + for (i = 0; i < layout->cnt - 1; i++) { + for (j = i + 1; j < layout->cnt; j++) { + ret = dht_layout_entry_cmp (layout, i, j); + if (ret > 0) + dht_layout_entry_swap (layout, i, j); + } + } + + return 0; +} + + +int +dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout, + uint32_t *holes_p, uint32_t *overlaps_p, + uint32_t *missing_p, uint32_t *down_p, uint32_t *misc_p) +{ + dht_conf_t *conf = NULL; + uint32_t holes = 0; + uint32_t overlaps = 0; + uint32_t missing = 0; + uint32_t down = 0; + uint32_t misc = 0; + uint32_t hole_cnt = 0; + uint32_t overlap_cnt = 0; + int i = 0; + int ret = 0; + uint32_t prev_stop = 0; + uint32_t last_stop = 0; + char is_virgin = 1; + + + conf = this->private; + + /* TODO: explain WTF is happening */ + + last_stop = layout->list[0].start - 1; + prev_stop = last_stop; + + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].err) { + switch (layout->list[i].err) { + case -1: + case ENOENT: + missing++; + break; + case ENOTCONN: + down++; + break; + default: + misc++; + } + continue; + } + + is_virgin = 0; + + if ((prev_stop + 1) < layout->list[i].start) { + hole_cnt++; + holes += (layout->list[i].start - (prev_stop + 1)); + } + + if ((prev_stop + 1) > layout->list[i].start) { + overlap_cnt++; + overlaps += ((prev_stop + 1) - layout->list[i].start); + } + prev_stop = layout->list[i].stop; + } + + if ((last_stop - prev_stop) || is_virgin) + hole_cnt++; + holes += (last_stop - prev_stop); + + if (holes_p) + *holes_p = hole_cnt; + + if (overlaps_p) + *overlaps_p = overlap_cnt; + + if (missing_p) + *missing_p = missing; + + if (down_p) + *down_p = down; + + if (misc_p) + *misc_p = misc; + + return ret; +} + + +int +dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout) +{ + int ret = 0; + uint32_t holes = 0; + uint32_t overlaps = 0; + uint32_t missing = 0; + uint32_t down = 0; + uint32_t misc = 0; + + + ret = dht_layout_sort (layout); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "sort failed?! how the ...."); + goto out; + } + + ret = dht_layout_anomalies (this, loc, layout, + &holes, &overlaps, + &missing, &down, &misc); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "error while finding anomalies in %s -- not good news", + loc->path); + goto out; + } + + if (holes || overlaps) { + if (missing == layout->cnt) { + gf_log (this->name, GF_LOG_WARNING, + "directory %s looked up first time", + loc->path); + } else { + gf_log (this->name, GF_LOG_ERROR, + "found anomalies in %s. holes=%d overlaps=%d", + loc->path, holes, overlaps); + } + ret = 1; + } + +out: + return ret; +} + + +int +dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, + loc_t *loc, dict_t *xattr) +{ + int idx = 0; + int pos = -1; + int ret = -1; + int32_t *disk_layout = NULL; + int32_t count = -1; + uint32_t start_off = -1; + uint32_t stop_off = -1; + + + for (idx = 0; idx < layout->cnt; idx++) { + if (layout->list[idx].xlator == subvol) { + pos = idx; + break; + } + } + + if (pos == -1) { + gf_log (this->name, GF_LOG_DEBUG, + "%s - no layout info for subvolume %s", + loc->path, subvol->name); + ret = 1; + goto out; + } + + if (xattr == NULL) { + gf_log (this->name, GF_LOG_ERROR, + "%s - xattr dictionary is NULL", + loc->path); + ret = -1; + goto out; + } + + ret = dict_get_ptr (xattr, "trusted.glusterfs.dht", + VOID(&disk_layout)); + + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "%s - disk layout missing", loc->path); + ret = -1; + goto out; + } + + count = ntoh32 (disk_layout[0]); + if (count != 1) { + gf_log (this->name, GF_LOG_ERROR, + "%s - disk layout has invalid count %d", + loc->path, count); + ret = -1; + goto out; + } + + start_off = ntoh32 (disk_layout[2]); + stop_off = ntoh32 (disk_layout[3]); + + if ((layout->list[pos].start != start_off) + || (layout->list[pos].stop != stop_off)) { + gf_log (this->name, GF_LOG_DEBUG, + "subvol: %s; inode layout - %"PRId32" - %"PRId32"; " + "disk layout - %"PRId32" - %"PRId32, + layout->list[pos].xlator->name, + layout->list[pos].start, layout->list[pos].stop, + start_off, stop_off); + ret = 1; + } else { + ret = 0; + } +out: + return ret; +} + diff --git a/xlators/cluster/dht/src/dht-linkfile.c b/xlators/cluster/dht/src/dht-linkfile.c new file mode 100644 index 000000000..9cc24ccf6 --- /dev/null +++ b/xlators/cluster/dht/src/dht-linkfile.c @@ -0,0 +1,224 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + + +#include "glusterfs.h" +#include "xlator.h" +#include "compat.h" +#include "dht-common.h" + + + +int +dht_linkfile_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno) +{ + dht_local_t *local = NULL; + + + local = frame->local; + local->linkfile.linkfile_cbk (frame, cookie, this, op_ret, op_errno, + local->linkfile.inode, + &local->linkfile.stbuf); + + return 0; +} + + +int +dht_linkfile_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + inode_t *inode, struct stat *stbuf) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + dict_t *xattr = NULL; + data_t *str_data = NULL; + int ret = -1; + + local = frame->local; + prev = cookie; + + if (op_ret == -1) + goto err; + + xattr = get_new_dict (); + if (!xattr) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + op_errno = ENOMEM; + goto err; + } + + local->linkfile.xattr = dict_ref (xattr); + local->linkfile.inode = inode_ref (inode); + + str_data = str_to_data (local->linkfile.srcvol->name); + if (!str_data) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + op_errno = ENOMEM; + goto err; + } + + ret = dict_set (xattr, "trusted.glusterfs.dht.linkto", str_data); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to initialize linkfile data"); + op_errno = EINVAL; + } + str_data = NULL; + + local->linkfile.stbuf = *stbuf; + + STACK_WIND (frame, dht_linkfile_xattr_cbk, + prev->this, prev->this->fops->setxattr, + &local->linkfile.loc, local->linkfile.xattr, 0); + + return 0; + +err: + if (str_data) { + data_destroy (str_data); + str_data = NULL; + } + + local->linkfile.linkfile_cbk (frame, cookie, this, + op_ret, op_errno, inode, stbuf); + return 0; +} + + +int +dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk, + xlator_t *tovol, xlator_t *fromvol, loc_t *loc) +{ + dht_local_t *local = NULL; + + + local = frame->local; + local->linkfile.linkfile_cbk = linkfile_cbk; + local->linkfile.srcvol = tovol; + loc_copy (&local->linkfile.loc, loc); + + STACK_WIND (frame, dht_linkfile_create_cbk, + fromvol, fromvol->fops->mknod, loc, + S_IFREG | DHT_LINKFILE_MODE, 0); + + return 0; +} + + +int +dht_linkfile_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + xlator_t *subvol = NULL; + + local = frame->local; + prev = cookie; + subvol = prev->this; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "unlinking linkfile %s on %s failed (%s)", + local->loc.path, subvol->name, strerror (op_errno)); + } + + DHT_STACK_DESTROY (frame); + + return 0; +} + + +int +dht_linkfile_unlink (call_frame_t *frame, xlator_t *this, + xlator_t *subvol, loc_t *loc) +{ + call_frame_t *unlink_frame = NULL; + dht_local_t *unlink_local = NULL; + + unlink_frame = copy_frame (frame); + if (!unlink_frame) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + unlink_local = dht_local_init (unlink_frame); + if (!unlink_local) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + loc_copy (&unlink_local->loc, loc); + + STACK_WIND (unlink_frame, dht_linkfile_unlink_cbk, + subvol, subvol->fops->unlink, + &unlink_local->loc); + + return 0; +err: + if (unlink_frame) + DHT_STACK_DESTROY (unlink_frame); + + return -1; +} + + +xlator_t * +dht_linkfile_subvol (xlator_t *this, inode_t *inode, struct stat *stbuf, + dict_t *xattr) +{ + dht_conf_t *conf = NULL; + xlator_t *subvol = NULL; + void *volname = NULL; + int i = 0, ret = 0; + + + conf = this->private; + + if (!xattr) + goto out; + + ret = dict_get_ptr (xattr, "trusted.glusterfs.dht.linkto", &volname); + + if ((-1 == ret) || !volname) + goto out; + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (strcmp (conf->subvolumes[i]->name, (char *)volname) == 0) { + subvol = conf->subvolumes[i]; + break; + } + } + +out: + return subvol; +} + + diff --git a/xlators/cluster/dht/src/dht-rename.c b/xlators/cluster/dht/src/dht-rename.c new file mode 100644 index 000000000..e5532f1bc --- /dev/null +++ b/xlators/cluster/dht/src/dht-rename.c @@ -0,0 +1,562 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +/* TODO: link(oldpath, newpath) fails if newpath already exists. DHT should + * delete the newpath if it gets EEXISTS from link() call. + */ +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "xlator.h" +#include "dht-common.h" +#include "defaults.h" + + +int +dht_rename_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *stbuf) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + call_frame_t *prev = NULL; + + + local = frame->local; + prev = cookie; + + if (op_ret == -1) { + /* TODO: undo the damage */ + + gf_log (this->name, GF_LOG_ERROR, + "rename %s -> %s on %s failed (%s)", + local->loc.path, local->loc2.path, + prev->this->name, strerror (op_errno)); + + local->op_ret = op_ret; + local->op_errno = op_errno; + } else { + /* TODO: construct proper stbuf for dir */ + local->stbuf = *stbuf; + } + + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) { + DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno, + &local->stbuf); + } + + return 0; +} + + + +int +dht_rename_dir_do (call_frame_t *frame, xlator_t *this) +{ + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int i = 0; + + conf = this->private; + local = frame->local; + + if (local->op_ret == -1) + goto err; + + local->call_cnt = conf->subvolume_cnt; + local->op_ret = 0; + + for (i = 0; i < conf->subvolume_cnt; i++) { + STACK_WIND (frame, dht_rename_dir_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->rename, + &local->loc, &local->loc2); + } + + return 0; + +err: + DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno); + return 0; +} + + +int +dht_rename_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, gf_dirent_t *entries) +{ + dht_local_t *local = NULL; + int this_call_cnt = -1; + call_frame_t *prev = NULL; + + local = frame->local; + prev = cookie; + + if (op_ret > 2) { + gf_log (this->name, GF_LOG_DEBUG, + "readdir on %s for %s returned %d entries", + prev->this->name, local->loc.path, op_ret); + local->op_ret = -1; + local->op_errno = ENOTEMPTY; + } + + this_call_cnt = dht_frame_return (frame); + + if (is_last_call (this_call_cnt)) { + dht_rename_dir_do (frame, this); + } + + return 0; +} + + +int +dht_rename_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, fd_t *fd) +{ + dht_local_t *local = NULL; + int this_call_cnt = -1; + call_frame_t *prev = NULL; + + + local = frame->local; + prev = cookie; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "opendir on %s for %s failed (%s)", + prev->this->name, local->loc.path, + strerror (op_errno)); + goto err; + } + + STACK_WIND (frame, dht_rename_readdir_cbk, + prev->this, prev->this->fops->readdir, + local->fd, 4096, 0); + + return 0; + +err: + this_call_cnt = dht_frame_return (frame); + + if (is_last_call (this_call_cnt)) { + dht_rename_dir_do (frame, this); + } + + return 0; +} + + +int +dht_rename_dir (call_frame_t *frame, xlator_t *this) +{ + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + int i = 0; + int op_errno = -1; + + + conf = frame->this->private; + local = frame->local; + + local->call_cnt = conf->subvolume_cnt; + + local->fd = fd_create (local->loc.inode, frame->root->pid); + if (!local->fd) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + op_errno = ENOMEM; + goto err; + } + + local->op_ret = 0; + + if (!local->dst_cached) { + dht_rename_dir_do (frame, this); + return 0; + } + + for (i = 0; i < conf->subvolume_cnt; i++) { + STACK_WIND (frame, dht_rename_opendir_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->opendir, + &local->loc2, local->fd); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; +} + + +int +dht_rename_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + int this_call_cnt = 0; + + local = frame->local; + prev = cookie; + + this_call_cnt = dht_frame_return (frame); + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "unlink on %s failed (%s)", + prev->this->name, strerror (op_errno)); + } + + if (is_last_call (this_call_cnt)) + DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno, + &local->stbuf); + + return 0; +} + + +int +dht_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *stbuf) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + xlator_t *src_hashed = NULL; + xlator_t *src_cached = NULL; + xlator_t *dst_hashed = NULL; + xlator_t *dst_cached = NULL; + xlator_t *rename_subvol = NULL; + + local = frame->local; + prev = cookie; + + src_hashed = local->src_hashed; + src_cached = local->src_cached; + dst_hashed = local->dst_hashed; + dst_cached = local->dst_cached; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_DEBUG, + "rename on %s failed (%s)", prev->this->name, + strerror (op_errno)); + local->op_ret = op_ret; + local->op_errno = op_errno; + goto unwind; + } + + /* NOTE: rename_subvol is the same subvolume from which dht_rename_cbk + * is called. since rename has already happened on rename_subvol, + * unlink should not be sent for oldpath (either linkfile or cached-file) + * on rename_subvol. */ + if (src_cached == dst_cached) + rename_subvol = src_cached; + else + rename_subvol = dst_hashed; + + /* TODO: delete files in background */ + + if (src_cached != dst_hashed && src_cached != dst_cached) + local->call_cnt++; + + if (src_hashed != rename_subvol && src_hashed != src_cached) + local->call_cnt++; + + if (dst_cached && dst_cached != dst_hashed && dst_cached != src_cached) + local->call_cnt++; + + if (local->call_cnt == 0) + goto unwind; + + if (src_cached != dst_hashed && src_cached != dst_cached) { + gf_log (this->name, GF_LOG_DEBUG, + "deleting old src datafile %s @ %s", + local->loc.path, src_cached->name); + + STACK_WIND (frame, dht_rename_unlink_cbk, + src_cached, src_cached->fops->unlink, + &local->loc); + } + + if (src_hashed != rename_subvol && src_hashed != src_cached) { + gf_log (this->name, GF_LOG_DEBUG, + "deleting old src linkfile %s @ %s", + local->loc.path, src_hashed->name); + + STACK_WIND (frame, dht_rename_unlink_cbk, + src_hashed, src_hashed->fops->unlink, + &local->loc); + } + + if (dst_cached + && (dst_cached != dst_hashed) + && (dst_cached != src_cached)) { + gf_log (this->name, GF_LOG_DEBUG, + "deleting old dst datafile %s @ %s", + local->loc2.path, dst_cached->name); + + STACK_WIND (frame, dht_rename_unlink_cbk, + dst_cached, dst_cached->fops->unlink, + &local->loc2); + } + return 0; + +unwind: + DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno, + &local->stbuf); + + return 0; +} + + +int +dht_do_rename (call_frame_t *frame) +{ + dht_local_t *local = NULL; + xlator_t *dst_hashed = NULL; + xlator_t *src_cached = NULL; + xlator_t *dst_cached = NULL; + xlator_t *this = NULL; + xlator_t *rename_subvol = NULL; + + + local = frame->local; + this = frame->this; + + dst_hashed = local->dst_hashed; + dst_cached = local->dst_cached; + src_cached = local->src_cached; + + if (src_cached == dst_cached) + rename_subvol = src_cached; + else + rename_subvol = dst_hashed; + + gf_log (this->name, GF_LOG_DEBUG, + "renaming %s => %s (%s)", + local->loc.path, local->loc2.path, rename_subvol->name); + + STACK_WIND (frame, dht_rename_cbk, + rename_subvol, rename_subvol->fops->rename, + &local->loc, &local->loc2); + + return 0; +} + + +int +dht_rename_links_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *stbuf) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + int this_call_cnt = 0; + + + local = frame->local; + prev = cookie; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_DEBUG, + "link/file on %s failed (%s)", + prev->this->name, strerror (op_errno)); + local->op_ret = -1; + local->op_errno = op_errno; + } + + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) { + if (local->op_ret == -1) + goto unwind; + + dht_do_rename (frame); + } + + return 0; + +unwind: + DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno, + &local->stbuf); + + return 0; +} + + +int +dht_rename_create_links (call_frame_t *frame) +{ + dht_local_t *local = NULL; + xlator_t *this = NULL; + xlator_t *src_hashed = NULL; + xlator_t *src_cached = NULL; + xlator_t *dst_hashed = NULL; + xlator_t *dst_cached = NULL; + int call_cnt = 0; + + + local = frame->local; + this = frame->this; + + src_hashed = local->src_hashed; + src_cached = local->src_cached; + dst_hashed = local->dst_hashed; + dst_cached = local->dst_cached; + + if (src_cached == dst_cached) + goto nolinks; + + if (dst_hashed != src_hashed && dst_hashed != src_cached) + call_cnt++; + + if (src_cached != dst_hashed) + call_cnt++; + + local->call_cnt = call_cnt; + + if (dst_hashed != src_hashed && dst_hashed != src_cached) { + gf_log (this->name, GF_LOG_DEBUG, + "linkfile %s @ %s => %s", + local->loc.path, dst_hashed->name, src_cached->name); + dht_linkfile_create (frame, dht_rename_links_cbk, + src_cached, dst_hashed, &local->loc); + } + + if (src_cached != dst_hashed) { + gf_log (this->name, GF_LOG_DEBUG, + "link %s => %s (%s)", local->loc.path, + local->loc2.path, src_cached->name); + STACK_WIND (frame, dht_rename_links_cbk, + src_cached, src_cached->fops->link, + &local->loc, &local->loc2); + } + +nolinks: + if (!call_cnt) { + /* skip to next step */ + dht_do_rename (frame); + } + + return 0; +} + + +int +dht_rename (call_frame_t *frame, xlator_t *this, + loc_t *oldloc, loc_t *newloc) +{ + xlator_t *src_cached = NULL; + xlator_t *src_hashed = NULL; + xlator_t *dst_cached = NULL; + xlator_t *dst_hashed = NULL; + int op_errno = -1; + int ret = -1; + dht_local_t *local = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (oldloc, err); + VALIDATE_OR_GOTO (newloc, err); + + src_hashed = dht_subvol_get_hashed (this, oldloc); + if (!src_hashed) { + gf_log (this->name, GF_LOG_ERROR, + "no subvolume in layout for path=%s", + oldloc->path); + op_errno = EINVAL; + goto err; + } + + src_cached = dht_subvol_get_cached (this, oldloc->inode); + if (!src_cached) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for path=%s", oldloc->path); + op_errno = EINVAL; + goto err; + } + + dst_hashed = dht_subvol_get_hashed (this, newloc); + if (!dst_hashed) { + gf_log (this->name, GF_LOG_ERROR, + "no subvolume in layout for path=%s", + newloc->path); + op_errno = EINVAL; + goto err; + } + + if (newloc->inode) + dst_cached = dht_subvol_get_cached (this, newloc->inode); + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + ret = loc_copy (&local->loc, oldloc); + if (ret == -1) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + ret = loc_copy (&local->loc2, newloc); + if (ret == -1) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->src_hashed = src_hashed; + local->src_cached = src_cached; + local->dst_hashed = dst_hashed; + local->dst_cached = dst_cached; + + gf_log (this->name, GF_LOG_DEBUG, + "renaming %s (hash=%s/cache=%s) => %s (hash=%s/cache=%s)", + oldloc->path, src_hashed->name, src_cached->name, + newloc->path, dst_hashed->name, + dst_cached ? dst_cached->name : "<nul>"); + + if (S_ISDIR (oldloc->inode->st_mode)) { + dht_rename_dir (frame, this); + } else { + local->op_ret = 0; + dht_rename_create_links (frame); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c new file mode 100644 index 000000000..ee32b2253 --- /dev/null +++ b/xlators/cluster/dht/src/dht-selfheal.c @@ -0,0 +1,460 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + + +#include "glusterfs.h" +#include "xlator.h" +#include "dht-common.h" + + +int +dht_selfheal_dir_finish (call_frame_t *frame, xlator_t *this, int ret) +{ + dht_local_t *local = NULL; + + + local = frame->local; + local->selfheal.dir_cbk (frame, NULL, frame->this, ret, + local->op_errno); + + return 0; +} + + +int +dht_selfheal_dir_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + xlator_t *subvol = NULL; + int i = 0; + dht_layout_t *layout = NULL; + int err = 0; + int this_call_cnt = 0; + + local = frame->local; + layout = local->selfheal.layout; + prev = cookie; + subvol = prev->this; + + if (op_ret == 0) + err = 0; + else + err = op_errno; + + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].xlator == subvol) { + layout->list[i].err = err; + break; + } + } + + this_call_cnt = dht_frame_return (frame); + + if (is_last_call (this_call_cnt)) { + dht_selfheal_dir_finish (frame, this, 0); + } + + return 0; +} + + +int +dht_selfheal_dir_xattr_persubvol (call_frame_t *frame, loc_t *loc, + dht_layout_t *layout, int i) +{ + xlator_t *subvol = NULL; + dict_t *xattr = NULL; + int ret = 0; + xlator_t *this = NULL; + int32_t *disk_layout = NULL; + + + subvol = layout->list[i].xlator; + this = frame->this; + + xattr = get_new_dict (); + if (!xattr) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + ret = dht_disk_layout_extract (this, layout, i, &disk_layout); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "failed to extract disk layout"); + goto err; + } + + ret = dict_set_bin (xattr, "trusted.glusterfs.dht", + disk_layout, 4 * 4); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "failed to set xattr dictionary"); + goto err; + } + disk_layout = NULL; + + gf_log (this->name, GF_LOG_DEBUG, + "setting hash range %u - %u (type %d) on subvolume %s for %s", + layout->list[i].start, layout->list[i].stop, + layout->type, subvol->name, loc->path); + + dict_ref (xattr); + + STACK_WIND (frame, dht_selfheal_dir_xattr_cbk, + subvol, subvol->fops->setxattr, + loc, xattr, 0); + + dict_unref (xattr); + + return 0; + +err: + if (xattr) + dict_destroy (xattr); + + if (disk_layout) + FREE (disk_layout); + + dht_selfheal_dir_xattr_cbk (frame, subvol, frame->this, + -1, ENOMEM); + return 0; +} + + +int +dht_selfheal_dir_xattr (call_frame_t *frame, loc_t *loc, dht_layout_t *layout) +{ + dht_local_t *local = NULL; + int missing_xattr = 0; + int i = 0; + int ret = 0; + xlator_t *this = NULL; + + local = frame->local; + this = frame->this; + + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].err != -1 || !layout->list[i].stop) + continue; + /* attr missing and layout present */ + missing_xattr++; + } + + gf_log (this->name, GF_LOG_DEBUG, + "%d subvolumes missing xattr for %s", + missing_xattr, loc->path); + + if (missing_xattr == 0) { + dht_selfheal_dir_finish (frame, this, 0); + return 0; + } + + local->call_cnt = missing_xattr; + + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].err != -1 || !layout->list[i].stop) + continue; + + ret = dht_selfheal_dir_xattr_persubvol (frame, loc, layout, i); + + if (--missing_xattr == 0) + break; + } + return 0; +} + + +int +dht_selfheal_dir_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + inode_t *inode, struct stat *stbuf) +{ + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + call_frame_t *prev = NULL; + xlator_t *subvol = NULL; + int i = 0; + int this_call_cnt = 0; + + + local = frame->local; + layout = local->selfheal.layout; + prev = cookie; + subvol = prev->this; + + if ((op_ret == 0) || (op_errno == EEXIST)) { + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].xlator == subvol) { + layout->list[i].err = -1; + break; + } + } + } + + this_call_cnt = dht_frame_return (frame); + + if (is_last_call (this_call_cnt)) { + dht_selfheal_dir_xattr (frame, &local->loc, layout); + } + + return 0; +} + + +int +dht_selfheal_dir_mkdir (call_frame_t *frame, loc_t *loc, + dht_layout_t *layout, int force) +{ + int missing_dirs = 0; + int i = 0; + dht_local_t *local = NULL; + xlator_t *this = NULL; + + + local = frame->local; + this = frame->this; + + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].err == ENOENT || force) + missing_dirs++; + } + + if (missing_dirs == 0) { + dht_selfheal_dir_xattr (frame, loc, layout); + return 0; + } + + local->call_cnt = missing_dirs; + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].err == ENOENT || force) { + gf_log (this->name, GF_LOG_DEBUG, + "creating directory %s on subvol %s", + loc->path, layout->list[i].xlator->name); + + STACK_WIND (frame, dht_selfheal_dir_mkdir_cbk, + layout->list[i].xlator, + layout->list[i].xlator->fops->mkdir, + loc, local->stbuf.st_mode); + } + } + + return 0; +} + +void +dht_selfheal_fix_this_virgin (call_frame_t *frame, loc_t *loc, + dht_layout_t *layout) +{ + dht_conf_t *conf = NULL; + xlator_t *this = NULL; + uint32_t chunk = 0; + int i = 0; + uint32_t start = 0; + int cnt = 0; + int err = 0; + + this = frame->this; + conf = this->private; + + for (i = 0; i < layout->cnt; i++) { + err = layout->list[i].err; + if (err == -1) { + cnt++; + } + } + + chunk = ((unsigned long) 0xffffffff) / cnt; + + start = 0; + for (i = 0; i < layout->cnt; i++) { + err = layout->list[i].err; + if (err == -1) { + layout->list[i].start = start; + layout->list[i].stop = start + chunk - 1; + + start = start + chunk; + + gf_log (this->name, GF_LOG_DEBUG, + "gave fix: %u - %u on %s for %s", + layout->list[i].start, layout->list[i].stop, + layout->list[i].xlator->name, loc->path); + if (--cnt == 0) { + layout->list[i].stop = 0xffffffff; + break; + } + } + } +} + + +int +dht_selfheal_dir_getafix (call_frame_t *frame, loc_t *loc, + dht_layout_t *layout) +{ + dht_conf_t *conf = NULL; + xlator_t *this = NULL; + dht_local_t *local = NULL; + int missing = -1; + int down = -1; + int holes = -1; + int ret = -1; + int i = -1; + + this = frame->this; + conf = this->private; + local = frame->local; + + missing = local->selfheal.missing; + down = local->selfheal.down; + holes = local->selfheal.hole_cnt; + + if ((missing + down) == conf->subvolume_cnt) { + dht_selfheal_fix_this_virgin (frame, loc, layout); + ret = 0; + } + + if (holes <= down) { + /* the down subvol might fill up the holes */ + ret = 0; + } + + for (i = 0; i < layout->cnt; i++) { + /* directory not present */ + if (layout->list[i].err == ENOENT) { + ret = 0; + break; + } + } + + /* TODO: give a fix to these non-virgins */ + + return ret; +} + + +int +dht_selfheal_directory (call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk, + loc_t *loc, dht_layout_t *layout) +{ + dht_local_t *local = NULL; + uint32_t holes = 0; + uint32_t overlaps = 0; + uint32_t missing = 0; + uint32_t down = 0; + uint32_t misc = 0; + int ret = 0; + xlator_t *this = NULL; + + + local = frame->local; + this = frame->this; + + ret = dht_layout_anomalies (this, loc, layout, + &local->selfheal.hole_cnt, + &local->selfheal.overlaps_cnt, + &local->selfheal.missing, + &local->selfheal.down, + &local->selfheal.misc); + + holes = local->selfheal.hole_cnt; + overlaps = local->selfheal.overlaps_cnt; + missing = local->selfheal.missing; + down = local->selfheal.down; + misc = local->selfheal.misc; + + local->selfheal.dir_cbk = dir_cbk; + local->selfheal.layout = layout; + +/* + if (down) { + gf_log (this->name, GF_LOG_ERROR, + "%d subvolumes down -- not fixing", down); + ret = 0; + goto sorry_no_fix; + } + + if (overlaps) { + gf_log (this->name, GF_LOG_ERROR, + "not fixing overlaps in %s", loc->path); + local->op_errno = EINVAL; + ret = -1; + goto sorry_no_fix; + } + + if (misc) { + gf_log (this->name, GF_LOG_ERROR, + "%d subvolumes have unrecoverable errors", misc); + ret = 0; + goto sorry_no_fix; + } + + if (holes > missing) { + gf_log (this->name, GF_LOG_ERROR, + "%d holes and %d pigeons -- not fixing", + holes, missing); + ret = 0; + goto sorry_no_fix; + } +*/ + ret = dht_selfheal_dir_getafix (frame, loc, layout); + + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "the directory is not a virgin"); + goto sorry_no_fix; + } + + dht_selfheal_dir_mkdir (frame, loc, layout, 0); + + return 0; + +sorry_no_fix: + /* TODO: need to put appropriate local->op_errno */ + dht_selfheal_dir_finish (frame, this, ret); + + return 0; +} + + +int +dht_selfheal_restore (call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk, + loc_t *loc, dht_layout_t *layout) +{ + int ret = 0; + dht_local_t *local = NULL; + + + local = frame->local; + + local->selfheal.dir_cbk = dir_cbk; + local->selfheal.layout = layout; + + ret = dht_selfheal_dir_mkdir (frame, loc, layout, 1); + + return 0; +} diff --git a/xlators/cluster/dht/src/dht.c b/xlators/cluster/dht/src/dht.c new file mode 100644 index 000000000..836e7a4e8 --- /dev/null +++ b/xlators/cluster/dht/src/dht.c @@ -0,0 +1,222 @@ +/* + Copyright (c) 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +/* TODO: add NS locking */ + +#include "dht-common.c" + +/* TODO: + - use volumename in xattr instead of "dht" + - use NS locks + - handle all cases in self heal layout reconstruction + - complete linkfile selfheal +*/ + + + +int +notify (xlator_t *this, int event, void *data, ...) +{ + int ret = -1; + + ret = dht_notify (this, event, data); + + return ret; +} + +void +fini (xlator_t *this) +{ + int i = 0; + dht_conf_t *conf = NULL; + + conf = this->private; + + if (conf) { + if (conf->file_layouts) { + for (i = 0; i < conf->subvolume_cnt; i++) { + FREE (conf->file_layouts[i]); + } + FREE (conf->file_layouts); + } + + if (conf->default_dir_layout) + FREE (conf->default_dir_layout); + + if (conf->subvolumes) + FREE (conf->subvolumes); + + if (conf->subvolume_status) + FREE (conf->subvolume_status); + + FREE (conf); + } + + return; +} + +int +init (xlator_t *this) +{ + dht_conf_t *conf = NULL; + char *lookup_unhashed_str = NULL; + int ret = -1; + int i = 0; + + if (!this->children) { + gf_log (this->name, GF_LOG_ERROR, + "DHT needs more than one child defined"); + return -1; + } + + if (!this->parents) { + gf_log (this->name, GF_LOG_WARNING, + "dangling volume. check volfile "); + } + + conf = CALLOC (1, sizeof (*conf)); + if (!conf) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + conf->search_unhashed = 0; + + if (dict_get_str (this->options, "lookup-unhashed", + &lookup_unhashed_str) == 0) { + gf_string2boolean (lookup_unhashed_str, + &conf->search_unhashed); + } + + ret = dht_init_subvolumes (this, conf); + if (ret == -1) { + goto err; + } + + ret = dht_layouts_init (this, conf); + if (ret == -1) { + goto err; + } + + LOCK_INIT (&conf->subvolume_lock); + + conf->gen = 1; + + this->private = conf; + + return 0; + +err: + if (conf) { + if (conf->file_layouts) { + for (i = 0; i < conf->subvolume_cnt; i++) { + FREE (conf->file_layouts[i]); + } + FREE (conf->file_layouts); + } + + if (conf->default_dir_layout) + FREE (conf->default_dir_layout); + + if (conf->subvolumes) + FREE (conf->subvolumes); + + if (conf->subvolume_status) + FREE (conf->subvolume_status); + + FREE (conf); + } + + return -1; +} + + +struct xlator_fops fops = { + .lookup = dht_lookup, + .mknod = dht_mknod, + .create = dht_create, + + .stat = dht_stat, + .chmod = dht_chmod, + .chown = dht_chown, + .fchown = dht_fchown, + .fchmod = dht_fchmod, + .fstat = dht_fstat, + .utimens = dht_utimens, + .truncate = dht_truncate, + .ftruncate = dht_ftruncate, + .access = dht_access, + .readlink = dht_readlink, + .setxattr = dht_setxattr, + .getxattr = dht_getxattr, + .removexattr = dht_removexattr, + .open = dht_open, + .readv = dht_readv, + .writev = dht_writev, + .flush = dht_flush, + .fsync = dht_fsync, + .statfs = dht_statfs, + .lk = dht_lk, + .opendir = dht_opendir, + .readdir = dht_readdir, + .fsyncdir = dht_fsyncdir, + .symlink = dht_symlink, + .unlink = dht_unlink, + .link = dht_link, + .mkdir = dht_mkdir, + .rmdir = dht_rmdir, + .rename = dht_rename, + .inodelk = dht_inodelk, + .finodelk = dht_finodelk, + .entrylk = dht_entrylk, + .fentrylk = dht_fentrylk, + .xattrop = dht_xattrop, + .fxattrop = dht_fxattrop, +#if 0 + .setdents = dht_setdents, + .getdents = dht_getdents, + .checksum = dht_checksum, +#endif +}; + + +struct xlator_mops mops = { +}; + + +struct xlator_cbks cbks = { +// .release = dht_release, +// .releasedir = dht_releasedir, + .forget = dht_forget +}; + + +struct volume_options options[] = { + { .key = {"lookup-unhashed"}, + .type = GF_OPTION_TYPE_BOOL + }, + { .key = {NULL} }, +}; diff --git a/xlators/cluster/dht/src/nufa.c b/xlators/cluster/dht/src/nufa.c new file mode 100644 index 000000000..6333e002f --- /dev/null +++ b/xlators/cluster/dht/src/nufa.c @@ -0,0 +1,684 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "dht-common.c" + +/* TODO: all 'TODO's in dht.c holds good */ + +int +nufa_local_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + inode_t *inode, struct stat *stbuf, dict_t *xattr) +{ + dht_layout_t *layout = NULL; + xlator_t *subvol = NULL; + char is_linkfile = 0; + char is_dir = 0; + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + loc_t *loc = NULL; + int i = 0; + call_frame_t *prev = NULL; + int call_cnt = 0; + + + conf = this->private; + + prev = cookie; + local = frame->local; + loc = &local->loc; + + if (ENTRY_MISSING (op_ret, op_errno)) { + if (conf->search_unhashed) { + local->op_errno = ENOENT; + dht_lookup_everywhere (frame, this, loc); + return 0; + } + } + + if (op_ret == -1) + goto out; + + is_linkfile = check_is_linkfile (inode, stbuf, xattr); + is_dir = check_is_dir (inode, stbuf, xattr); + + if (!is_dir && !is_linkfile) { + /* non-directory and not a linkfile */ + + dht_itransform (this, prev->this, stbuf->st_ino, + &stbuf->st_ino); + + layout = dht_layout_for_subvol (this, prev->this); + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "no pre-set layout for subvolume %s", + prev->this->name); + op_ret = -1; + op_errno = EINVAL; + goto err; + } + + inode_ctx_put (inode, this, (uint64_t)(long)layout); + goto out; + } + + if (is_dir) { + call_cnt = conf->subvolume_cnt; + local->call_cnt = call_cnt; + + local->inode = inode_ref (inode); + local->xattr = dict_ref (xattr); + + local->op_ret = 0; + local->op_errno = 0; + + local->layout = dht_layout_new (this, conf->subvolume_cnt); + if (!local->layout) { + op_ret = -1; + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + for (i = 0; i < call_cnt; i++) { + STACK_WIND (frame, dht_lookup_dir_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->lookup, + &local->loc, local->xattr_req); + } + } + + if (is_linkfile) { + subvol = dht_linkfile_subvol (this, inode, stbuf, xattr); + + if (!subvol) { + gf_log (this->name, GF_LOG_WARNING, + "linkfile not having link subvolume. path=%s", + loc->path); + dht_lookup_everywhere (frame, this, loc); + return 0; + } + + STACK_WIND (frame, dht_lookup_linkfile_cbk, + subvol, subvol->fops->lookup, + &local->loc, local->xattr_req); + } + + return 0; + +out: + if (!local->hashed_subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no subvolume in layout for path=%s", + local->loc.path); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, dht_lookup_cbk, + local->hashed_subvol, local->hashed_subvol->fops->lookup, + &local->loc, local->xattr_req); + + return 0; + + err: + DHT_STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf, xattr); + return 0; +} + +int +nufa_lookup (call_frame_t *frame, xlator_t *this, + loc_t *loc, dict_t *xattr_req) +{ + xlator_t *hashed_subvol = NULL; + xlator_t *cached_subvol = NULL; + xlator_t *subvol = NULL; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int ret = -1; + int op_errno = -1; + dht_layout_t *layout = NULL; + int i = 0; + int call_cnt = 0; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + conf = this->private; + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + ret = loc_dup (loc, &local->loc); + if (ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "copying location failed for path=%s", + loc->path); + goto err; + } + + if (xattr_req) { + local->xattr_req = dict_ref (xattr_req); + } else { + local->xattr_req = dict_new (); + } + + hashed_subvol = dht_subvol_get_hashed (this, &local->loc); + cached_subvol = dht_subvol_get_cached (this, local->loc.inode); + + local->cached_subvol = cached_subvol; + local->hashed_subvol = hashed_subvol; + + if (is_revalidate (loc)) { + layout = dht_layout_get (this, loc->inode); + + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "revalidate without cache. path=%s", + loc->path); + op_errno = EINVAL; + goto err; + } + + if (layout->gen && (layout->gen < conf->gen)) { + gf_log (this->name, GF_LOG_WARNING, + "incomplete layout failure for path=%s", + loc->path); + op_errno = EAGAIN; + goto err; + } + + local->inode = inode_ref (loc->inode); + local->st_ino = loc->inode->ino; + + local->call_cnt = layout->cnt; + call_cnt = local->call_cnt; + + /* NOTE: we don't require 'trusted.glusterfs.dht.linkto' attribute, + * revalidates directly go to the cached-subvolume. + */ + ret = dict_set_uint32 (local->xattr_req, + "trusted.glusterfs.dht", 4 * 4); + + for (i = 0; i < layout->cnt; i++) { + subvol = layout->list[i].xlator; + + STACK_WIND (frame, dht_revalidate_cbk, + subvol, subvol->fops->lookup, + loc, local->xattr_req); + + if (!--call_cnt) + break; + } + } else { + ret = dict_set_uint32 (local->xattr_req, + "trusted.glusterfs.dht", 4 * 4); + + ret = dict_set_uint32 (local->xattr_req, + "trusted.glusterfs.dht.linkto", 256); + + /* Send it to only local volume */ + STACK_WIND (frame, nufa_local_lookup_cbk, + conf->local_volume, + conf->local_volume->fops->lookup, + loc, local->xattr_req); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + +int +nufa_create_linkfile_create_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, + inode_t *inode, struct stat *stbuf) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + dht_conf_t *conf = NULL; + + local = frame->local; + prev = cookie; + conf = this->private; + + if (op_ret == -1) + goto err; + + STACK_WIND (frame, dht_create_cbk, + conf->local_volume, conf->local_volume->fops->create, + &local->loc, local->flags, local->mode, local->fd); + + return 0; + + err: + DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + +int +nufa_create (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t flags, mode_t mode, fd_t *fd) +{ + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + xlator_t *subvol = NULL; + int op_errno = -1; + int ret = -1; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + + conf = this->private; + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + subvol = dht_subvol_get_hashed (this, loc); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no subvolume in layout for path=%s", + loc->path); + op_errno = ENOENT; + goto err; + } + + if (subvol != conf->local_volume) { + /* create a link file instead of actual file */ + ret = loc_copy (&local->loc, loc); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + op_errno = ENOMEM; + goto err; + } + + local->fd = fd_ref (fd); + local->mode = mode; + local->flags = flags; + + dht_linkfile_create (frame, nufa_create_linkfile_create_cbk, + conf->local_volume, subvol, loc); + return 0; + } + + gf_log (this->name, GF_LOG_DEBUG, + "creating %s on %s", loc->path, subvol->name); + + STACK_WIND (frame, dht_create_cbk, + subvol, subvol->fops->create, + loc, flags, mode, fd); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} + +int +nufa_mknod_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + inode_t *inode, struct stat *stbuf) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + dht_conf_t *conf = NULL; + + local = frame->local; + prev = cookie; + conf = this->private; + + if (op_ret >= 0) { + STACK_WIND (frame, dht_newfile_cbk, + conf->local_volume, + conf->local_volume->fops->mknod, + &local->loc, local->mode, local->rdev); + + return 0; + } + + DHT_STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf); + return 0; +} + + +int +nufa_mknod (call_frame_t *frame, xlator_t *this, + loc_t *loc, mode_t mode, dev_t rdev) +{ + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + xlator_t *subvol = NULL; + int op_errno = -1; + int ret = -1; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + + conf = this->private; + + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + subvol = dht_subvol_get_hashed (this, loc); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no subvolume in layout for path=%s", + loc->path); + op_errno = ENOENT; + goto err; + } + + + if (conf->local_volume != subvol) { + /* Create linkfile first */ + ret = loc_copy (&local->loc, loc); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + op_errno = ENOMEM; + goto err; + } + + local->mode = mode; + local->rdev = rdev; + + dht_linkfile_create (frame, nufa_mknod_linkfile_cbk, + conf->local_volume, subvol, loc); + return 0; + } + + gf_log (this->name, GF_LOG_DEBUG, + "creating %s on %s", loc->path, subvol->name); + + STACK_WIND (frame, dht_newfile_cbk, + subvol, subvol->fops->mknod, + loc, mode, rdev); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + + +int +notify (xlator_t *this, int event, void *data, ...) +{ + int ret = -1; + + ret = dht_notify (this, event, data); + + return ret; +} + +void +fini (xlator_t *this) +{ + int i = 0; + dht_conf_t *conf = NULL; + + conf = this->private; + + if (conf) { + if (conf->file_layouts) { + for (i = 0; i < conf->subvolume_cnt; i++) { + FREE (conf->file_layouts[i]); + } + FREE (conf->file_layouts); + } + + if (conf->default_dir_layout) + FREE (conf->default_dir_layout); + + if (conf->subvolumes) + FREE (conf->subvolumes); + + if (conf->subvolume_status) + FREE (conf->subvolume_status); + + FREE (conf); + } + + return; +} + +int +init (xlator_t *this) +{ + dht_conf_t *conf = NULL; + xlator_list_t *trav = NULL; + data_t *data = NULL; + char *local_volname = NULL; + char *lookup_unhashed_str = NULL; + int ret = -1; + int i = 0; + char my_hostname[256]; + + if (!this->children) { + gf_log (this->name, GF_LOG_ERROR, + "DHT needs more than one child defined"); + return -1; + } + + if (!this->parents) { + gf_log (this->name, GF_LOG_WARNING, + "dangling volume. check volfile "); + } + + conf = CALLOC (1, sizeof (*conf)); + if (!conf) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + conf->search_unhashed = 0; + + if (dict_get_str (this->options, "lookup-unhashed", + &lookup_unhashed_str) == 0) { + gf_string2boolean (lookup_unhashed_str, + &conf->search_unhashed); + } + + ret = dht_init_subvolumes (this, conf); + if (ret == -1) { + goto err; + } + + ret = dht_layouts_init (this, conf); + if (ret == -1) { + goto err; + } + + LOCK_INIT (&conf->subvolume_lock); + + conf->gen = 1; + + local_volname = "localhost"; + ret = gethostname (my_hostname, 256); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "could not find hostname (%s)", + strerror (errno)); + } + + if (ret == 0) + local_volname = my_hostname; + + data = dict_get (this->options, "local-volume-name"); + if (data) { + local_volname = data->data; + } + + trav = this->children; + while (trav) { + if (strcmp (trav->xlator->name, local_volname) == 0) + break; + trav = trav->next; + } + + if (!trav) { + gf_log (this->name, GF_LOG_ERROR, + "Could not find subvolume named '%s'. " + "Please define volume with the name as the hostname " + "or override it with 'option local-volume-name'", + local_volname); + goto err; + } + /* The volume specified exists */ + conf->local_volume = trav->xlator; + + this->private = conf; + + return 0; + +err: + if (conf) { + if (conf->file_layouts) { + for (i = 0; i < conf->subvolume_cnt; i++) { + FREE (conf->file_layouts[i]); + } + FREE (conf->file_layouts); + } + + if (conf->default_dir_layout) + FREE (conf->default_dir_layout); + + if (conf->subvolumes) + FREE (conf->subvolumes); + + if (conf->subvolume_status) + FREE (conf->subvolume_status); + + FREE (conf); + } + + return -1; +} + + +struct xlator_fops fops = { + .lookup = nufa_lookup, + .create = nufa_create, + .mknod = nufa_mknod, + + .stat = dht_stat, + .chmod = dht_chmod, + .chown = dht_chown, + .fchown = dht_fchown, + .fchmod = dht_fchmod, + .fstat = dht_fstat, + .utimens = dht_utimens, + .truncate = dht_truncate, + .ftruncate = dht_ftruncate, + .access = dht_access, + .readlink = dht_readlink, + .setxattr = dht_setxattr, + .getxattr = dht_getxattr, + .removexattr = dht_removexattr, + .open = dht_open, + .readv = dht_readv, + .writev = dht_writev, + .flush = dht_flush, + .fsync = dht_fsync, + .statfs = dht_statfs, + .lk = dht_lk, + .opendir = dht_opendir, + .readdir = dht_readdir, + .fsyncdir = dht_fsyncdir, + .symlink = dht_symlink, + .unlink = dht_unlink, + .link = dht_link, + .mkdir = dht_mkdir, + .rmdir = dht_rmdir, + .rename = dht_rename, + .inodelk = dht_inodelk, + .finodelk = dht_finodelk, + .entrylk = dht_entrylk, + .fentrylk = dht_fentrylk, + .xattrop = dht_xattrop, + .fxattrop = dht_fxattrop, +#if 0 + .setdents = dht_setdents, + .getdents = dht_getdents, + .checksum = dht_checksum, +#endif +}; + + +struct xlator_mops mops = { +}; + + +struct xlator_cbks cbks = { +// .release = dht_release, +// .releasedir = dht_releasedir, + .forget = dht_forget +}; + + +struct volume_options options[] = { + { .key = {"local-volume-name"}, + .type = GF_OPTION_TYPE_XLATOR + }, + { .key = {"lookup-unhashed"}, + .type = GF_OPTION_TYPE_BOOL + }, + { .key = {NULL} }, +}; diff --git a/xlators/cluster/ha/Makefile.am b/xlators/cluster/ha/Makefile.am new file mode 100644 index 000000000..d471a3f92 --- /dev/null +++ b/xlators/cluster/ha/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/cluster/ha/src/Makefile.am b/xlators/cluster/ha/src/Makefile.am new file mode 100644 index 000000000..069a0dcde --- /dev/null +++ b/xlators/cluster/ha/src/Makefile.am @@ -0,0 +1,15 @@ +xlator_LTLIBRARIES = ha.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster + +ha_la_LDFLAGS = -module -avoidversion + +ha_la_SOURCES = ha-helpers.c ha.c +ha_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = ha.h + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) + +CLEANFILES = + diff --git a/xlators/cluster/ha/src/ha-helpers.c b/xlators/cluster/ha/src/ha-helpers.c new file mode 100644 index 000000000..8193caf27 --- /dev/null +++ b/xlators/cluster/ha/src/ha-helpers.c @@ -0,0 +1,191 @@ +/* + Copyright (c) 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#include "xlator.h" +#include "call-stub.h" +#include "defaults.h" +#include "dict.h" +#include "compat-errno.h" +#include "ha.h" + +int ha_alloc_init_fd (call_frame_t *frame, fd_t *fd) +{ + ha_local_t *local = NULL; + int i = -1; + ha_private_t *pvt = NULL; + int child_count = 0; + int ret = -1; + hafd_t *hafdp = NULL; + xlator_t *this = NULL; + uint64_t tmp_hafdp = 0; + + this = frame->this; + local = frame->local; + pvt = this->private; + child_count = pvt->child_count; + + if (local == NULL) { + ret = fd_ctx_get (fd, this, &tmp_hafdp); + if (ret < 0) { + goto out; + } + hafdp = (hafd_t *)(long)tmp_hafdp; + local = frame->local = CALLOC (1, sizeof (*local)); + if (local == NULL) { + ret = -ENOMEM; + goto out; + } + local->state = CALLOC (1, child_count); + if (local->state == NULL) { + ret = -ENOMEM; + goto out; + } + + /* take care of the preferred subvolume */ + if (pvt->pref_subvol == -1) + local->active = hafdp->active; + else + local->active = pvt->pref_subvol; + + LOCK (&hafdp->lock); + memcpy (local->state, hafdp->fdstate, child_count); + UNLOCK (&hafdp->lock); + + /* in case the preferred subvolume is down */ + if ((local->active != -1) && (local->state[local->active] == 0)) + local->active = -1; + + for (i = 0; i < child_count; i++) { + if (local->state[i]) { + if (local->active == -1) + local->active = i; + local->tries++; + } + } + if (local->active == -1) { + ret = -ENOTCONN; + goto out; + } + local->fd = fd_ref (fd); + } + ret = 0; +out: + return ret; +} + +int ha_handle_cbk (call_frame_t *frame, void *cookie, int op_ret, int op_errno) +{ + xlator_t *xl = NULL; + ha_private_t *pvt = NULL; + xlator_t **children = NULL; + int prev_child = -1; + hafd_t *hafdp = NULL; + int ret = -1; + call_stub_t *stub = NULL; + ha_local_t *local = NULL; + uint64_t tmp_hafdp = 0; + + xl = frame->this; + pvt = xl->private; + children = pvt->children; + prev_child = (long) cookie; + local = frame->local; + + if (op_ret == -1) { + gf_log (xl->name, GF_LOG_ERROR ,"(child=%s) (op_ret=%d op_errno=%s)", + children[prev_child]->name, op_ret, strerror (op_errno)); + } + if (op_ret == -1 && (op_errno == ENOTCONN)) { + ret = 0; + if (local->fd) { + ret = fd_ctx_get (local->fd, xl, &tmp_hafdp); + } + hafdp = (hafd_t *)(long)tmp_hafdp; + if (ret == 0) { + if (local->fd) { + LOCK(&hafdp->lock); + hafdp->fdstate[prev_child] = 0; + UNLOCK(&hafdp->lock); + } + local->tries--; + if (local->tries != 0) { + while (1) { + local->active = (local->active + 1) % pvt->child_count; + if (local->state[local->active]) + break; + } + stub = local->stub; + local->stub = NULL; + call_resume (stub); + return -1; + } + } + } + if (local->stub) + call_stub_destroy (local->stub); + if (local->fd) { + FREE (local->state); + fd_unref (local->fd); + } + return 0; +} + +int ha_alloc_init_inode (call_frame_t *frame, inode_t *inode) +{ + int i = -1; + ha_private_t *pvt = NULL; + xlator_t *xl = NULL; + int ret = -1; + ha_local_t *local = NULL; + uint64_t tmp_state = 0; + + xl = frame->this; + pvt = xl->private; + local = frame->local; + + if (local == NULL) { + local = frame->local = CALLOC (1, sizeof (*local)); + if (local == NULL) { + ret = -ENOMEM; + goto out; + } + local->active = pvt->pref_subvol; + ret = inode_ctx_get (inode, xl, &tmp_state); + if (ret < 0) { + goto out; + } + local->state = (char *)(long)tmp_state; + if (local->active != -1 && local->state[local->active] == 0) + local->active = -1; + for (i = 0; i < pvt->child_count; i++) { + if (local->state[i]) { + if (local->active == -1) + local->active = i; + local->tries++; + } + } + if (local->active == -1) { + ret = -ENOTCONN; + goto out; + } + } + ret = 0; +out: + return ret; +} diff --git a/xlators/cluster/ha/src/ha.c b/xlators/cluster/ha/src/ha.c new file mode 100644 index 000000000..4542bdc7e --- /dev/null +++ b/xlators/cluster/ha/src/ha.c @@ -0,0 +1,3479 @@ +/* + Copyright (c) 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +/* generate errors randomly, code is simple now, better alogorithm + * can be written to decide what error to be returned and when + */ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "xlator.h" +#include "call-stub.h" +#include "defaults.h" +#include "dict.h" +#include "compat-errno.h" +#include "ha.h" + +/* + * TODO: + * - dbench fails if ha over server side afr + * - lock calls - lock on all subvols. + * - support preferred-subvolume option. code already there. + * - do not alloc the call-stub in case only one subvol is up. + */ + +int +ha_forget (xlator_t *this, + inode_t *inode) +{ + uint64_t stateino = 0; + char *state = NULL; + if (!inode_ctx_del (inode, this, &stateino)) { + state = ((char *)(long)stateino); + FREE (state); + } + + return 0; + +} + +int32_t +ha_lookup_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf, + dict_t *dict) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + int child_count = 0, i = 0, callcnt = 0; + char *state = NULL; + call_frame_t *prev_frame = NULL; + xlator_t **children = NULL; + uint64_t tmp_state = 0; + + local = frame->local; + pvt = this->private; + child_count = pvt->child_count; + prev_frame = cookie; + children = pvt->children; + + for (i = 0; i < child_count; i++) { + if (pvt->children[i] == prev_frame->this) + break; + } + if ((op_ret == -1) && (op_errno != ENOENT)) { + gf_log (this->name, GF_LOG_ERROR, "(child=%s) (op_ret=%d op_errno=%s)", + children[i]->name, op_ret, strerror (op_errno)); + } + inode_ctx_get (local->inode, this, &tmp_state); + state = (char *)(long)tmp_state; + + LOCK (&frame->lock); + if (local->revalidate == 1) { + if ((!op_ret) != state[i]) { + local->revalidate_error = 1; + gf_log (this->name, GF_LOG_DEBUG, "revalidate error on %s", + pvt->children[i]->name); + } + } else { + if (op_ret == 0) { + state[i] = 1; + } + } + if (local->op_ret == -1 && op_ret == 0) { + local->op_ret = 0; + local->buf = *buf; + if (dict) + local->dict = dict_ref (dict); + } + if (op_ret == -1 && op_ret != ENOTCONN) + local->op_errno = op_errno; + callcnt = --local->call_count; + UNLOCK (&frame->lock); + + if (callcnt == 0) { + dict_t *ctx = local->dict; + inode_t *inode = local->inode; + if (local->revalidate_error == 1) { + local->op_ret = -1; + local->op_errno = EIO; + gf_log (this->name, GF_LOG_DEBUG, "revalidate error, returning EIO"); + } + STACK_UNWIND (frame, + local->op_ret, + local->op_errno, + inode, + &local->buf, + ctx); + if (inode) + inode_unref (inode); + if (ctx) + dict_unref (ctx); + } + return 0; +} + +int32_t +ha_lookup (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + dict_t *xattr_req) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + int child_count = 0, i = 0; + char *state = NULL; + xlator_t **children = NULL; + int ret = -1; + + local = frame->local; + pvt = this->private; + child_count = pvt->child_count; + children = pvt->children; + + frame->local = local = CALLOC (1, sizeof (*local)); + child_count = pvt->child_count; + local->inode = inode_ref (loc->inode); + + ret = inode_ctx_get (loc->inode, this, NULL); + if (ret) { + state = CALLOC (1, child_count); + inode_ctx_put (loc->inode, this, (uint64_t)(long)state); + } else + local->revalidate = 1; + + local->op_ret = -1; + local->op_errno = ENOTCONN; + local->call_count = child_count; + + for (i = 0; i < child_count; i++) { + STACK_WIND (frame, + ha_lookup_cbk, + children[i], + children[i]->fops->lookup, + loc, + xattr_req); + } + return 0; +} + + int32_t +ha_stat_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + int ret = -1; + + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + + if (ret == 0) { + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + } + return 0; +} + +int32_t +ha_stat (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + ha_local_t *local = NULL; + int op_errno = ENOTCONN; + + op_errno = ha_alloc_init_inode (frame, loc->inode); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_stat_stub (frame, ha_stat, loc); + + STACK_WIND_COOKIE (frame, + ha_stat_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->stat, + loc); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; +} + + int32_t +ha_chmod_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + int ret = -1; + + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + + if (ret == 0) { + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + } + return 0; +} + +int32_t +ha_chmod (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_inode (frame, loc->inode); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_chmod_stub (frame, ha_chmod, loc, mode); + + STACK_WIND_COOKIE (frame, + ha_chmod_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->chmod, + loc, + mode); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; +} + + int32_t +ha_fchmod_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + int ret = -1; + + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + + if (ret == 0) { + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + } + return 0; +} + +int32_t +ha_fchmod (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + mode_t mode) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_fd (frame, fd); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_fchmod_stub (frame, ha_fchmod, fd, mode); + + STACK_WIND_COOKIE (frame, + ha_fchmod_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->fchmod, + fd, + mode); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; +} + + int32_t +ha_chown_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + int ret = -1; + + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + + if (ret == 0) { + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + } + return 0; +} + +int32_t +ha_chown (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + uid_t uid, + gid_t gid) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_inode (frame, loc->inode); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_chown_stub (frame, ha_chown, loc, uid, gid); + + STACK_WIND_COOKIE (frame, + ha_chown_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->chown, + loc, + uid, + gid); + return 0; +err: + STACK_UNWIND (frame, -1, ENOTCONN, NULL); + return 0; +} + + int32_t +ha_fchown_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + int ret = -1; + + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + + if (ret == 0) { + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + } + return 0; +} + +int32_t +ha_fchown (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + uid_t uid, + gid_t gid) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_fd (frame, fd); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_fchown_stub (frame, ha_fchown, fd, uid, gid); + + STACK_WIND_COOKIE (frame, + ha_fchown_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->fchown, + fd, + uid, + gid); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; +} + + int32_t +ha_truncate_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + int ret = -1; + + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + if (ret == 0) { + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + } + return 0; +} + +int32_t +ha_truncate (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + off_t offset) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_inode (frame, loc->inode); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_truncate_stub (frame, ha_truncate, loc, offset); + + STACK_WIND_COOKIE (frame, + ha_truncate_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->truncate, + loc, + offset); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; +} + + int32_t +ha_ftruncate_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + int ret = -1; + + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + + if (ret == 0) { + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + } + return 0; +} + +int32_t +ha_ftruncate (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + off_t offset) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_fd (frame, fd); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_ftruncate_stub (frame, ha_ftruncate, fd, offset); + + STACK_WIND_COOKIE (frame, + ha_ftruncate_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->ftruncate, + fd, + offset); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; +} + +int32_t +ha_utimens_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + int ret = -1; + + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + + if (ret == 0) { + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + } + return 0; +} + +int32_t +ha_utimens (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + struct timespec tv[2]) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_inode (frame, loc->inode); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_utimens_stub (frame, ha_utimens, loc, tv); + + STACK_WIND_COOKIE (frame, + ha_utimens_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->utimens, + loc, + tv); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; +} + +int32_t +ha_access_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + int ret = -1; + + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + + if (ret == 0) { + STACK_UNWIND (frame, + op_ret, + op_errno); + } + return 0; +} + +int32_t +ha_access (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t mask) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_inode (frame, loc->inode); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_access_stub (frame, ha_access, loc, mask); + + STACK_WIND_COOKIE (frame, + ha_access_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->access, + loc, + mask); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno); + return 0; +} + + + int32_t +ha_readlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + const char *path) +{ + int ret = -1; + + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + + if (ret == 0) { + STACK_UNWIND (frame, + op_ret, + op_errno, + path); + } + return 0; +} + +int32_t +ha_readlink (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + size_t size) +{ + ha_local_t *local = frame->local; + int op_errno = 0; + + op_errno = ha_alloc_init_inode (frame, loc->inode); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_readlink_stub (frame, ha_readlink, loc, size); + + STACK_WIND_COOKIE (frame, + ha_readlink_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->readlink, + loc, + size); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; +} + +int +ha_mknod_lookup_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf, + dict_t *dict) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + char *stateino = NULL; + int child_count = 0, i = 0, cnt = 0, ret = 0; + call_frame_t *prev_frame = NULL; + xlator_t **children = NULL; + uint64_t tmp_stateino = 0; + + local = frame->local; + pvt = this->private; + child_count = pvt->child_count; + prev_frame = cookie; + children = pvt->children; + + for (i = 0; i < child_count; i++) + if (prev_frame->this == children[i]) + break; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "(path=%s) (op_ret=%d op_errno=%d)", + local->stub->args.mknod.loc.path, op_ret, op_errno); + } + ret = inode_ctx_get (local->stub->args.mknod.loc.inode, + this, &tmp_stateino); + stateino = (char *)(long)tmp_stateino; + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "unwind(-1), inode_ctx_get() error"); + /* It is difficult to handle this error at this stage + * as we still expect more cbks, we can't return as + * of now + */ + } else if (op_ret == 0) { + stateino[i] = 1; + } + LOCK (&frame->lock); + cnt = --local->call_count; + UNLOCK (&frame->lock); + + if (cnt == 0) { + call_stub_t *stub = local->stub; + FREE (local->state); + STACK_UNWIND (frame, + local->op_ret, + local->op_errno, + local->stub->args.mknod.loc.inode, + &local->buf); + call_stub_destroy (stub); + } + return 0; +} + +int32_t +ha_mknod_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + char *stateino = NULL; + int child_count = 0, i = 0, cnt = 0, ret = 0; + call_frame_t *prev_frame = NULL; + xlator_t **children = NULL; + uint64_t tmp_stateino = 0; + + local = frame->local; + pvt = this->private; + child_count = pvt->child_count; + prev_frame = cookie; + children = pvt->children; + + for (i = 0; i < child_count; i++) + if (prev_frame->this == children[i]) + break; + + if (op_ret == -1) { + local->op_errno = op_errno; + gf_log (this->name, GF_LOG_ERROR, "(path=%s) (op_ret=%d op_errno=%d)", local->stub->args.mknod.loc.path, op_ret, op_errno); + } + + ret = inode_ctx_get (local->stub->args.mknod.loc.inode, + this, &tmp_stateino); + stateino = (char *)(long)tmp_stateino; + + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, "inode_ctx_get() error"); + /* FIXME: handle the case */ + } + if (op_ret == 0) { + stateino[i] = 1; + local->op_ret = 0; + local->first_success = 1; + local->buf = *buf; + } + cnt = --local->call_count; + for (i = local->active + 1; i < child_count; i++) { + if (local->state[i]) + break; + } + + if (cnt == 0 || i == child_count) { + call_stub_t *stub = local->stub; + FREE (local->state); + stub = local->stub; + STACK_UNWIND (frame, local->op_ret, local->op_errno, local->stub->args.mknod.loc.inode, &local->buf); + call_stub_destroy (stub); + return 0; + } + + local->active = i; + + if (local->first_success == 0) { + STACK_WIND (frame, + ha_mknod_cbk, + children[i], + children[i]->fops->mknod, + &local->stub->args.mknod.loc, + local->stub->args.mknod.mode, + local->stub->args.mknod.rdev); + return 0; + } + cnt = local->call_count; + + for (; i < child_count; i++) { + if (local->state[i]) { + STACK_WIND (frame, + ha_mknod_lookup_cbk, + children[i], + children[i]->fops->lookup, + &local->stub->args.mknod.loc, + 0); + if (--cnt == 0) + break; + } + } + return 0; +} + +int32_t +ha_mknod (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode, + dev_t rdev) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + int child_count = 0, i = 0; + char *stateino = NULL; + + local = frame->local; + pvt = this->private; + child_count = pvt->child_count; + + frame->local = local = CALLOC (1, sizeof (*local)); + local->stub = fop_mknod_stub (frame, ha_mknod, loc, mode, rdev); + local->op_ret = -1; + local->op_errno = ENOTCONN; + local->state = CALLOC (1, child_count); + memcpy (local->state, pvt->state, child_count); + local->active = -1; + + stateino = CALLOC (1, child_count); + inode_ctx_put (loc->inode, this, (uint64_t)(long)stateino); + + for (i = 0; i < child_count; i++) { + if (local->state[i]) { + local->call_count++; + if (local->active == -1) + local->active = i; + } + } + + STACK_WIND (frame, + ha_mknod_cbk, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->mknod, + loc, mode, rdev); + return 0; +} + + +int +ha_mkdir_lookup_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf, + dict_t *dict) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + char *stateino = NULL; + int child_count = 0, i = 0, cnt = 0; + call_frame_t *prev_frame = NULL; + xlator_t **children = NULL; + uint64_t tmp_stateino = 0; + + local = frame->local; + pvt = this->private; + child_count = pvt->child_count; + prev_frame = cookie; + children = pvt->children; + + for (i = 0; i < child_count; i++) + if (prev_frame->this == children[i]) + break; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, "(path=%s) (op_ret=%d op_errno=%d)", local->stub->args.mkdir.loc.path, op_ret, op_errno); + } + inode_ctx_get (local->stub->args.mkdir.loc.inode, + this, &tmp_stateino); + stateino = (char *)(long)tmp_stateino; + + if (op_ret == 0) + stateino[i] = 1; + + LOCK (&frame->lock); + cnt = --local->call_count; + UNLOCK (&frame->lock); + + if (cnt == 0) { + call_stub_t *stub = local->stub; + FREE (local->state); + STACK_UNWIND (frame, + local->op_ret, + local->op_errno, + local->stub->args.mkdir.loc.inode, + &local->buf); + call_stub_destroy (stub); + } + return 0; +} + +int32_t +ha_mkdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + char *stateino = NULL; + int child_count = 0, i = 0, cnt = 0; + call_frame_t *prev_frame = NULL; + xlator_t **children = NULL; + uint64_t tmp_stateino = 0; + + local = frame->local; + pvt = this->private; + child_count = pvt->child_count; + prev_frame = cookie; + children = pvt->children; + + for (i = 0; i < child_count; i++) + if (prev_frame->this == children[i]) + break; + + if (op_ret == -1) { + local->op_errno = op_errno; + gf_log (this->name, GF_LOG_ERROR, "(path=%s) (op_ret=%d op_errno=%d)", local->stub->args.mkdir.loc.path, op_ret, op_errno); + } + + inode_ctx_get (local->stub->args.mkdir.loc.inode, + this, &tmp_stateino); + stateino = (char *)(long)tmp_stateino; + + if (op_ret == 0) { + stateino[i] = 1; + local->op_ret = 0; + local->first_success = 1; + local->buf = *buf; + } + cnt = --local->call_count; + for (i = local->active + 1; i < child_count; i++) { + if (local->state[i]) + break; + } + + if (cnt == 0 || i == child_count) { + call_stub_t *stub = local->stub; + FREE (local->state); + stub = local->stub; + STACK_UNWIND (frame, local->op_ret, local->op_errno, local->stub->args.mkdir.loc.inode, &local->buf); + call_stub_destroy (stub); + return 0; + } + + local->active = i; + + if (local->first_success == 0) { + STACK_WIND (frame, + ha_mkdir_cbk, + children[i], + children[i]->fops->mkdir, + &local->stub->args.mkdir.loc, + local->stub->args.mkdir.mode); + return 0; + } + cnt = local->call_count; + + for (; i < child_count; i++) { + if (local->state[i]) { + STACK_WIND (frame, + ha_mkdir_lookup_cbk, + children[i], + children[i]->fops->lookup, + &local->stub->args.mkdir.loc, + 0); + if (--cnt == 0) + break; + } + } + return 0; +} + +int32_t +ha_mkdir (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + int child_count = 0, i = 0; + char *stateino = NULL; + + local = frame->local; + pvt = this->private; + child_count = pvt->child_count; + + frame->local = local = CALLOC (1, sizeof (*local)); + local->stub = fop_mkdir_stub (frame, ha_mkdir, loc, mode); + local->op_ret = -1; + local->op_errno = ENOTCONN; + local->state = CALLOC (1, child_count); + memcpy (local->state, pvt->state, child_count); + local->active = -1; + + stateino = CALLOC (1, child_count); + inode_ctx_put (loc->inode, this, (uint64_t)(long)stateino); + for (i = 0; i < child_count; i++) { + if (local->state[i]) { + local->call_count++; + if (local->active == -1) + local->active = i; + } + } + + STACK_WIND (frame, + ha_mkdir_cbk, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->mkdir, + loc, mode); + return 0; +} + + int32_t +ha_unlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + int ret = -1; + + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + if (ret == 0) { + STACK_UNWIND (frame, op_ret, op_errno); + } + return 0; +} + +int32_t +ha_unlink (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_inode (frame, loc->inode); + + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_unlink_stub (frame, ha_unlink, loc); + + STACK_WIND_COOKIE (frame, + ha_unlink_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->unlink, + loc); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno); + return 0; +} + + int32_t +ha_rmdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + int ret = -1; + + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + + if (ret == 0) { + STACK_UNWIND (frame, + op_ret, + op_errno); + } + return 0; +} + +int32_t +ha_rmdir (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + ha_local_t *local = frame->local; + int op_errno = 0; + + op_errno = ha_alloc_init_inode (frame, loc->inode); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_rmdir_stub (frame, ha_rmdir, loc); + + STACK_WIND_COOKIE (frame, + ha_rmdir_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->rmdir, + loc); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno); + return 0; +} + + +int +ha_symlink_lookup_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf, + dict_t *dict) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + char *stateino = NULL; + int child_count = 0, i = 0, cnt = 0; + call_frame_t *prev_frame = NULL; + xlator_t **children = NULL; + uint64_t tmp_stateino = 0; + + local = frame->local; + pvt = this->private; + child_count = pvt->child_count; + prev_frame = cookie; + children = pvt->children; + + for (i = 0; i < child_count; i++) + if (prev_frame->this == children[i]) + break; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, "(path=%s) (op_ret=%d op_errno=%d)", local->stub->args.symlink.loc.path, op_ret, op_errno); + } + inode_ctx_get (local->stub->args.symlink.loc.inode, + this, &tmp_stateino); + stateino = (char *)(long)tmp_stateino; + + if (op_ret == 0) + stateino[i] = 1; + + LOCK (&frame->lock); + cnt = --local->call_count; + UNLOCK (&frame->lock); + + if (cnt == 0) { + call_stub_t *stub = local->stub; + FREE (local->state); + STACK_UNWIND (frame, + local->op_ret, + local->op_errno, + local->stub->args.symlink.loc.inode, + &local->buf); + call_stub_destroy (stub); + } + return 0; +} + +int32_t +ha_symlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + char *stateino = NULL; + int child_count = 0, i = 0, cnt = 0; + call_frame_t *prev_frame = NULL; + xlator_t **children = NULL; + uint64_t tmp_stateino = 0; + + local = frame->local; + pvt = this->private; + child_count = pvt->child_count; + prev_frame = cookie; + children = pvt->children; + + for (i = 0; i < child_count; i++) + if (prev_frame->this == children[i]) + break; + + if (op_ret == -1) { + local->op_errno = op_errno; + gf_log (this->name, GF_LOG_ERROR, "(path=%s) (op_ret=%d op_errno=%d)", local->stub->args.symlink.loc.path, op_ret, op_errno); + } + inode_ctx_get (local->stub->args.symlink.loc.inode, + this, &tmp_stateino); + stateino = (char *)(long)tmp_stateino; + + if (op_ret == 0) { + stateino[i] = 1; + local->op_ret = 0; + local->first_success = 1; + local->buf = *buf; + } + cnt = --local->call_count; + for (i = local->active + 1; i < child_count; i++) { + if (local->state[i]) + break; + } + + if (cnt == 0 || i == child_count) { + call_stub_t *stub = local->stub; + FREE (local->state); + stub = local->stub; + STACK_UNWIND (frame, local->op_ret, local->op_errno, + local->stub->args.symlink.loc.inode, &local->buf); + call_stub_destroy (stub); + return 0; + } + + local->active = i; + + if (local->first_success == 0) { + STACK_WIND (frame, + ha_symlink_cbk, + children[i], + children[i]->fops->symlink, + local->stub->args.symlink.linkname, + &local->stub->args.symlink.loc); + return 0; + } + cnt = local->call_count; + + for (; i < child_count; i++) { + if (local->state[i]) { + STACK_WIND (frame, + ha_symlink_lookup_cbk, + children[i], + children[i]->fops->lookup, + &local->stub->args.symlink.loc, + 0); + if (--cnt == 0) + break; + } + } + return 0; +} + +int32_t +ha_symlink (call_frame_t *frame, + xlator_t *this, + const char *linkname, + loc_t *loc) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + int child_count = 0, i = 0; + char *stateino = NULL; + + local = frame->local; + pvt = this->private; + child_count = pvt->child_count; + + frame->local = local = CALLOC (1, sizeof (*local)); + local->stub = fop_symlink_stub (frame, ha_symlink, linkname, loc); + local->op_ret = -1; + local->op_errno = ENOTCONN; + local->state = CALLOC (1, child_count); + memcpy (local->state, pvt->state, child_count); + local->active = -1; + + stateino = CALLOC (1, child_count); + inode_ctx_put (loc->inode, this, (uint64_t)(long)stateino); + + for (i = 0; i < child_count; i++) { + if (local->state[i]) { + local->call_count++; + if (local->active == -1) { + local->active = i; + } + } + } + + STACK_WIND (frame, + ha_symlink_cbk, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->symlink, + linkname, loc); + return 0; +} + + int32_t +ha_rename_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + int ret = -1; + + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + + if (ret == 0) { + STACK_UNWIND (frame, op_ret, op_errno, buf); + } + return 0; +} + +int32_t +ha_rename (call_frame_t *frame, + xlator_t *this, + loc_t *oldloc, + loc_t *newloc) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_inode (frame, oldloc->inode); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_rename_stub (frame, ha_rename, oldloc, newloc); + STACK_WIND_COOKIE (frame, + ha_rename_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->rename, + oldloc, newloc); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; +} + +int +ha_link_lookup_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf, + dict_t *dict) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + char *stateino = NULL; + int child_count = 0, i = 0, cnt = 0; + call_frame_t *prev_frame = NULL; + xlator_t **children = NULL; + uint64_t tmp_stateino = 0; + + local = frame->local; + pvt = this->private; + child_count = pvt->child_count; + prev_frame = cookie; + children = pvt->children; + + for (i = 0; i < child_count; i++) + if (prev_frame->this == children[i]) + break; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, "(path=%s) (op_ret=%d op_errno=%d)", local->stub->args.link.newloc.path, op_ret, op_errno); + } + inode_ctx_get (local->stub->args.link.newloc.inode, + this, &tmp_stateino); + stateino = (char *)(long)tmp_stateino; + + if (op_ret == 0) + stateino[i] = 1; + + LOCK (&frame->lock); + cnt = --local->call_count; + UNLOCK (&frame->lock); + + if (cnt == 0) { + call_stub_t *stub = local->stub; + FREE (local->state); + STACK_UNWIND (frame, + local->op_ret, + local->op_errno, + local->stub->args.link.oldloc.inode, + &local->buf); + call_stub_destroy (stub); + } + return 0; +} + +int32_t +ha_link_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + char *stateino = NULL; + int child_count = 0, i = 0, cnt = 0; + call_frame_t *prev_frame = NULL; + xlator_t **children = NULL; + uint64_t tmp_stateino = 0; + + local = frame->local; + pvt = this->private; + child_count = pvt->child_count; + prev_frame = cookie; + children = pvt->children; + + for (i = 0; i < child_count; i++) + if (prev_frame->this == children[i]) + break; + + if (op_ret == -1) { + local->op_errno = op_errno; + gf_log (this->name, GF_LOG_ERROR, "(path=%s) (op_ret=%d op_errno=%d)", local->stub->args.link.newloc.path, op_ret, op_errno); + } + inode_ctx_get (local->stub->args.link.newloc.inode, + this, &tmp_stateino); + stateino = (char *)(long)tmp_stateino; + + if (op_ret == 0) { + stateino[i] = 1; + local->op_ret = 0; + local->first_success = 1; + local->buf = *buf; + } + cnt = --local->call_count; + for (i = local->active + 1; i < child_count; i++) { + if (local->state[i]) + break; + } + + if (cnt == 0 || i == child_count) { + call_stub_t *stub = local->stub; + FREE (local->state); + stub = local->stub; + STACK_UNWIND (frame, local->op_ret, local->op_errno, local->stub->args.link.oldloc.inode, &local->buf); + call_stub_destroy (stub); + return 0; + } + + local->active = i; + + if (local->first_success == 0) { + STACK_WIND (frame, + ha_link_cbk, + children[i], + children[i]->fops->link, + &local->stub->args.link.oldloc, + &local->stub->args.link.newloc); + return 0; + } + cnt = local->call_count; + + for (; i < child_count; i++) { + if (local->state[i]) { + STACK_WIND (frame, + ha_link_lookup_cbk, + children[i], + children[i]->fops->lookup, + &local->stub->args.link.newloc, + 0); + if (--cnt == 0) + break; + } + } + return 0; +} + +int32_t +ha_link (call_frame_t *frame, + xlator_t *this, + loc_t *oldloc, + loc_t *newloc) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + int child_count = 0, i = 0; + char *stateino = NULL; + int32_t ret = 0; + uint64_t tmp_stateino = 0; + + ret = inode_ctx_get (newloc->inode, this, &tmp_stateino); + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, "dict_ptr_error()"); + } + stateino = (char *)(long)tmp_stateino; + + if (stateino == NULL) { + gf_log (this->name, GF_LOG_ERROR, + "newloc->inode's ctx is NULL, returning EINVAL"); + STACK_UNWIND (frame, -1, EINVAL, oldloc->inode, NULL); + return 0; + } + + local = frame->local; + pvt = this->private; + child_count = pvt->child_count; + + frame->local = local = CALLOC (1, sizeof (*local)); + local->stub = fop_link_stub (frame, ha_link, oldloc, newloc); + local->op_ret = -1; + local->op_errno = ENOTCONN; + local->state = CALLOC (1, child_count); + memcpy (local->state, pvt->state, child_count); + local->active = -1; + + for (i = 0; i < child_count; i++) { + if (local->state[i]) { + local->call_count++; + if (local->active == -1) + local->active = i; + } + } + + STACK_WIND (frame, + ha_link_cbk, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->link, + oldloc, + newloc); + return 0; +} + +int32_t +ha_create_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd, + inode_t *inode, + struct stat *buf) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + int i, child_count = 0, cnt = 0, ret = 0; + char *stateino = NULL; + hafd_t *hafdp = NULL; + call_frame_t *prev_frame = NULL; + xlator_t **children = NULL; + uint64_t tmp_stateino = 0; + uint64_t tmp_hafdp = 0; + + local = frame->local; + pvt = this->private; + child_count = pvt->child_count; + prev_frame = cookie; + children = pvt->children; + + ret = inode_ctx_get (local->stub->args.create.loc.inode, + this, &tmp_stateino); + stateino = (char *)(long)tmp_stateino; + + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, "dict_to_ptr() error"); + /* FIXME: handle */ + } + ret = fd_ctx_get (local->stub->args.create.fd, this, &tmp_hafdp); + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, "dict_to_ptr() error"); + /* FIXME: handle */ + } + hafdp = (hafd_t *)(long)tmp_hafdp; + + for (i = 0; i < child_count; i++) { + if (prev_frame->this == children[i]) + break; + } + + if (op_ret == -1) { + local->op_errno = op_errno; + gf_log (this->name, GF_LOG_ERROR, "(path=%s) (op_ret=%d op_errno=%d)", local->stub->args.create.loc.path, op_ret, op_errno); + } + if (op_ret != -1) { + stateino[i] = 1; + hafdp->fdstate[i] = 1; + if (local->op_ret == -1) { + local->op_ret = 0; + local->buf = *buf; + local->first_success = 1; + } + local->stub->args.create.flags &= (~O_EXCL); + } + LOCK (&frame->lock); + cnt = --local->call_count; + UNLOCK (&frame->lock); + + for (i = local->active + 1; i < child_count; i++) { + if (local->state[i]) + break; + } + + if (cnt == 0 || i == child_count) { + char *state = local->state; + call_stub_t *stub = local->stub; + STACK_UNWIND (frame, local->op_ret, local->op_errno, + stub->args.create.fd, + stub->args.create.loc.inode, &local->buf); + FREE (state); + call_stub_destroy (stub); + return 0; + } + local->active = i; + cnt = local->call_count; + for (; i < child_count; i++) { + if (local->state[i]) { + STACK_WIND (frame, + ha_create_cbk, + children[i], + children[i]->fops->create, + &local->stub->args.create.loc, + local->stub->args.create.flags, + local->stub->args.create.mode, + local->stub->args.create.fd); + if ((local->first_success == 0) || (cnt == 0)) + break; + } + } + return 0; +} + +int32_t +ha_create (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flags, + mode_t mode, fd_t *fd) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + int i, child_count = 0; + char *stateino = NULL; + xlator_t **children = NULL; + hafd_t *hafdp = NULL; + + local = frame->local; + pvt = this->private; + child_count = pvt->child_count; + children = pvt->children; + + if (local == NULL) { + local = frame->local = CALLOC (1, sizeof (*local)); + local->stub = fop_create_stub (frame, ha_create, loc, flags, mode, fd); + local->state = CALLOC (1, child_count); + local->active = -1; + local->op_ret = -1; + local->op_errno = ENOTCONN; + memcpy (local->state, pvt->state, child_count); + + for (i = 0; i < pvt->child_count; i++) { + if (local->state[i]) { + local->call_count++; + if (local->active == -1) + local->active = i; + } + } + /* FIXME handle active -1 */ + stateino = CALLOC (1, child_count); + hafdp = CALLOC (1, sizeof (*hafdp)); + hafdp->fdstate = CALLOC (1, child_count); + hafdp->path = strdup(loc->path); + LOCK_INIT (&hafdp->lock); + fd_ctx_set (fd, this, (uint64_t)(long)hafdp); + inode_ctx_put (loc->inode, this, (uint64_t)(long)stateino); + } + + STACK_WIND (frame, + ha_create_cbk, + children[local->active], + children[local->active]->fops->create, + loc, flags, mode, fd); + return 0; +} + + int32_t +ha_open_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + xlator_t **children = NULL; + int i = 0, child_count = 0, callcnt = 0, ret = 0; + call_frame_t *prev_frame = NULL; + hafd_t *hafdp = NULL; + uint64_t tmp_hafdp = 0; + + local = frame->local; + pvt = this->private; + children = pvt->children; + child_count = pvt->child_count; + prev_frame = cookie; + + ret = fd_ctx_get (local->fd, this, &tmp_hafdp); + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, "dict_ptr_error()"); + } + hafdp = (hafd_t *)(long)tmp_hafdp; + + for (i = 0; i < child_count; i++) + if (children[i] == prev_frame->this) + break; + LOCK (&frame->lock); + if (op_ret != -1) { + hafdp->fdstate[i] = 1; + local->op_ret = 0; + } + if (op_ret == -1 && op_errno != ENOTCONN) + local->op_errno = op_errno; + callcnt = --local->call_count; + UNLOCK (&frame->lock); + + if (callcnt == 0) { + STACK_UNWIND (frame, + local->op_ret, + local->op_errno, + local->fd); + } + return 0; +} + +int32_t +ha_open (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flags, fd_t *fd) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + char *stateino = NULL; + xlator_t **children = NULL; + int cnt = 0, i, child_count = 0, ret = 0; + hafd_t *hafdp = NULL; + uint64_t tmp_stateino = 0; + + local = frame->local; + pvt = this->private; + children = pvt->children; + child_count = pvt->child_count; + + + local = frame->local = CALLOC (1, sizeof (*local)); + local->op_ret = -1; + local->op_errno = ENOTCONN; + local->fd = fd; + + hafdp = CALLOC (1, sizeof (*hafdp)); + hafdp->fdstate = CALLOC (1, child_count); + hafdp->path = strdup (loc->path); + hafdp->active = -1; + if (pvt->pref_subvol == -1) { + hafdp->active = fd->inode->ino % child_count; + } + + LOCK_INIT (&hafdp->lock); + fd_ctx_set (fd, this, (uint64_t)(long)hafdp); + ret = inode_ctx_get (loc->inode, this, &tmp_stateino); + stateino = (char *)(long)tmp_stateino; + + for (i = 0; i < child_count; i++) + if (stateino[i]) + cnt++; + local->call_count = cnt; + for (i = 0; i < child_count; i++) { + if (stateino[i]) { + STACK_WIND (frame, + ha_open_cbk, + children[i], + children[i]->fops->open, + loc, flags, fd); + if (--cnt == 0) + break; + } + } + return 0; +} + + int32_t +ha_readv_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct iovec *vector, + int32_t count, + struct stat *stbuf) +{ + int ret = 0; + + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + + if (ret == 0) { + STACK_UNWIND (frame, + op_ret, + op_errno, + vector, + count, + stbuf); + } + return 0; +} + +int32_t +ha_readv (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, + off_t offset) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_fd (frame, fd); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_readv_stub (frame, ha_readv, fd, size, offset); + + STACK_WIND_COOKIE (frame, + ha_readv_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->readv, + fd, + size, + offset); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; +} + + int32_t +ha_writev_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *stbuf) +{ + int ret = 0; + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + + if (ret == 0) { + STACK_UNWIND (frame, + op_ret, + op_errno, + stbuf); + } + return 0; +} + +int32_t +ha_writev (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + struct iovec *vector, + int32_t count, + off_t off) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_fd (frame, fd); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_writev_stub (frame, ha_writev, fd, vector, count, off); + + STACK_WIND_COOKIE (frame, + ha_writev_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->writev, + fd, + vector, + count, + off); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; +} + + int32_t +ha_flush_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + int ret = 0; + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + + if (ret == 0) { + STACK_UNWIND (frame, + op_ret, + op_errno); + } + return 0; +} + +int32_t +ha_flush (call_frame_t *frame, + xlator_t *this, + fd_t *fd) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_fd (frame, fd); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_flush_stub (frame, ha_flush, fd); + STACK_WIND_COOKIE (frame, + ha_flush_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->flush, + fd); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno); + return 0; +} + + + int32_t +ha_fsync_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + int ret = 0; + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + + if (ret == 0) { + STACK_UNWIND (frame, + op_ret, + op_errno); + } + return 0; +} + +int32_t +ha_fsync (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t flags) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_fd (frame, fd); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_fsync_stub (frame, ha_fsync, fd, flags); + STACK_WIND_COOKIE (frame, + ha_fsync_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->fsync, + fd, + flags); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno); + return 0; +} + + int32_t +ha_fstat_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + int ret = 0; + + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + + if (ret == 0) { + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + } + return 0; +} + +int32_t +ha_fstat (call_frame_t *frame, + xlator_t *this, + fd_t *fd) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_fd (frame, fd); + + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_fstat_stub (frame, ha_fstat, fd); + STACK_WIND_COOKIE (frame, + ha_fstat_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->fstat, + fd); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; +} + +int32_t +ha_opendir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + xlator_t **children = NULL; + int i = 0, child_count = 0, callcnt = 0, ret = 0; + call_frame_t *prev_frame = NULL; + hafd_t *hafdp = NULL; + uint64_t tmp_hafdp = 0; + + local = frame->local; + pvt = this->private; + children = pvt->children; + child_count = pvt->child_count; + prev_frame = cookie; + + ret = fd_ctx_get (local->fd, this, &tmp_hafdp); + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, "dict_ptr_error()"); + } + hafdp = (hafd_t *)(long)tmp_hafdp; + + for (i = 0; i < child_count; i++) + if (children[i] == prev_frame->this) + break; + LOCK (&frame->lock); + if (op_ret != -1) { + hafdp->fdstate[i] = 1; + local->op_ret = 0; + } + if (op_ret == -1 && op_errno != ENOTCONN) + local->op_errno = op_errno; + callcnt = --local->call_count; + UNLOCK (&frame->lock); + + if (callcnt == 0) { + STACK_UNWIND (frame, + local->op_ret, + local->op_errno, + local->fd); + } + return 0; +} + +int32_t +ha_opendir (call_frame_t *frame, + xlator_t *this, + loc_t *loc, fd_t *fd) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + char *stateino = NULL; + xlator_t **children = NULL; + int cnt = 0, i, child_count = 0, ret = 0; + hafd_t *hafdp = NULL; + uint64_t tmp_stateino = 0; + + local = frame->local; + pvt = this->private; + children = pvt->children; + child_count = pvt->child_count; + + local = frame->local = CALLOC (1, sizeof (*local)); + local->op_ret = -1; + local->op_errno = ENOTCONN; + local->fd = fd; + + hafdp = CALLOC (1, sizeof (*hafdp)); + hafdp->fdstate = CALLOC (1, child_count); + hafdp->path = strdup (loc->path); + LOCK_INIT (&hafdp->lock); + fd_ctx_set (fd, this, (uint64_t)(long)hafdp); + ret = inode_ctx_get (loc->inode, this, &tmp_stateino); + stateino = (char *)(long)tmp_stateino; + + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, "inode_ctx_get() error"); + } + for (i = 0; i < child_count; i++) + if (stateino[i]) + cnt++; + local->call_count = cnt; + for (i = 0; i < child_count; i++) { + if (stateino[i]) { + STACK_WIND (frame, + ha_opendir_cbk, + children[i], + children[i]->fops->opendir, + loc, fd); + if (--cnt == 0) + break; + } + } + return 0; +} + + int32_t +ha_getdents_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dir_entry_t *entries, + int32_t count) +{ + int ret = 0; + + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + if (ret == 0) { + STACK_UNWIND (frame, + op_ret, + op_errno, + entries, + count); + } + return 0; +} + +int32_t +ha_getdents (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, + off_t offset, + int32_t flag) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_fd (frame, fd); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_getdents_stub (frame, ha_getdents, fd, size, offset, flag); + STACK_WIND_COOKIE (frame, + ha_getdents_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->getdents, + fd, + size, + offset, + flag); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno, NULL, 0); + return 0; +} + + int32_t +ha_setdents_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + int ret = 0; + + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + + if (ret == 0) { + STACK_UNWIND (frame, + op_ret, + op_errno); + } + return 0; +} + +int32_t +ha_setdents (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t flags, + dir_entry_t *entries, + int32_t count) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_fd (frame, fd); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + + local->stub = fop_setdents_stub (frame, ha_setdents, fd, flags, entries, count); + + STACK_WIND_COOKIE (frame, + ha_setdents_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->setdents, + fd, + flags, + entries, + count); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno); + return 0; +} + + int32_t +ha_fsyncdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + int ret = 0; + + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + if (ret == 0) { + STACK_UNWIND (frame, + op_ret, + op_errno); + } + return 0; +} + +int32_t +ha_fsyncdir (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t flags) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_fd (frame, fd); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_fsyncdir_stub (frame, ha_fsyncdir, fd, flags); + STACK_WIND_COOKIE (frame, + ha_fsyncdir_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->fsyncdir, + fd, + flags); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno); + return 0; +} + + + int32_t +ha_statfs_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct statvfs *buf) +{ + int ret = -1; + + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + if (ret == 0) { + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + } + return 0; +} + +int32_t +ha_statfs (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_inode (frame, loc->inode); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + + local->stub = fop_statfs_stub (frame, ha_statfs, loc); + STACK_WIND_COOKIE (frame, + ha_statfs_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->statfs, + loc); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; +} + + int32_t +ha_setxattr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + int ret = -1; + + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + + if (ret == 0) { + STACK_UNWIND (frame, + op_ret, + op_errno); + } + return 0; +} + +int32_t +ha_setxattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + dict_t *dict, + int32_t flags) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_inode (frame, loc->inode); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_setxattr_stub (frame, ha_setxattr, loc, dict, flags); + STACK_WIND_COOKIE (frame, + ha_setxattr_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->setxattr, + loc, + dict, + flags); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno); + return 0; +} + + int32_t +ha_getxattr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *dict) +{ + int ret = -1; + + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + + if (ret == 0) { + STACK_UNWIND (frame, + op_ret, + op_errno, + dict); + } + return 0; +} + +int32_t +ha_getxattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + const char *name) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_inode (frame, loc->inode); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_getxattr_stub (frame, ha_getxattr, loc, name); + STACK_WIND_COOKIE (frame, + ha_getxattr_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->getxattr, + loc, + name); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; +} + +int32_t +ha_xattrop_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *dict) +{ + int ret = -1; + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + if (ret == 0) { + STACK_UNWIND (frame, op_ret, op_errno, dict); + } + return 0; +} + + +int32_t +ha_xattrop (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + gf_xattrop_flags_t flags, + dict_t *dict) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_inode (frame, loc->inode); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + + local->stub = fop_xattrop_stub (frame, ha_xattrop, loc, flags, dict); + + STACK_WIND_COOKIE (frame, + ha_xattrop_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->xattrop, + loc, + flags, + dict); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno, dict); + return 0; +} + +int32_t +ha_fxattrop_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *dict) +{ + int ret = -1; + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + if (ret == 0) + STACK_UNWIND (frame, op_ret, op_errno, dict); + return 0; +} + +int32_t +ha_fxattrop (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + gf_xattrop_flags_t flags, + dict_t *dict) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_fd (frame, fd); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_fxattrop_stub (frame, ha_fxattrop, fd, flags, dict); + + STACK_WIND_COOKIE (frame, + ha_fxattrop_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->fxattrop, + fd, + flags, + dict); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno, dict); + return 0; +} + + int32_t +ha_removexattr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + int ret = -1; + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + if (ret == 0) { + STACK_UNWIND (frame, + op_ret, + op_errno); + } + return 0; +} + +int32_t +ha_removexattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + const char *name) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_inode (frame, loc->inode); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + + local->stub = fop_removexattr_stub (frame, ha_removexattr, loc, name); + + STACK_WIND_COOKIE (frame, + ha_removexattr_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->removexattr, + loc, + name); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno); + return 0; +} + +int32_t +ha_lk_setlk_unlck_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct flock *lock) +{ + ha_local_t *local = NULL; + int cnt = 0; + call_stub_t *stub = NULL; + + local = frame->local; + + LOCK (&frame->lock); + cnt = --local->call_count; + if (op_ret == 0) + local->op_ret = 0; + UNLOCK (&frame->lock); + + if (cnt == 0) { + stub = local->stub; + FREE (local->state); + if (stub->args.lk.lock.l_type == F_UNLCK) { + STACK_UNWIND (frame, local->op_ret, local->op_errno, &stub->args.lk.lock); + } else { + STACK_UNWIND (frame, -1, EIO, NULL); + } + call_stub_destroy (stub); + } + return 0; +} + +int32_t +ha_lk_setlk_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct flock *lock) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + xlator_t **children = NULL; + int i = 0, cnt = 0, j = 0; + int child_count = 0; + call_frame_t *prev_frame = NULL; + char *state = NULL; + + local = frame->local; + pvt = this->private; + children = pvt->children; + child_count = pvt->child_count; + prev_frame = cookie; + state = local->state; + + if (op_ret == 0) + local->op_ret = 0; + + if ((op_ret == 0) || (op_ret == -1 && op_errno == ENOTCONN)) { + for (i = 0; i < child_count; i++) { + if (prev_frame->this == cookie) + break; + } + i++; + for (; i < child_count; i++) { + if (local->state[i]) + break; + } + if (i == child_count) { + call_stub_t *stub = local->stub; + FREE (local->state); + STACK_UNWIND (frame, 0, op_errno, &stub->args.lk.lock); + call_stub_destroy (stub); + return 0; + } + STACK_WIND (frame, + ha_lk_setlk_cbk, + children[i], + children[i]->fops->lk, + local->stub->args.lk.fd, + local->stub->args.lk.cmd, + &local->stub->args.lk.lock); + return 0; + } else { + for (i = 0; i < child_count; i++) { + if (prev_frame->this == cookie) + break; + } + cnt = 0; + for (j = 0; j < i; j++) { + if (state[i]) + cnt++; + } + if (cnt) { + struct flock lock; + lock = local->stub->args.lk.lock; + for (i = 0; i < child_count; i++) { + if (state[i]) { + STACK_WIND (frame, + ha_lk_setlk_unlck_cbk, + children[i], + children[i]->fops->lk, + local->stub->args.lk.fd, + local->stub->args.lk.cmd, + &lock); + if (--cnt == 0) + break; + } + } + return 0; + } else { + FREE (local->state); + call_stub_destroy (local->stub); + STACK_UNWIND (frame, + op_ret, + op_errno, + lock); + return 0; + } + } +} + +int32_t +ha_lk_getlk_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct flock *lock) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + fd_t *fd = NULL; + int child_count = 0, i = 0; + xlator_t **children = NULL; + call_frame_t *prev_frame = NULL; + + local = frame->local; + pvt = this->private; + fd = local->stub->args.lk.fd; + child_count = pvt->child_count; + children = pvt->children; + prev_frame = cookie; + + if (op_ret == 0) { + FREE (local->state); + call_stub_destroy (local->stub); + STACK_UNWIND (frame, 0, 0, lock); + return 0; + } + + for (i = 0; i < child_count; i++) { + if (prev_frame->this == children[i]) + break; + } + + for (; i < child_count; i++) { + if (local->state[i]) + break; + } + + if (i == child_count) { + FREE (local->state); + call_stub_destroy (local->stub); + STACK_UNWIND (frame, op_ret, op_errno, lock); + return 0; + } + + STACK_WIND (frame, + ha_lk_getlk_cbk, + children[i], + children[i]->fops->lk, + fd, + local->stub->args.lk.cmd, + &local->stub->args.lk.lock); + return 0; +} + +int32_t +ha_lk (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t cmd, + struct flock *lock) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + hafd_t *hafdp = NULL; + char *state = NULL; + int child_count = 0, i = 0, cnt = 0, ret = 0; + xlator_t **children = NULL; + uint64_t tmp_hafdp = 0; + + local = frame->local; + pvt = this->private; + child_count = pvt->child_count; + children = pvt->children; + ret = fd_ctx_get (fd, this, &tmp_hafdp); + if (ret < 0) + gf_log (this->name, GF_LOG_ERROR, "fd_ctx_get failed"); + + if (local == NULL) { + local = frame->local = CALLOC (1, sizeof (*local)); + local->active = -1; + local->op_ret = -1; + local->op_errno = ENOTCONN; + } + hafdp = (hafd_t *)(long)tmp_hafdp; + + if (local->active == -1) { + STACK_UNWIND (frame, -1, ENOTCONN, NULL); + return 0; + } + + local->stub = fop_lk_stub (frame, ha_lk, fd, cmd, lock); + local->state = CALLOC (1, child_count); + state = hafdp->fdstate; + LOCK (&hafdp->lock); + memcpy (local->state, state, child_count); + UNLOCK (&hafdp->lock); + if (cmd == F_GETLK) { + for (i = 0; i < child_count; i++) { + if (local->state[i]) + break; + } + STACK_WIND (frame, + ha_lk_getlk_cbk, + children[i], + children[i]->fops->lk, + fd, + cmd, + lock); + } else if (cmd == F_SETLK && lock->l_type == F_UNLCK) { + for (i = 0; i < child_count; i++) { + if (local->state[i]) + local->call_count++; + } + cnt = local->call_count; + for (i = 0; i < child_count; i++) { + if (local->state[i]) { + STACK_WIND (frame, + ha_lk_setlk_unlck_cbk, + children[i], + children[i]->fops->lk, + fd, cmd, lock); + if (--cnt == 0) + break; + } + } + } else { + for (i = 0; i < child_count; i++) { + if (local->state[i]) + break; + } + STACK_WIND (frame, + ha_lk_setlk_cbk, + children[i], + children[i]->fops->lk, + fd, + cmd, + lock); + } + return 0; +} + + int32_t +ha_inode_entry_lk_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + int ret = -1; + + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + if (ret == 0) { + STACK_UNWIND (frame, + op_ret, + op_errno); + } + return 0; +} + +int32_t +ha_inodelk (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t cmd, + struct flock *lock) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_inode (frame, loc->inode); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_inodelk_stub (frame, ha_inodelk, loc, cmd, lock); + STACK_WIND_COOKIE (frame, + ha_inode_entry_lk_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->inodelk, + loc, + cmd, + lock); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno); + return 0; +} + +int32_t +ha_entrylk (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + const char *basename, + entrylk_cmd cmd, + entrylk_type type) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_inode (frame, loc->inode); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_entrylk_stub (frame, ha_entrylk, loc, basename, cmd, type); + STACK_WIND_COOKIE (frame, + ha_inode_entry_lk_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->entrylk, + loc, basename, cmd, type); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno); + return 0; +} + + int32_t +ha_checksum_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + uint8_t *file_checksum, + uint8_t *dir_checksum) +{ + int ret = -1; + + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + if (ret == 0) { + STACK_UNWIND (frame, + op_ret, + op_errno, + file_checksum, + dir_checksum); + } + return 0; +} + +int32_t +ha_checksum (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flag) +{ + int op_errno = 0; + ha_local_t *local = NULL; + + op_errno = ha_alloc_init_inode (frame, loc->inode); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_checksum_stub (frame, ha_checksum, loc, flag); + + STACK_WIND_COOKIE (frame, + ha_checksum_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->checksum, + loc, + flag); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + return 0; +} + +int32_t +ha_readdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + gf_dirent_t *entries) +{ + int ret = 0; + + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + if (ret == 0) + STACK_UNWIND (frame, op_ret, op_errno, entries); + return 0; +} + +int32_t +ha_readdir (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, + off_t off) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_fd (frame, fd); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_readdir_stub (frame, ha_readdir, fd, size, off); + STACK_WIND_COOKIE (frame, + ha_readdir_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->readdir, + fd, size, off); + return 0; +err: + STACK_UNWIND (frame, -1, ENOTCONN, NULL); + return 0; +} + +/* Management operations */ + + int32_t +ha_stats_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct xlator_stats *stats) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + call_frame_t *prev_frame = NULL; + xlator_t **children = NULL; + int i = 0; + + local = frame->local; + pvt = this->private; + prev_frame = cookie; + children = pvt->children; + + if (op_ret == -1 && op_errno == ENOTCONN) { + for (i = 0; i < pvt->child_count; i++) { + if (prev_frame->this == children[i]) + break; + } + i++; + for (; i < pvt->child_count; i++) { + if (pvt->state[i]) + break; + } + + if (i == pvt->child_count) { + STACK_UNWIND (frame, -1, ENOTCONN, NULL); + return 0; + } + STACK_WIND (frame, + ha_stats_cbk, + children[i], + children[i]->mops->stats, + local->flags); + return 0; + } + + STACK_UNWIND (frame, + op_ret, + op_errno, + stats); + return 0; +} + +int32_t +ha_stats (call_frame_t *frame, + xlator_t *this, + int32_t flags) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + xlator_t **children = NULL; + int i = 0; + + local = frame->local = CALLOC (1, sizeof (*local)); + pvt = this->private; + children = pvt->children; + for (i = 0; i < pvt->child_count; i++) { + if (pvt->state[i]) + break; + } + + if (i == pvt->child_count) { + STACK_UNWIND (frame, -1, ENOTCONN, NULL); + return 0; + } + local->flags = flags; + + STACK_WIND (frame, + ha_stats_cbk, + children[i], + children[i]->mops->stats, + flags); + return 0; +} + + +int32_t +ha_getspec_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + char *spec_data) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + call_frame_t *prev_frame = NULL; + xlator_t **children = NULL; + int i = 0; + + local = frame->local; + pvt = this->private; + prev_frame = cookie; + children = pvt->children; + + if (op_ret == -1 && op_errno == ENOTCONN) { + for (i = 0; i < pvt->child_count; i++) { + if (prev_frame->this == children[i]) + break; + } + i++; + for (; i < pvt->child_count; i++) { + if (pvt->state[i]) + break; + } + + if (i == pvt->child_count) { + STACK_UNWIND (frame, -1, ENOTCONN, NULL); + return 0; + } + STACK_WIND (frame, + ha_getspec_cbk, + children[i], + children[i]->mops->getspec, + local->pattern, + local->flags); + return 0; + } + + STACK_UNWIND (frame, + op_ret, + op_errno, + spec_data); + return 0; +} + +int32_t +ha_getspec (call_frame_t *frame, + xlator_t *this, + const char *key, + int32_t flags) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + xlator_t **children = NULL; + int i = 0; + + local = frame->local = CALLOC (1, sizeof (*local)); + pvt = this->private; + children = pvt->children; + + local = frame->local = CALLOC (1, sizeof (*local)); + for (i = 0; i < pvt->child_count; i++) { + if (pvt->state[i]) + break; + } + + if (i == pvt->child_count) { + STACK_UNWIND (frame, -1, ENOTCONN, NULL); + return 0; + } + local->flags = flags; + local->pattern = (char *)key; + + STACK_WIND (frame, + ha_getspec_cbk, + children[i], + children[i]->mops->getspec, + key, flags); + return 0; +} + +int32_t +ha_closedir (xlator_t *this, + fd_t *fd) +{ + hafd_t *hafdp = NULL; + int op_errno = 0; + uint64_t tmp_hafdp = 0; + + op_errno = fd_ctx_del (fd, this, &tmp_hafdp); + if (op_errno != 0) { + gf_log (this->name, GF_LOG_ERROR, "fd_ctx_del() error"); + return 0; + } + hafdp = (hafd_t *)(long)tmp_hafdp; + + FREE (hafdp->fdstate); + FREE (hafdp->path); + LOCK_DESTROY (&hafdp->lock); + return 0; +} + +int32_t +ha_close (xlator_t *this, + fd_t *fd) +{ + hafd_t *hafdp = NULL; + int op_errno = 0; + uint64_t tmp_hafdp = 0; + + op_errno = fd_ctx_del (fd, this, &tmp_hafdp); + if (op_errno != 0) { + gf_log (this->name, GF_LOG_ERROR, "fd_ctx_del() error"); + return 0; + } + hafdp = (hafd_t *)(long)tmp_hafdp; + + FREE (hafdp->fdstate); + FREE (hafdp->path); + LOCK_DESTROY (&hafdp->lock); + return 0; +} + +/* notify */ +int32_t +notify (xlator_t *this, + int32_t event, + void *data, + ...) +{ + ha_private_t *pvt = NULL; + int32_t i = 0, upcnt = 0; + + pvt = this->private; + if (pvt == NULL) { + gf_log (this->name, GF_LOG_DEBUG, "got notify before init()"); + return 0; + } + + switch (event) + { + case GF_EVENT_CHILD_DOWN: + { + for (i = 0; i < pvt->child_count; i++) { + if (data == pvt->children[i]) + break; + } + gf_log (this->name, GF_LOG_DEBUG, "GF_EVENT_CHILD_DOWN from %s", pvt->children[i]->name); + pvt->state[i] = 0; + for (i = 0; i < pvt->child_count; i++) { + if (pvt->state[i]) + break; + } + if (i == pvt->child_count) { + default_notify (this, event, data); + } + } + break; + case GF_EVENT_CHILD_UP: + { + for (i = 0; i < pvt->child_count; i++) { + if (data == pvt->children[i]) + break; + } + + gf_log (this->name, GF_LOG_DEBUG, "GF_EVENT_CHILD_UP from %s", pvt->children[i]->name); + + pvt->state[i] = 1; + + for (i = 0; i < pvt->child_count; i++) { + if (pvt->state[i]) + upcnt++; + } + + if (upcnt == 1) { + default_notify (this, event, data); + } + } + break; + + default: + { + default_notify (this, event, data); + } + } + + return 0; +} + +int +init (xlator_t *this) +{ + ha_private_t *pvt = NULL; + xlator_list_t *trav = NULL; + int count = 0, ret = 0; + + if (!this->children) { + gf_log (this->name,GF_LOG_ERROR, + "FATAL: ha should have one or more child defined"); + return -1; + } + + if (!this->parents) { + gf_log (this->name, GF_LOG_WARNING, + "dangling volume. check volfile "); + } + + trav = this->children; + pvt = CALLOC (1, sizeof (ha_private_t)); + + ret = dict_get_int32 (this->options, "preferred-subvolume", + &pvt->pref_subvol); + if (ret < 0) { + pvt->pref_subvol = -1; + } + + trav = this->children; + while (trav) { + count++; + trav = trav->next; + } + + pvt->child_count = count; + pvt->children = CALLOC (count, sizeof (xlator_t*)); + + trav = this->children; + count = 0; + while (trav) { + pvt->children[count] = trav->xlator; + count++; + trav = trav->next; + } + + pvt->state = CALLOC (1, count); + this->private = pvt; + return 0; +} + +void +fini (xlator_t *this) +{ + ha_private_t *priv = NULL; + priv = this->private; + FREE (priv); + return; +} + + +struct xlator_fops fops = { + .lookup = ha_lookup, + .stat = ha_stat, + .readlink = ha_readlink, + .mknod = ha_mknod, + .mkdir = ha_mkdir, + .unlink = ha_unlink, + .rmdir = ha_rmdir, + .symlink = ha_symlink, + .rename = ha_rename, + .link = ha_link, + .chmod = ha_chmod, + .chown = ha_chown, + .truncate = ha_truncate, + .utimens = ha_utimens, + .create = ha_create, + .open = ha_open, + .readv = ha_readv, + .writev = ha_writev, + .statfs = ha_statfs, + .flush = ha_flush, + .fsync = ha_fsync, + .setxattr = ha_setxattr, + .getxattr = ha_getxattr, + .removexattr = ha_removexattr, + .opendir = ha_opendir, + .readdir = ha_readdir, + .getdents = ha_getdents, + .fsyncdir = ha_fsyncdir, + .access = ha_access, + .ftruncate = ha_ftruncate, + .fstat = ha_fstat, + .lk = ha_lk, + .fchmod = ha_fchmod, + .fchown = ha_fchown, + .setdents = ha_setdents, + .lookup_cbk = ha_lookup_cbk, + .checksum = ha_checksum, + .xattrop = ha_xattrop, + .fxattrop = ha_fxattrop +}; + +struct xlator_mops mops = { + .stats = ha_stats, + .getspec = ha_getspec, +}; + +struct xlator_cbks cbks = { + .release = ha_close, + .releasedir = ha_closedir, + .forget = ha_forget, +}; diff --git a/xlators/cluster/ha/src/ha.h b/xlators/cluster/ha/src/ha.h new file mode 100644 index 000000000..77a04f165 --- /dev/null +++ b/xlators/cluster/ha/src/ha.h @@ -0,0 +1,59 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef __HA_H_ +#define __HA_H_ + +typedef struct { + call_stub_t *stub; + int32_t op_ret, op_errno; + int32_t active, tries, revalidate, revalidate_error; + int32_t call_count; + char *state, *pattern; + dict_t *dict; + loc_t *loc; + struct stat buf; + fd_t *fd; + inode_t *inode; + int32_t flags; + int32_t first_success; +} ha_local_t; + +typedef struct { + char *state; + xlator_t **children; + int child_count, pref_subvol; +} ha_private_t; + +typedef struct { + char *fdstate; + char *path; + gf_lock_t lock; + int active; +} hafd_t; + +#define HA_ACTIVE_CHILD(this, local) (((ha_private_t *)this->private)->children[local->active]) + +extern int ha_alloc_init_fd (call_frame_t *frame, fd_t *fd); + +extern int ha_handle_cbk (call_frame_t *frame, void *cookie, int op_ret, int op_errno) ; + +extern int ha_alloc_init_inode (call_frame_t *frame, inode_t *inode); + +#endif diff --git a/xlators/cluster/map/Makefile.am b/xlators/cluster/map/Makefile.am new file mode 100644 index 000000000..d471a3f92 --- /dev/null +++ b/xlators/cluster/map/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/cluster/map/src/Makefile.am b/xlators/cluster/map/src/Makefile.am new file mode 100644 index 000000000..44ee4d9ee --- /dev/null +++ b/xlators/cluster/map/src/Makefile.am @@ -0,0 +1,15 @@ +xlator_LTLIBRARIES = map.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster + +map_la_LDFLAGS = -module -avoidversion + +map_la_SOURCES = map.c map-helper.c +map_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = map.h + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) + +CLEANFILES = + diff --git a/xlators/cluster/map/src/map-helper.c b/xlators/cluster/map/src/map-helper.c new file mode 100644 index 000000000..4e51219d4 --- /dev/null +++ b/xlators/cluster/map/src/map-helper.c @@ -0,0 +1,357 @@ +/* + Copyright (c) 2009 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "xlator.h" +#include "map.h" + + +xlator_t * +map_subvol_next (xlator_t *this, xlator_t *prev) +{ + map_private_t *priv = NULL; + xlator_t *next = NULL; + int i = 0; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (priv->xlarray[i].xl == prev) { + if ((i + 1) < priv->child_count) + next = priv->xlarray[i + 1].xl; + break; + } + } + + return next; +} + +int +map_subvol_cnt (xlator_t *this, xlator_t *subvol) +{ + int i = 0; + int ret = -1; + map_private_t *priv = NULL; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (subvol == priv->xlarray[i].xl) { + ret = i; + break; + } + } + + return ret; +} + +int +map_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y_p) +{ + map_private_t *priv = NULL; + int cnt = 0; + int max = 0; + uint64_t y = 0; + + if (x == ((uint64_t) -1)) { + y = (uint64_t) -1; + goto out; + } + + priv = this->private; + + max = priv->child_count; + cnt = map_subvol_cnt (this, subvol); + + y = ((x * max) + cnt); + +out: + if (y_p) + *y_p = y; + + return 0; +} + + +int +map_deitransform (xlator_t *this, uint64_t y, xlator_t **subvol_p, + uint64_t *x_p) +{ + int cnt = 0; + int max = 0; + uint64_t x = 0; + xlator_t *subvol = 0; + map_private_t *priv = NULL; + + priv = this->private; + max = priv->child_count; + + cnt = y % max; + x = y / max; + + subvol = priv->xlarray[cnt].xl; + + if (subvol_p) + *subvol_p = subvol; + + if (x_p) + *x_p = x; + + return 0; +} + + +xlator_t * +get_mapping_subvol_from_path (xlator_t *this, const char *path) +{ + map_private_t *priv = NULL; + struct map_pattern *map = NULL; + + /* To make sure we handle '/' properly */ + if (!strcmp (path, "/")) + return NULL; + + priv = this->private; + + map = priv->map; + while (map) { + if (!strncmp (map->directory, path, map->dir_len)) { + if ((path[map->dir_len] == '/') || + (path[map->dir_len] == '\0')) { + return map->xl; + } + } + + map = map->next; + } + + return priv->default_xl; +} + +xlator_t * +get_mapping_subvol_from_ctx (xlator_t *this, inode_t *inode) +{ + uint64_t subvol = 0; + int ret = -1; + + ret = inode_ctx_get (inode, this, &subvol); + if (ret != 0) + return NULL; + + return (xlator_t *)(long)subvol; +} + +int +check_multiple_volume_entry (xlator_t *this, + xlator_t *subvol) +{ + int ret = -1; + int idx = 0; + map_private_t *priv = NULL; + + priv = this->private; + + for (idx = 0; idx < priv->child_count; idx++) { + if (priv->xlarray[idx].xl == subvol) { + if (priv->xlarray[idx].mapped) { + gf_log (this->name, GF_LOG_ERROR, + "subvolume '%s' is already mapped", + subvol->name); + goto out; + } + priv->xlarray[idx].mapped = 1; + ret = 0; + goto out; + } + } + + gf_log (this->name, GF_LOG_ERROR, + "subvolume '%s' is not found", + subvol->name); + + out: + return ret; +} + +int +verify_dir_and_assign_subvol (xlator_t *this, + const char *directory, + const char *subvol) +{ + int default_flag = 0; + int ret = -1; + int idx = 0; + map_private_t *priv = NULL; + xlator_list_t *trav = NULL; + struct map_pattern *tmp_map = NULL; + + priv = this->private; + + /* check if directory is valid, ie, its a top level dir, and + * not includes a '*' in it. + */ + if (!strcmp ("*", directory)) { + default_flag = 1; + } else { + if (directory[0] != '/') { + gf_log (this->name, GF_LOG_ERROR, + "map takes absolute path, starting with '/'. " + "not '%s'", directory); + goto out; + } + for (idx = 1; idx < (strlen (directory) - 1); idx++) { + if (directory[idx] == '/') { + gf_log (this->name, GF_LOG_ERROR, + "map takes only top level directory, " + "not '%s'", directory); + goto out; + } + } + } + + /* Assign proper subvolume */ + trav = this->children; + while (trav) { + if (!strcmp (trav->xlator->name, subvol)) { + + /* Check if there is another directory for + * same volume, if yes, return error. + */ + ret = check_multiple_volume_entry (this, + trav->xlator); + if (ret != 0) { + goto out; + } + + ret = 0; + if (default_flag) { + if (priv->default_xl) { + ret = -1; + gf_log (this->name, GF_LOG_ERROR, + "'*' specified more than " + "once. don't confuse me!!!"); + } + + priv->default_xl = trav->xlator; + goto out; + } + + tmp_map = CALLOC (1, sizeof (struct map_pattern)); + tmp_map->xl = trav->xlator; + tmp_map->dir_len = strlen (directory); + + /* make sure that the top level directory starts + * with '/' and ends without '/' + */ + tmp_map->directory = strdup (directory); + if (directory[tmp_map->dir_len - 1] == '/') { + tmp_map->dir_len--; + } + + if (!priv->map) + priv->map = tmp_map; + else { + struct map_pattern *trav_map = NULL; + trav_map = priv->map; + while (trav_map->next) + trav_map = trav_map->next; + trav_map->next = tmp_map; + } + + goto out; + } + + trav = trav->next; + } + + gf_log (this->name, GF_LOG_ERROR, + "map volume '%s' is not proper subvolume", subvol); + + out: + return ret; +} + +int +assign_default_subvol (xlator_t *this, const char *default_xl) +{ + int ret = -1; + map_private_t *priv = NULL; + xlator_list_t *trav = NULL; + + priv = this->private; + trav = this->children; + + while (trav) { + if (!strcmp (trav->xlator->name, default_xl)) { + ret = check_multiple_volume_entry (this, + trav->xlator); + if (ret != 0) { + goto out; + } + if (priv->default_xl) + gf_log (this->name, GF_LOG_WARNING, + "default-volume option provided, " + "overriding earlier '*' option"); + priv->default_xl = trav->xlator; + return 0; + } + trav = trav->next; + } + + gf_log (this->name, GF_LOG_ERROR, + "default-volume value is not an valid subvolume. check again"); + out: + return -1; +} + +void +verify_if_all_subvolumes_got_used (xlator_t *this) +{ + int idx = 0; + map_private_t *priv = NULL; + + priv = this->private; + + for (idx = 0; idx < priv->child_count; idx++) { + if (!priv->xlarray[idx].mapped) { + if (!priv->default_xl) { + priv->default_xl = priv->xlarray[idx].xl; + priv->xlarray[idx].mapped = 1; + } else { + gf_log (this->name, GF_LOG_WARNING, + "subvolume '%s' is not mapped to " + "any directory", + priv->xlarray[idx].xl->name); + } + } + } + + if (!priv->default_xl) { + gf_log (this->name, GF_LOG_WARNING, + "default subvolume not specified, filesystem " + "may not work properly. Check 'map' translator " + "documentation for more info"); + } + + return ; +} diff --git a/xlators/cluster/map/src/map.c b/xlators/cluster/map/src/map.c new file mode 100644 index 000000000..8c4b7c83c --- /dev/null +++ b/xlators/cluster/map/src/map.c @@ -0,0 +1,2193 @@ +/* + Copyright (c) 2009 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "xlator.h" +#include "map.h" + +/* For <op>_cbk functions */ +#include "defaults.c" + + +int32_t +map_stat (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = get_mapping_subvol_from_ctx (this, loc->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_stat_cbk, + subvol, + subvol->fops->stat, + loc); + + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + + +int32_t +map_chmod (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = get_mapping_subvol_from_ctx (this, loc->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_chmod_cbk, + subvol, + subvol->fops->chmod, + loc, + mode); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t +map_fchmod (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + mode_t mode) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + + subvol = get_mapping_subvol_from_ctx (this, fd->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_fchmod_cbk, + subvol, + subvol->fops->fchmod, + fd, + mode); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t +map_chown (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + uid_t uid, + gid_t gid) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = get_mapping_subvol_from_ctx (this, loc->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_chown_cbk, + subvol, + subvol->fops->chown, + loc, + uid, + gid); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t +map_fchown (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + uid_t uid, + gid_t gid) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + + subvol = get_mapping_subvol_from_ctx (this, fd->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_fchown_cbk, + subvol, + subvol->fops->fchown, + fd, + uid, + gid); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t +map_truncate (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + off_t offset) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = get_mapping_subvol_from_ctx (this, loc->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_truncate_cbk, + subvol, + subvol->fops->truncate, + loc, + offset); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t +map_ftruncate (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + off_t offset) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + + subvol = get_mapping_subvol_from_ctx (this, fd->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_ftruncate_cbk, + subvol, + subvol->fops->ftruncate, + fd, + offset); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t +map_utimens (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + struct timespec tv[2]) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = get_mapping_subvol_from_ctx (this, loc->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_utimens_cbk, + subvol, + subvol->fops->utimens, + loc, + tv); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t +map_access (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t mask) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = get_mapping_subvol_from_ctx (this, loc->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_access_cbk, + subvol, + subvol->fops->access, + loc, + mask); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + + +int32_t +map_readlink (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + size_t size) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = get_mapping_subvol_from_ctx (this, loc->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_readlink_cbk, + subvol, + subvol->fops->readlink, + loc, + size); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t +map_unlink (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = get_mapping_subvol_from_ctx (this, loc->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_unlink_cbk, + subvol, + subvol->fops->unlink, + loc); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t +map_rmdir (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = get_mapping_subvol_from_ctx (this, loc->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_rmdir_cbk, + subvol, + subvol->fops->rmdir, + loc); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + + +int32_t +map_rename (call_frame_t *frame, + xlator_t *this, + loc_t *oldloc, + loc_t *newloc) +{ + int32_t op_errno = 1; + xlator_t *old_subvol = NULL; + xlator_t *new_subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (oldloc, err); + VALIDATE_OR_GOTO (oldloc->inode, err); + VALIDATE_OR_GOTO (oldloc->path, err); + VALIDATE_OR_GOTO (newloc, err); + + old_subvol = get_mapping_subvol_from_ctx (this, oldloc->inode); + if (!old_subvol) { + op_errno = EINVAL; + goto err; + } + + if (newloc->path) { + new_subvol = get_mapping_subvol_from_path (this, + newloc->path); + if (new_subvol && (new_subvol != old_subvol)) { + op_errno = EXDEV; + goto err; + } + } + + STACK_WIND (frame, + default_rename_cbk, + old_subvol, + old_subvol->fops->rename, + oldloc, newloc); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + + +int32_t +map_link (call_frame_t *frame, + xlator_t *this, + loc_t *oldloc, + loc_t *newloc) +{ + int32_t op_errno = 1; + xlator_t *old_subvol = NULL; + xlator_t *new_subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (oldloc, err); + VALIDATE_OR_GOTO (oldloc->inode, err); + VALIDATE_OR_GOTO (oldloc->path, err); + VALIDATE_OR_GOTO (newloc, err); + + old_subvol = get_mapping_subvol_from_ctx (this, oldloc->inode); + if (!old_subvol) { + op_errno = EINVAL; + goto err; + } + + if (newloc->path) { + new_subvol = get_mapping_subvol_from_path (this, + newloc->path); + if (new_subvol && (new_subvol != old_subvol)) { + op_errno = EXDEV; + goto err; + } + } + + STACK_WIND (frame, + default_link_cbk, + old_subvol, + old_subvol->fops->link, + oldloc, newloc); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + + +int32_t +map_open (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flags, fd_t *fd) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = get_mapping_subvol_from_ctx (this, loc->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_open_cbk, + subvol, + subvol->fops->open, + loc, flags, fd); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t +map_readv (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, + off_t offset) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + + subvol = get_mapping_subvol_from_ctx (this, fd->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_readv_cbk, + subvol, + subvol->fops->readv, + fd, + size, + offset); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t +map_writev (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + struct iovec *vector, + int32_t count, + off_t off) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + + subvol = get_mapping_subvol_from_ctx (this, fd->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_writev_cbk, + subvol, + subvol->fops->writev, + fd, + vector, + count, + off); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t +map_flush (call_frame_t *frame, + xlator_t *this, + fd_t *fd) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + + subvol = get_mapping_subvol_from_ctx (this, fd->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_flush_cbk, + subvol, + subvol->fops->flush, + fd); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + + +int32_t +map_fsync (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t flags) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + + subvol = get_mapping_subvol_from_ctx (this, fd->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_fsync_cbk, + subvol, + subvol->fops->fsync, + fd, + flags); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t +map_fstat (call_frame_t *frame, + xlator_t *this, + fd_t *fd) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + + subvol = get_mapping_subvol_from_ctx (this, fd->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_fstat_cbk, + subvol, + subvol->fops->fstat, + fd); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t +map_getdents (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, + off_t offset, + int32_t flag) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + + subvol = get_mapping_subvol_from_ctx (this, fd->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_getdents_cbk, + subvol, + subvol->fops->getdents, + fd, + size, + offset, + flag); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t +map_setdents (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t flags, + dir_entry_t *entries, + int32_t count) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + + subvol = get_mapping_subvol_from_ctx (this, fd->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_setdents_cbk, + subvol, + subvol->fops->setdents, + fd, + flags, + entries, + count); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + + +int32_t +map_fsyncdir (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t flags) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + + subvol = get_mapping_subvol_from_ctx (this, fd->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_fsyncdir_cbk, + subvol, + subvol->fops->fsyncdir, + fd, + flags); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + + + + +int32_t +map_setxattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + dict_t *dict, + int32_t flags) +{ + /* TODO: support for 'get' 'put' API */ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = get_mapping_subvol_from_ctx (this, loc->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_setxattr_cbk, + subvol, + subvol->fops->setxattr, + loc, + dict, + flags); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t +map_getxattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + const char *name) +{ + /* TODO: support for 'get' 'put' API */ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = get_mapping_subvol_from_ctx (this, loc->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_getxattr_cbk, + subvol, + subvol->fops->getxattr, + loc, + name); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t +map_xattrop (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + gf_xattrop_flags_t flags, + dict_t *dict) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = get_mapping_subvol_from_ctx (this, loc->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_xattrop_cbk, + subvol, + subvol->fops->xattrop, + loc, + flags, + dict); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t +map_fxattrop (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + gf_xattrop_flags_t flags, + dict_t *dict) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + + subvol = get_mapping_subvol_from_ctx (this, fd->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_fxattrop_cbk, + subvol, + subvol->fops->fxattrop, + fd, + flags, + dict); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t +map_removexattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + const char *name) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = get_mapping_subvol_from_ctx (this, loc->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_removexattr_cbk, + subvol, + subvol->fops->removexattr, + loc, + name); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t +map_lk (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t cmd, + struct flock *lock) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + + subvol = get_mapping_subvol_from_ctx (this, fd->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_lk_cbk, + subvol, + subvol->fops->lk, + fd, + cmd, + lock); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + + +int32_t +map_inodelk (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t cmd, struct flock *lock) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = get_mapping_subvol_from_ctx (this, loc->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_inodelk_cbk, + subvol, + subvol->fops->inodelk, + loc, cmd, lock); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + + +int32_t +map_finodelk (call_frame_t *frame, xlator_t *this, + fd_t *fd, int32_t cmd, struct flock *lock) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + + subvol = get_mapping_subvol_from_ctx (this, fd->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_finodelk_cbk, + subvol, + subvol->fops->finodelk, + fd, cmd, lock); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t +map_entrylk (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *basename, + entrylk_cmd cmd, entrylk_type type) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = get_mapping_subvol_from_ctx (this, loc->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, default_entrylk_cbk, + subvol, + subvol->fops->entrylk, + loc, basename, cmd, type); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t +map_fentrylk (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *basename, + entrylk_cmd cmd, entrylk_type type) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + + subvol = get_mapping_subvol_from_ctx (this, fd->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, default_fentrylk_cbk, + subvol, + subvol->fops->fentrylk, + fd, basename, cmd, type); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t +map_checksum (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flag) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = get_mapping_subvol_from_ctx (this, loc->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_checksum_cbk, + subvol, + subvol->fops->checksum, + loc, + flag); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +static int32_t +map_newentry_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf) +{ + call_frame_t *prev = NULL; + prev = cookie; + + map_itransform (this, prev->this, buf->st_ino, &buf->st_ino); + + STACK_UNWIND (frame, op_ret, op_errno, inode, buf); + return 0; + +} + + +int32_t +map_mknod (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode, + dev_t rdev) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->path, err); + VALIDATE_OR_GOTO (loc->inode, err); + + subvol = get_mapping_subvol_from_path (this, loc->path); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + op_errno = inode_ctx_put (loc->inode, this, (uint64_t)(long)subvol); + if (op_errno != 0) { + gf_log (this->name, GF_LOG_ERROR, + "%s: failed to set subvolume ptr in inode ctx", + loc->path); + } + + STACK_WIND (frame, + map_newentry_cbk, + subvol, + subvol->fops->mknod, + loc, mode, rdev); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t +map_mkdir (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->path, err); + VALIDATE_OR_GOTO (loc->inode, err); + + subvol = get_mapping_subvol_from_path (this, loc->path); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + op_errno = inode_ctx_put (loc->inode, this, (uint64_t)(long)subvol); + if (op_errno != 0) { + gf_log (this->name, GF_LOG_ERROR, + "%s: failed to set subvolume ptr in inode ctx", + loc->path); + } + + STACK_WIND (frame, + map_newentry_cbk, + subvol, + subvol->fops->mkdir, + loc, mode); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + + +int32_t +map_symlink (call_frame_t *frame, + xlator_t *this, + const char *linkpath, + loc_t *loc) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->path, err); + VALIDATE_OR_GOTO (loc->inode, err); + + subvol = get_mapping_subvol_from_path (this, loc->path); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + op_errno = inode_ctx_put (loc->inode, this, (uint64_t)(long)subvol); + if (op_errno != 0) { + gf_log (this->name, GF_LOG_ERROR, + "%s: failed to set subvolume ptr in inode ctx", + loc->path); + } + + STACK_WIND (frame, + map_newentry_cbk, + subvol, + subvol->fops->symlink, + linkpath, loc); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + + +static int32_t +map_create_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd, + inode_t *inode, + struct stat *buf) +{ + call_frame_t *prev = NULL; + prev = cookie; + + map_itransform (this, prev->this, buf->st_ino, &buf->st_ino); + STACK_UNWIND (frame, op_ret, op_errno, fd, inode, buf); + return 0; +} + +int32_t +map_create (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flags, + mode_t mode, fd_t *fd) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->path, err); + VALIDATE_OR_GOTO (loc->inode, err); + + subvol = get_mapping_subvol_from_path (this, loc->path); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + op_errno = inode_ctx_put (loc->inode, this, (uint64_t)(long)subvol); + if (op_errno != 0) { + gf_log (this->name, GF_LOG_ERROR, + "%s: failed to set subvolume ptr in inode ctx", + loc->path); + } + + STACK_WIND (frame, map_create_cbk, + subvol, + subvol->fops->create, + loc, flags, mode, fd); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t +map_single_lookup_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf, + dict_t *dict) +{ + call_frame_t *prev = NULL; + prev = cookie; + + map_itransform (this, prev->this, buf->st_ino, &buf->st_ino); + + STACK_UNWIND (frame, op_ret, op_errno, inode, buf, dict); + + return 0; +} + +int32_t +map_lookup_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf, + dict_t *dict) +{ + int callcnt = 0; + map_local_t *local = NULL; + inode_t *tmp_inode = NULL; + dict_t *tmp_dict = NULL; + + local = frame->local; + LOCK (&frame->lock); + { + callcnt = --local->call_count; + if ((op_ret == 0) && (local->op_ret == -1)) { + local->op_ret = 0; + local->stbuf = *buf; + if (dict) + local->dict = dict_ref (dict); + local->inode = inode_ref (inode); + } + if (op_ret == -1) + local->op_errno = op_errno; + + } + UNLOCK (&frame->lock); + + if (!callcnt) { + tmp_dict = local->dict; + tmp_inode = local->inode; + + STACK_UNWIND (frame, local->op_ret, + local->op_errno, local->inode, + &local->stbuf, local->dict); + + inode_unref (local->inode); + if (tmp_dict) + dict_unref (tmp_dict); + } + + return 0; +} + +int32_t +map_lookup (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + dict_t *xattr_req) +{ + int32_t op_errno = EINVAL; + xlator_t *subvol = NULL; + map_local_t *local = NULL; + map_private_t *priv = NULL; + xlator_list_t *trav = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->path, err); + VALIDATE_OR_GOTO (loc->inode, err); + + priv = this->private; + + if (loc->inode->ino == 1) + goto root_inode; + + subvol = get_mapping_subvol_from_ctx (this, loc->inode); + if (!subvol) { + subvol = get_mapping_subvol_from_path (this, loc->path); + if (!subvol) { + goto err; + } + + op_errno = inode_ctx_put (loc->inode, this, + (uint64_t)(long)subvol); + if (op_errno != 0) { + gf_log (this->name, GF_LOG_ERROR, + "%s: failed to set subvolume in inode ctx", + loc->path); + } + } + + /* Just one callback */ + STACK_WIND (frame, + map_single_lookup_cbk, + subvol, + subvol->fops->lookup, + loc, + xattr_req); + + return 0; + + root_inode: + local = CALLOC (1, sizeof (map_local_t)); + + frame->local = local; + local->call_count = priv->child_count; + local->op_ret = -1; + + trav = this->children; + while (trav) { + STACK_WIND (frame, + map_lookup_cbk, + trav->xlator, + trav->xlator->fops->lookup, + loc, + xattr_req); + trav = trav->next; + } + + return 0; + + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} +/* + * unify_normalize_stats - + */ +void +map_normalize_stats (struct statvfs *buf, + unsigned long bsize, + unsigned long frsize) +{ + double factor; + + if (buf->f_bsize != bsize) { + factor = ((double) buf->f_bsize) / bsize; + buf->f_bsize = bsize; + buf->f_bfree = (fsblkcnt_t) (factor * buf->f_bfree); + buf->f_bavail = (fsblkcnt_t) (factor * buf->f_bavail); + } + + if (buf->f_frsize != frsize) { + factor = ((double) buf->f_frsize) / frsize; + buf->f_frsize = frsize; + buf->f_blocks = (fsblkcnt_t) (factor * buf->f_blocks); + } +} + + +int32_t +map_statfs_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct statvfs *stbuf) +{ + struct statvfs *dict_buf = NULL; + map_local_t *local = NULL; + int this_call_cnt = 0; + unsigned long bsize; + unsigned long frsize; + + local = frame->local; + + LOCK (&frame->lock); + { + this_call_cnt = --local->call_count; + + if (op_ret == -1) { + local->op_errno = op_errno; + goto unlock; + } + local->op_ret = 0; + + /* when a call is successfull, add it to local->dict */ + dict_buf = &local->statvfs; + + if (dict_buf->f_bsize != 0) { + bsize = max (dict_buf->f_bsize, + stbuf->f_bsize); + + frsize = max (dict_buf->f_frsize, + stbuf->f_frsize); + map_normalize_stats(dict_buf, bsize, frsize); + map_normalize_stats(stbuf, bsize, frsize); + } else { + dict_buf->f_bsize = stbuf->f_bsize; + dict_buf->f_frsize = stbuf->f_frsize; + } + + dict_buf->f_blocks += stbuf->f_blocks; + dict_buf->f_bfree += stbuf->f_bfree; + dict_buf->f_bavail += stbuf->f_bavail; + dict_buf->f_files += stbuf->f_files; + dict_buf->f_ffree += stbuf->f_ffree; + dict_buf->f_favail += stbuf->f_favail; + dict_buf->f_fsid = stbuf->f_fsid; + dict_buf->f_flag = stbuf->f_flag; + dict_buf->f_namemax = stbuf->f_namemax; + } +unlock: + UNLOCK (&frame->lock); + + if (!this_call_cnt) { + STACK_UNWIND (frame, local->op_ret, local->op_errno, + &local->statvfs); + } + + return 0; +} + +int32_t +map_statfs (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + int32_t op_errno = EINVAL; + xlator_t *subvol = NULL; + map_local_t *local = NULL; + map_private_t *priv = NULL; + xlator_list_t *trav = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->path, err); + VALIDATE_OR_GOTO (loc->inode, err); + + if (loc->inode->ino == 1) + goto root_inode; + subvol = get_mapping_subvol_from_ctx (this, loc->inode); + if (!subvol) { + goto err; + } + + /* Just one callback */ + STACK_WIND (frame, + default_statfs_cbk, + subvol, + subvol->fops->statfs, + loc); + + return 0; + + root_inode: + local = CALLOC (1, sizeof (map_local_t)); + + priv = this->private; + frame->local = local; + local->call_count = priv->child_count; + local->op_ret = -1; + + trav = this->children; + while (trav) { + STACK_WIND (frame, + map_statfs_cbk, + trav->xlator, + trav->xlator->fops->statfs, + loc); + trav = trav->next; + } + + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + +int32_t +map_opendir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd) +{ + int callcnt = 0; + map_local_t *local = NULL; + fd_t *local_fd = NULL; + + local = frame->local; + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret == -1) { + local->op_errno = op_errno; + goto unlock; + } + + local->op_ret = 0; + } + unlock: + UNLOCK (&frame->lock); + + if (!callcnt) { + local_fd = local->fd; + local->fd = NULL; + + STACK_UNWIND (frame, local->op_ret, + local->op_errno, local_fd); + + fd_unref (local_fd); + } + return 0; +} + + +int32_t +map_opendir (call_frame_t *frame, + xlator_t *this, + loc_t *loc, fd_t *fd) +{ + int32_t op_errno = EINVAL; + xlator_t *subvol = NULL; + map_local_t *local = NULL; + map_private_t *priv = NULL; + xlator_list_t *trav = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + + if (loc->inode->ino == 1) + goto root_inode; + + subvol = get_mapping_subvol_from_ctx (this, fd->inode); + if (!subvol) { + goto err; + } + + /* Just one callback */ + STACK_WIND (frame, + default_opendir_cbk, + subvol, + subvol->fops->opendir, + loc, fd); + return 0; + + root_inode: + local = CALLOC (1, sizeof (map_local_t)); + + priv = this->private; + frame->local = local; + local->call_count = priv->child_count; + local->op_ret = -1; + local->fd = fd_ref (fd); + + trav = this->children; + while (trav) { + STACK_WIND (frame, + map_opendir_cbk, + trav->xlator, + trav->xlator->fops->opendir, + loc, fd); + trav = trav->next; + } + + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int32_t +map_single_readdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + gf_dirent_t *entries) +{ + call_frame_t *prev = NULL; + gf_dirent_t *orig_entry = NULL; + + prev = cookie; + + list_for_each_entry (orig_entry, &entries->list, list) { + map_itransform (this, prev->this, orig_entry->d_ino, + &orig_entry->d_ino); + } + STACK_UNWIND (frame, op_ret, op_errno, entries); + + return 0; +} + + +int +map_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, gf_dirent_t *orig_entries) +{ + map_local_t *local = NULL; + gf_dirent_t entries; + gf_dirent_t *orig_entry = NULL; + gf_dirent_t *entry = NULL; + call_frame_t *prev = NULL; + xlator_t *subvol = NULL; + xlator_t *next = NULL; + int count = 0; + fd_t *local_fd = NULL; + + INIT_LIST_HEAD (&entries.list); + prev = cookie; + local = frame->local; + + if (op_ret < 0) + goto done; + + list_for_each_entry (orig_entry, &orig_entries->list, list) { + subvol = prev->this; + + entry = gf_dirent_for_name (orig_entry->d_name); + if (!entry) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto unwind; + } + + map_itransform (this, subvol, orig_entry->d_ino, + &entry->d_ino); + map_itransform (this, subvol, orig_entry->d_off, + &entry->d_off); + + entry->d_type = orig_entry->d_type; + entry->d_len = orig_entry->d_len; + + list_add_tail (&entry->list, &entries.list); + count++; + } + + op_ret = count; + +done: + if (count == 0) { + next = map_subvol_next (this, prev->this); + if (!next) { + goto unwind; + } + + STACK_WIND (frame, map_readdir_cbk, + next, next->fops->readdir, + local->fd, local->size, 0); + return 0; + } + +unwind: + if (op_ret < 0) + op_ret = 0; + + local_fd = local->fd; + local->fd = NULL; + + STACK_UNWIND (frame, op_ret, op_errno, &entries); + + fd_unref (local_fd); + + gf_dirent_free (&entries); + + return 0; +} + + +int32_t +map_readdir (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, + off_t yoff) +{ + int32_t op_errno = EINVAL; + xlator_t *subvol = NULL; + map_local_t *local = NULL; + map_private_t *priv = NULL; + xlator_t *xvol = NULL; + off_t xoff = 0; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + + if (fd->inode->ino == 1) + goto root_inode; + + subvol = get_mapping_subvol_from_ctx (this, fd->inode); + if (!subvol) { + goto err; + } + + /* Just one callback */ + + STACK_WIND (frame, + map_single_readdir_cbk, + subvol, + subvol->fops->readdir, + fd, size, yoff); + return 0; + + root_inode: + /* readdir on '/' */ + local = CALLOC (1, sizeof (map_local_t)); + if (!local) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + op_errno = ENOMEM; + goto err; + } + + priv = this->private; + frame->local = local; + local->op_errno = ENOENT; + local->op_ret = -1; + + local->fd = fd_ref (fd); + local->size = size; + + map_deitransform (this, yoff, &xvol, (uint64_t *)&xoff); + + STACK_WIND (frame, map_readdir_cbk, + xvol, xvol->fops->readdir, + fd, size, xoff); + + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +#if 0 +/* TODO : do it later as currently only unify uses this mop and mostly + unify will be used below map */ +int32_t +map_stats_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct xlator_stats *stats) +{ + STACK_UNWIND (frame, op_ret, op_errno, stats); + return 0; +} + + +int32_t +map_stats (call_frame_t *frame, + xlator_t *this, + int32_t flags) +{ + STACK_WIND (frame, + map_stats_cbk, + subvol, + subvol->mops->stats, + flags); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} +#endif /* if 0 */ + + +/* TODO: define the behavior of notify */ + + +void +fini (xlator_t *this) +{ + map_private_t *priv = NULL; + struct map_pattern *trav_map = NULL; + struct map_pattern *tmp_map = NULL; + + priv = this->private; + + if (priv) { + if (priv->xlarray) + FREE (priv->xlarray); + + trav_map = priv->map; + while (trav_map) { + tmp_map = trav_map; + trav_map = trav_map->next; + FREE (tmp_map); + } + + FREE(priv); + } + + return; +} + +int +init (xlator_t *this) +{ + map_private_t *priv = NULL; + xlator_list_t *trav = NULL; + int count = 0; + int ret = -1; + char *pattern_string = NULL; + char *map_pair_str = NULL; + char *tmp_str = NULL; + char *tmp_str1 = NULL; + char *dup_map_pair = NULL; + char *dir_str = NULL; + char *subvol_str = NULL; + char *default_xl = NULL; + + if (!this->children) { + gf_log (this->name,GF_LOG_ERROR, + "FATAL: map should have one or more child defined"); + return -1; + } + + if (!this->parents) { + gf_log (this->name, GF_LOG_WARNING, + "dangling volume. check volfile "); + } + + priv = CALLOC (1, sizeof (map_private_t)); + this->private = priv; + + /* allocate xlator array */ + trav = this->children; + while (trav) { + count++; + trav = trav->next; + } + priv->xlarray = CALLOC (1, sizeof (struct map_xlator_array) * count); + priv->child_count = count; + + /* build xlator array */ + count = 0; + trav = this->children; + while (trav) { + priv->xlarray[count++].xl = trav->xlator; + trav = trav->next; + } + + /* map dir1:brick1;dir2:brick2;dir3:brick3;*:brick4 */ + ret = dict_get_str (this->options, "map-directory", &pattern_string); + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "map.pattern not given, can't continue"); + goto err; + } + map_pair_str = strtok_r (pattern_string, ";", &tmp_str); + while (map_pair_str) { + dup_map_pair = strdup (map_pair_str); + dir_str = strtok_r (dup_map_pair, ":", &tmp_str1); + if (!dir_str) { + gf_log (this->name, GF_LOG_ERROR, + "directory string invalid"); + goto err; + } + subvol_str = strtok_r (NULL, ":", &tmp_str1); + if (!subvol_str) { + gf_log (this->name, GF_LOG_ERROR, + "mapping subvolume string invalid"); + goto err; + } + ret = verify_dir_and_assign_subvol (this, + dir_str, + subvol_str); + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "verification failed"); + goto err; + } + + FREE (dup_map_pair); + + map_pair_str = strtok_r (NULL, ";", &tmp_str); + } + + /* default-volume brick4 */ + ret = dict_get_str (this->options, "default-volume", &default_xl); + if (ret == 0) { + ret = assign_default_subvol (this, default_xl); + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "assigning default failed"); + goto err; + } + } + + verify_if_all_subvolumes_got_used (this); + + return 0; + err: + fini (this); + return -1; +} + + +struct xlator_fops fops = { + .lookup = map_lookup, + .mknod = map_mknod, + .create = map_create, + + .stat = map_stat, + .chmod = map_chmod, + .chown = map_chown, + .fchown = map_fchown, + .fchmod = map_fchmod, + .fstat = map_fstat, + .utimens = map_utimens, + .truncate = map_truncate, + .ftruncate = map_ftruncate, + .access = map_access, + .readlink = map_readlink, + .setxattr = map_setxattr, + .getxattr = map_getxattr, + .removexattr = map_removexattr, + .open = map_open, + .readv = map_readv, + .writev = map_writev, + .flush = map_flush, + .fsync = map_fsync, + .statfs = map_statfs, + .lk = map_lk, + .opendir = map_opendir, + .readdir = map_readdir, + .fsyncdir = map_fsyncdir, + .symlink = map_symlink, + .unlink = map_unlink, + .link = map_link, + .mkdir = map_mkdir, + .rmdir = map_rmdir, + .rename = map_rename, + .inodelk = map_inodelk, + .finodelk = map_finodelk, + .entrylk = map_entrylk, + .fentrylk = map_fentrylk, + .xattrop = map_xattrop, + .fxattrop = map_fxattrop, + .setdents = map_setdents, + .getdents = map_getdents, + .checksum = map_checksum, +}; + +struct xlator_mops mops = { +}; + +struct xlator_cbks cbks = { +}; + +struct volume_options options[] = { + { .key = {"map-directory"}, + .type = GF_OPTION_TYPE_ANY + }, + { .key = {"default-volume"}, + .type = GF_OPTION_TYPE_XLATOR + }, + + { .key = {NULL} } +}; diff --git a/xlators/cluster/map/src/map.h b/xlators/cluster/map/src/map.h new file mode 100644 index 000000000..0f1aabfd6 --- /dev/null +++ b/xlators/cluster/map/src/map.h @@ -0,0 +1,76 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef __MAP_H__ +#define __MAP_H__ + +#include "xlator.h" + +struct map_pattern { + struct map_pattern *next; + xlator_t *xl; + char *directory; + int dir_len; +}; + +struct map_xlator_array { + xlator_t *xl; + int mapped; /* yes/no */ +}; + +typedef struct { + struct map_pattern *map; + xlator_t *default_xl; + struct map_xlator_array *xlarray; + int child_count; +} map_private_t; + +typedef struct { + int32_t op_ret; + int32_t op_errno; + int call_count; + struct statvfs statvfs; + struct stat stbuf; + inode_t *inode; + dict_t *dict; + fd_t *fd; + + size_t size; +} map_local_t; + +xlator_t *map_subvol_next (xlator_t *this, xlator_t *prev); +int map_subvol_cnt (xlator_t *this, xlator_t *subvol); + +int map_itransform (xlator_t *this, xlator_t *subvol, + uint64_t x, uint64_t *y_p); +int map_deitransform (xlator_t *this, uint64_t y, + xlator_t **subvol_p, uint64_t *x_p); + + +xlator_t *get_mapping_subvol_from_path (xlator_t *this, const char *path); +xlator_t *get_mapping_subvol_from_ctx (xlator_t *this, inode_t *inode); + +int check_multiple_volume_entry (xlator_t *this, xlator_t *subvol); +int verify_dir_and_assign_subvol (xlator_t *this, + const char *directory, const char *subvol); +int assign_default_subvol (xlator_t *this, const char *default_xl); +void verify_if_all_subvolumes_got_used (xlator_t *this); + + +#endif /* __MAP_H__ */ diff --git a/xlators/cluster/stripe/Makefile.am b/xlators/cluster/stripe/Makefile.am new file mode 100644 index 000000000..d471a3f92 --- /dev/null +++ b/xlators/cluster/stripe/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/cluster/stripe/src/Makefile.am b/xlators/cluster/stripe/src/Makefile.am new file mode 100644 index 000000000..60e0a1568 --- /dev/null +++ b/xlators/cluster/stripe/src/Makefile.am @@ -0,0 +1,14 @@ + +xlator_LTLIBRARIES = stripe.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster + +stripe_la_LDFLAGS = -module -avoidversion + +stripe_la_SOURCES = stripe.c +stripe_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) + +CLEANFILES = + diff --git a/xlators/cluster/stripe/src/stripe.c b/xlators/cluster/stripe/src/stripe.c new file mode 100644 index 000000000..83787ca2a --- /dev/null +++ b/xlators/cluster/stripe/src/stripe.c @@ -0,0 +1,3286 @@ +/* + Copyright (c) 2007, 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +/** + * xlators/cluster/stripe: + * Stripe translator, stripes the data accross its child nodes, + * as per the options given in the volfile. The striping works + * fairly simple. It writes files at different offset as per + * calculation. So, 'ls -l' output at the real posix level will + * show file size bigger than the actual size. But when one does + * 'df' or 'du <file>', real size of the file on the server is shown. + * + * WARNING: + * Stripe translator can't regenerate data if a child node gets disconnected. + * So, no 'self-heal' for stripe. Hence the advice, use stripe only when its + * very much necessary, or else, use it in combination with AFR, to have a + * backup copy. + */ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "xlator.h" +#include "logging.h" +#include "defaults.h" +#include "compat.h" +#include "compat-errno.h" +#include <fnmatch.h> +#include <signal.h> + +#define STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR(_loc) do { \ + if (!(_loc && _loc->inode)) { \ + STACK_UNWIND (frame, -1, EINVAL, NULL, NULL, NULL); \ + return 0; \ + } \ +} while(0) + +/** + * struct stripe_options : This keeps the pattern and the block-size + * information, which is used for striping on a file. + */ +struct stripe_options { + struct stripe_options *next; + char path_pattern[256]; + uint64_t block_size; +}; + +/** + * Private structure for stripe translator + */ +struct stripe_private { + struct stripe_options *pattern; + xlator_t **xl_array; + uint64_t block_size; + gf_lock_t lock; + uint8_t nodes_down; + int8_t first_child_down; + int8_t child_count; + int8_t state[256]; /* Current state of the child node, + 0 for down, 1 for up */ + gf_boolean_t xattr_supported; /* 0 for no, 1 for yes, default yes */ +}; + +/** + * Used to keep info about the replies received from fops->readv calls + */ +struct readv_replies { + struct iovec *vector; + int32_t count; //count of vector + int32_t op_ret; //op_ret of readv + int32_t op_errno; + struct stat stbuf; /* 'stbuf' is also a part of reply */ +}; + +/** + * Local structure to be passed with all the frames in case of STACK_WIND + */ +struct stripe_local; /* this itself is used inside the structure; */ + +struct stripe_local { + struct stripe_local *next; + call_frame_t *orig_frame; + + /* Used by _cbk functions */ + struct stat stbuf; + struct readv_replies *replies; + struct statvfs statvfs_buf; + dir_entry_t *entry; + struct xlator_stats stats; + + int8_t revalidate; + int8_t failed; + int8_t unwind; + + int32_t node_index; + int32_t call_count; + int32_t wind_count; /* used instead of child_cound + in case of read and write */ + int32_t op_ret; + int32_t op_errno; + int32_t count; + int32_t flags; + char *name; + inode_t *inode; + + loc_t loc; + loc_t loc2; + + /* For File I/O fops */ + dict_t *dict; + + /* General usage */ + off_t offset; + off_t stripe_size; + + int8_t *list; + struct flock lock; + fd_t *fd; + void *value; +}; + +typedef struct stripe_local stripe_local_t; +typedef struct stripe_private stripe_private_t; + +/** + * stripe_get_matching_bs - Get the matching block size for the given path. + */ +int32_t +stripe_get_matching_bs (const char *path, + struct stripe_options *opts, + uint64_t default_bs) +{ + struct stripe_options *trav = NULL; + char *pathname = NULL; + uint64_t block_size = 0; + + block_size = default_bs; + pathname = strdup (path); + trav = opts; + + while (trav) { + if (fnmatch (trav->path_pattern, + pathname, FNM_NOESCAPE) == 0) { + block_size = trav->block_size; + break; + } + trav = trav->next; + } + free (pathname); + + return block_size; +} + + +/* + * stripe_common_cbk - + */ +int32_t +stripe_common_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + +/** + * stripe_stack_unwind_cbk - This function is used for all the _cbk without + * any extra arguments (other than the minimum given) + * This is called from functions like fsync,unlink,rmdir etc. + * + */ +int32_t +stripe_stack_unwind_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + int32_t callcnt = 0; + stripe_local_t *local = frame->local; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "%s returned %s", + ((call_frame_t *)cookie)->this->name, + strerror (op_errno)); + local->op_errno = op_errno; + if (op_errno == ENOTCONN) + local->failed = 1; + } + if (op_ret >= 0) + local->op_ret = op_ret; + } + UNLOCK (&frame->lock); + + if (!callcnt) { + if (local->failed) + local->op_ret = -1; + + if (local->loc.path) + loc_wipe (&local->loc); + if (local->loc2.path) + loc_wipe (&local->loc2); + + STACK_UNWIND (frame, local->op_ret, local->op_errno); + } + return 0; +} + +int32_t +stripe_common_buf_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + STACK_UNWIND (frame, op_ret, op_errno, buf); + return 0; +} + +/** + * stripe_stack_unwind_buf_cbk - This function is used for all the _cbk with + * 'struct stat *buf' as extra argument (other than minimum) + * This is called from functions like, chmod, fchmod, chown, fchown, + * truncate, ftruncate, utimens etc. + * + * @cookie - this argument should be always 'xlator_t *' of child node + */ +int32_t +stripe_stack_unwind_buf_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + int32_t callcnt = 0; + stripe_local_t *local = frame->local; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "%s returned error %s", + ((call_frame_t *)cookie)->this->name, + strerror (op_errno)); + local->op_errno = op_errno; + if (op_errno == ENOTCONN) + local->failed = 1; + } + + if (op_ret == 0) { + local->op_ret = 0; + if (local->stbuf.st_blksize == 0) { + local->stbuf = *buf; + /* Because st_blocks gets added again */ + local->stbuf.st_blocks = 0; + } + + if (FIRST_CHILD(this) == + ((call_frame_t *)cookie)->this) { + /* Always, pass the inode number of + first child to the above layer */ + local->stbuf.st_ino = buf->st_ino; + local->stbuf.st_mtime = buf->st_mtime; + } + + local->stbuf.st_blocks += buf->st_blocks; + if (local->stbuf.st_size < buf->st_size) + local->stbuf.st_size = buf->st_size; + if (local->stbuf.st_blksize != buf->st_blksize) { + /* TODO: add to blocks in terms of + original block size */ + } + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + if (local->failed) + local->op_ret = -1; + + if (local->loc.path) + loc_wipe (&local->loc); + if (local->loc2.path) + loc_wipe (&local->loc2); + + STACK_UNWIND (frame, local->op_ret, local->op_errno, + &local->stbuf); + } + + return 0; +} + +/* In case of symlink, mknod, the file is created on just first node */ +int32_t +stripe_common_inode_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf) +{ + STACK_UNWIND (frame, op_ret, op_errno, inode, buf); + return 0; +} + +/** + * stripe_stack_unwind_inode_cbk - This is called by the function like, + * link (), symlink (), mkdir (), mknod () + * This creates a inode for new inode. It keeps a list of all + * the inodes received from the child nodes. It is used while + * forwarding any fops to child nodes. + * + */ +int32_t +stripe_stack_unwind_inode_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf) +{ + int32_t callcnt = 0; + stripe_local_t *local = frame->local; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "%s returned error %s", + ((call_frame_t *)cookie)->this->name, + strerror (op_errno)); + local->op_errno = op_errno; + if (op_errno == ENOTCONN) + local->failed = 1; + } + + if (op_ret >= 0) { + local->op_ret = 0; + + if (local->stbuf.st_blksize == 0) { + local->inode = inode; + local->stbuf = *buf; + /* Because st_blocks gets added again */ + local->stbuf.st_blocks = 0; + } + if (FIRST_CHILD(this) == + ((call_frame_t *)cookie)->this) { + local->stbuf.st_ino = buf->st_ino; + local->stbuf.st_mtime = buf->st_mtime; + } + + local->stbuf.st_blocks += buf->st_blocks; + if (local->stbuf.st_size < buf->st_size) + local->stbuf.st_size = buf->st_size; + if (local->stbuf.st_blksize != buf->st_blksize) { + /* TODO: add to blocks in terms of + original block size */ + } + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + if (local->failed) + local->op_ret = -1; + + STACK_UNWIND (frame, local->op_ret, local->op_errno, + local->inode, &local->stbuf); + } + + return 0; +} + +int32_t +stripe_stack_unwind_inode_lookup_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf, + dict_t *dict) +{ + int32_t callcnt = 0; + dict_t *tmp_dict = NULL; + stripe_local_t *local = frame->local; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret == -1) { + if (op_errno != ENOENT) + gf_log (this->name, GF_LOG_WARNING, + "%s returned error %s", + ((call_frame_t *)cookie)->this->name, + strerror (op_errno)); + local->op_errno = op_errno; + if (op_errno == ENOTCONN) + local->failed = 1; + } + + if (op_ret >= 0) { + local->op_ret = 0; + + if (local->stbuf.st_blksize == 0) { + local->inode = inode; + local->stbuf = *buf; + /* Because st_blocks gets added again */ + local->stbuf.st_blocks = 0; + } + if (FIRST_CHILD(this) == + ((call_frame_t *)cookie)->this) { + local->stbuf.st_ino = buf->st_ino; + local->stbuf.st_mtime = buf->st_mtime; + if (local->dict) + dict_unref (local->dict); + local->dict = dict_ref (dict); + } else { + if (!local->dict) + local->dict = dict_ref (dict); + } + local->stbuf.st_blocks += buf->st_blocks; + if (local->stbuf.st_size < buf->st_size) + local->stbuf.st_size = buf->st_size; + if (local->stbuf.st_blksize != buf->st_blksize) { + /* TODO: add to blocks in terms of + original block size */ + } + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + if (local->failed) + local->op_ret = -1; + + tmp_dict = local->dict; + STACK_UNWIND (frame, local->op_ret, local->op_errno, + local->inode, &local->stbuf, local->dict); + if (tmp_dict) + dict_unref (tmp_dict); + } + + return 0; +} + + +/** + * stripe_lookup - + */ +int32_t +stripe_lookup (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + dict_t *xattr_req) +{ + stripe_local_t *local = NULL; + xlator_list_t *trav = NULL; + stripe_private_t *priv = this->private; + char send_lookup_to_all = 0; + + if (!(loc && loc->inode)) { + gf_log (this->name, GF_LOG_ERROR, + "wrong argument, returning EINVAL"); + STACK_UNWIND (frame, -1, EINVAL, NULL, NULL, NULL); + return 0; + } + + /* Initialization */ + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + local->op_ret = -1; + frame->local = local; + + if ((!loc->inode->st_mode) || + S_ISDIR (loc->inode->st_mode) || + S_ISREG (loc->inode->st_mode)) + send_lookup_to_all = 1; + + if (send_lookup_to_all) { + /* Everytime in stripe lookup, all child nodes + should be looked up */ + local->call_count = priv->child_count; + trav = this->children; + while (trav) { + STACK_WIND (frame, + stripe_stack_unwind_inode_lookup_cbk, + trav->xlator, + trav->xlator->fops->lookup, + loc, xattr_req); + trav = trav->next; + } + } else { + local->call_count = 1; + + STACK_WIND (frame, + stripe_stack_unwind_inode_lookup_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, + loc, xattr_req); + } + + return 0; +} + +/** + * stripe_stat - + */ +int32_t +stripe_stat (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + int send_lookup_to_all = 0; + xlator_list_t *trav = NULL; + stripe_local_t *local = NULL; + stripe_private_t *priv = this->private; + + STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + if (S_ISDIR (loc->inode->st_mode) || S_ISREG (loc->inode->st_mode)) + send_lookup_to_all = 1; + + if (!send_lookup_to_all) { + STACK_WIND (frame, + stripe_common_buf_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->stat, + loc); + } else { + /* Initialization */ + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + local->op_ret = -1; + frame->local = local; + local->inode = loc->inode; + local->call_count = priv->child_count; + + trav = this->children; + while (trav) { + STACK_WIND (frame, + stripe_stack_unwind_buf_cbk, + trav->xlator, + trav->xlator->fops->stat, + loc); + trav = trav->next; + } + } + return 0; +} + + +/** + * stripe_chmod - + */ +int32_t +stripe_chmod (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode) +{ + int send_fop_to_all = 0; + xlator_list_t *trav = NULL; + stripe_local_t *local = NULL; + stripe_private_t *priv = this->private; + + STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + if (priv->first_child_down) { + gf_log (this->name, GF_LOG_WARNING, + "First node down, returning ENOTCONN"); + STACK_UNWIND (frame, -1, ENOTCONN, NULL); + return 0; + } + + if (S_ISDIR (loc->inode->st_mode) || S_ISREG (loc->inode->st_mode)) + send_fop_to_all = 1; + + if (!send_fop_to_all) { + STACK_WIND (frame, + stripe_common_buf_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->chmod, + loc, mode); + } else { + /* Initialization */ + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + local->op_ret = -1; + frame->local = local; + local->inode = loc->inode; + local->call_count = priv->child_count; + + trav = this->children; + while (trav) { + STACK_WIND (frame, + stripe_stack_unwind_buf_cbk, + trav->xlator, + trav->xlator->fops->chmod, + loc, mode); + trav = trav->next; + } + } + return 0; +} + + +/** + * stripe_chown - + */ +int32_t +stripe_chown (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + uid_t uid, + gid_t gid) +{ + int send_fop_to_all = 0; + xlator_list_t *trav = NULL; + stripe_local_t *local = NULL; + stripe_private_t *priv = this->private; + + STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + if (priv->first_child_down) { + gf_log (this->name, GF_LOG_WARNING, + "First node down, returning ENOTCONN"); + STACK_UNWIND (frame, -1, ENOTCONN, NULL); + return 0; + } + + if (S_ISDIR (loc->inode->st_mode) || S_ISREG (loc->inode->st_mode)) + send_fop_to_all = 1; + + trav = this->children; + if (!send_fop_to_all) { + STACK_WIND (frame, + stripe_common_buf_cbk, + trav->xlator, + trav->xlator->fops->chown, + loc, uid, gid); + } else { + /* Initialization */ + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + local->op_ret = -1; + frame->local = local; + local->inode = loc->inode; + local->call_count = priv->child_count; + + trav = this->children; + while (trav) { + STACK_WIND (frame, + stripe_stack_unwind_buf_cbk, + trav->xlator, + trav->xlator->fops->chown, + loc, uid, gid); + trav = trav->next; + } + } + + return 0; +} + + +/** + * stripe_statfs_cbk - + */ +int32_t +stripe_statfs_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct statvfs *stbuf) +{ + stripe_local_t *local = (stripe_local_t *)frame->local; + int32_t callcnt; + LOCK(&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret != 0 && op_errno != ENOTCONN) { + local->op_errno = op_errno; + } + if (op_ret == 0) { + struct statvfs *dict_buf = &local->statvfs_buf; + dict_buf->f_bsize = stbuf->f_bsize; + dict_buf->f_frsize = stbuf->f_frsize; + dict_buf->f_blocks += stbuf->f_blocks; + dict_buf->f_bfree += stbuf->f_bfree; + dict_buf->f_bavail += stbuf->f_bavail; + dict_buf->f_files += stbuf->f_files; + dict_buf->f_ffree += stbuf->f_ffree; + dict_buf->f_favail += stbuf->f_favail; + dict_buf->f_fsid = stbuf->f_fsid; + dict_buf->f_flag = stbuf->f_flag; + dict_buf->f_namemax = stbuf->f_namemax; + local->op_ret = 0; + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + STACK_UNWIND (frame, local->op_ret, + local->op_errno, &local->statvfs_buf); + } + + return 0; +} + + +/** + * stripe_statfs - + */ +int32_t +stripe_statfs (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + stripe_local_t *local = NULL; + xlator_list_t *trav = this->children; + + /* Initialization */ + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + local->op_ret = -1; + local->op_errno = ENOTCONN; + frame->local = local; + + local->call_count = ((stripe_private_t *)this->private)->child_count; + while (trav) { + STACK_WIND (frame, + stripe_statfs_cbk, + trav->xlator, + trav->xlator->fops->statfs, + loc); + trav = trav->next; + } + + return 0; +} + + +/** + * stripe_truncate - + */ +int32_t +stripe_truncate (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + off_t offset) +{ + int send_fop_to_all = 0; + stripe_local_t *local = NULL; + stripe_private_t *priv = this->private; + xlator_list_t *trav = this->children; + + STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + if (priv->first_child_down) { + gf_log (this->name, GF_LOG_WARNING, + "First node down, returning ENOTCONN"); + STACK_UNWIND (frame, -1, ENOTCONN, NULL); + return 0; + } + + if (S_ISDIR (loc->inode->st_mode) || S_ISREG (loc->inode->st_mode)) + send_fop_to_all = 1; + + if (!send_fop_to_all) { + STACK_WIND (frame, + stripe_common_buf_cbk, + trav->xlator, + trav->xlator->fops->truncate, + loc, + offset); + } else { + /* Initialization */ + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + local->op_ret = -1; + frame->local = local; + local->inode = loc->inode; + local->call_count = priv->child_count; + + while (trav) { + STACK_WIND (frame, + stripe_stack_unwind_buf_cbk, + trav->xlator, + trav->xlator->fops->truncate, + loc, + offset); + trav = trav->next; + } + } + + return 0; +} + + +/** + * stripe_utimens - + */ +int32_t +stripe_utimens (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + struct timespec tv[2]) +{ + int send_fop_to_all = 0; + stripe_local_t *local = NULL; + stripe_private_t *priv = this->private; + xlator_list_t *trav = this->children; + + STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + if (priv->first_child_down) { + gf_log (this->name, GF_LOG_WARNING, + "First node down, returning ENOTCONN"); + STACK_UNWIND (frame, -1, ENOTCONN, NULL); + return 0; + } + + if (S_ISDIR (loc->inode->st_mode) || S_ISREG (loc->inode->st_mode)) + send_fop_to_all = 1; + + if (!send_fop_to_all) { + STACK_WIND (frame, + stripe_common_buf_cbk, + trav->xlator, + trav->xlator->fops->utimens, + loc, tv); + } else { + /* Initialization */ + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + local->op_ret = -1; + frame->local = local; + local->inode = loc->inode; + local->call_count = priv->child_count; + + while (trav) { + STACK_WIND (frame, + stripe_stack_unwind_buf_cbk, + trav->xlator, + trav->xlator->fops->utimens, + loc, tv); + trav = trav->next; + } + } + return 0; +} + + +int32_t +stripe_first_rename_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + stripe_local_t *local = frame->local; + xlator_list_t *trav = this->children; + + if (op_ret == -1) + { + STACK_UNWIND (frame, op_ret, op_errno, buf); + return 0; + } + + local->op_ret = 0; + local->stbuf = *buf; + local->call_count--; + trav = trav->next; /* Skip first child */ + + while (trav) { + STACK_WIND (frame, + stripe_stack_unwind_buf_cbk, + trav->xlator, + trav->xlator->fops->rename, + &local->loc, &local->loc2); + trav = trav->next; + } + + return 0; +} +/** + * stripe_rename - + */ +int32_t +stripe_rename (call_frame_t *frame, + xlator_t *this, + loc_t *oldloc, + loc_t *newloc) +{ + stripe_private_t *priv = this->private; + stripe_local_t *local = NULL; + xlator_list_t *trav = this->children; + + STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (oldloc); + + if (priv->first_child_down) { + gf_log (this->name, GF_LOG_WARNING, + "First node down, returning ENOTCONN"); + STACK_UNWIND (frame, -1, EIO, NULL); + return 0; + } + + /* Initialization */ + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + local->op_ret = -1; + local->inode = oldloc->inode; + loc_copy (&local->loc, oldloc); + loc_copy (&local->loc2, newloc); + + local->call_count = priv->child_count; + + frame->local = local; + + STACK_WIND (frame, + stripe_first_rename_cbk, + trav->xlator, + trav->xlator->fops->rename, + oldloc, newloc); + + return 0; +} + + +/** + * stripe_access - + */ +int32_t +stripe_access (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t mask) +{ + STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + STACK_WIND (frame, + stripe_common_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->access, + loc, mask); + + return 0; +} + + +/** + * stripe_readlink_cbk - + */ +int32_t +stripe_readlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + const char *path) +{ + STACK_UNWIND (frame, op_ret, op_errno, path); + + return 0; +} + + +/** + * stripe_readlink - + */ +int32_t +stripe_readlink (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + size_t size) +{ + stripe_private_t *priv = this->private; + + STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + if (priv->first_child_down) { + gf_log (this->name, GF_LOG_WARNING, + "First node down, returning ENOTCONN"); + STACK_UNWIND (frame, -1, ENOTCONN, NULL); + return 0; + } + + STACK_WIND (frame, + stripe_readlink_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readlink, + loc, size); + + return 0; +} + + +/** + * stripe_unlink - + */ +int32_t +stripe_unlink (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + int send_fop_to_all = 0; + stripe_local_t *local = NULL; + stripe_private_t *priv = this->private; + xlator_list_t *trav = this->children; + + STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + if (priv->first_child_down) { + gf_log (this->name, GF_LOG_WARNING, + "First node down, returning EIO"); + STACK_UNWIND (frame, -1, EIO); + return 0; + } + + if (S_ISDIR (loc->inode->st_mode) || S_ISREG (loc->inode->st_mode)) + send_fop_to_all = 1; + + if (!send_fop_to_all) { + STACK_WIND (frame, + stripe_common_cbk, + trav->xlator, + trav->xlator->fops->unlink, + loc); + } else { + /* Initialization */ + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + local->op_ret = -1; + frame->local = local; + local->call_count = priv->child_count; + + while (trav) { + STACK_WIND (frame, + stripe_stack_unwind_cbk, + trav->xlator, + trav->xlator->fops->unlink, + loc); + trav = trav->next; + } + } + + return 0; +} + + +int32_t +stripe_first_rmdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + xlator_list_t *trav = this->children; + stripe_local_t *local = frame->local; + + if (op_ret == -1) + { + STACK_UNWIND (frame, op_ret, op_errno); + return 0; + } + + local->call_count--; /* First child successful */ + trav = trav->next; /* Skip first child */ + + while (trav) { + STACK_WIND (frame, + stripe_stack_unwind_cbk, + trav->xlator, + trav->xlator->fops->rmdir, + &local->loc); + trav = trav->next; + } + + return 0; +} + +/** + * stripe_rmdir - + */ +int32_t +stripe_rmdir (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + stripe_local_t *local = NULL; + xlator_list_t *trav = this->children; + stripe_private_t *priv = this->private; + + STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + if (priv->first_child_down) { + gf_log (this->name, GF_LOG_WARNING, + "First node down, returning EIO"); + STACK_UNWIND (frame, -1, EIO); + return 0; + } + + /* Initialization */ + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + local->op_ret = -1; + frame->local = local; + local->inode = loc->inode; + loc_copy (&local->loc, loc); + local->call_count = priv->child_count; + + STACK_WIND (frame, + stripe_first_rmdir_cbk, + trav->xlator, + trav->xlator->fops->rmdir, + loc); + + return 0; +} + + +/** + * stripe_setxattr - + */ +int32_t +stripe_setxattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + dict_t *dict, + int32_t flags) +{ + stripe_private_t *priv = this->private; + + STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + if (priv->first_child_down) { + gf_log (this->name, GF_LOG_WARNING, + "First node down, returning ENOTCONN"); + STACK_UNWIND (frame, -1, ENOTCONN); + return 0; + } + + STACK_WIND (frame, + stripe_common_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, + loc, dict, flags); + + return 0; +} + + +int32_t +stripe_mknod_ifreg_fail_unlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + int32_t callcnt = 0; + stripe_local_t *local = frame->local; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + } + UNLOCK (&frame->lock); + + if (!callcnt) { + loc_wipe (&local->loc); + STACK_UNWIND (frame, local->op_ret, local->op_errno, + local->inode, &local->stbuf); + } + + return 0; +} + + +/** + */ +int32_t +stripe_mknod_ifreg_setxattr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + int32_t callcnt = 0; + stripe_local_t *local = frame->local; + stripe_private_t *priv = this->private; + xlator_list_t *trav = this->children; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "%s returned error %s", + ((call_frame_t *)cookie)->this->name, + strerror (op_errno)); + local->op_ret = -1; + local->op_errno = op_errno; + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + if (local->op_ret == -1) { + local->call_count = priv->child_count; + while (trav) { + STACK_WIND (frame, + stripe_mknod_ifreg_fail_unlink_cbk, + trav->xlator, + trav->xlator->fops->unlink, + &local->loc); + trav = trav->next; + } + return 0; + } + + loc_wipe (&local->loc); + STACK_UNWIND (frame, local->op_ret, local->op_errno, + local->inode, &local->stbuf); + } + return 0; +} + +/** + */ +int32_t +stripe_mknod_ifreg_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf) +{ + int ret = 0; + int32_t callcnt = 0; + stripe_local_t *local = frame->local; + stripe_private_t *priv = this->private; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "%s returned error %s", + ((call_frame_t *)cookie)->this->name, + strerror (op_errno)); + local->failed = 1; + local->op_errno = op_errno; + } + + if (op_ret >= 0) { + local->op_ret = op_ret; + /* Get the mapping in inode private */ + /* Get the stat buf right */ + if (local->stbuf.st_blksize == 0) { + local->stbuf = *buf; + /* Because st_blocks gets added again */ + local->stbuf.st_blocks = 0; + } + + /* Always, pass the inode number of first child + to the above layer */ + if (FIRST_CHILD(this) == + ((call_frame_t *)cookie)->this) + local->stbuf.st_ino = buf->st_ino; + + local->stbuf.st_blocks += buf->st_blocks; + if (local->stbuf.st_size < buf->st_size) + local->stbuf.st_size = buf->st_size; + if (local->stbuf.st_blksize != buf->st_blksize) { + /* TODO: add to blocks in terms of + original block size */ + } + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + if (local->failed) + local->op_ret = -1; + + if ((local->op_ret != -1) && priv->xattr_supported) { + /* Send a setxattr request to nodes where the + files are created */ + int32_t index = 0; + char size_key[256] = {0,}; + char index_key[256] = {0,}; + char count_key[256] = {0,}; + xlator_list_t *trav = this->children; + dict_t *dict = NULL; + + sprintf (size_key, + "trusted.%s.stripe-size", this->name); + sprintf (count_key, + "trusted.%s.stripe-count", this->name); + sprintf (index_key, + "trusted.%s.stripe-index", this->name); + + local->call_count = priv->child_count; + + while (trav) { + dict = get_new_dict (); + dict_ref (dict); + /* TODO: check return value */ + ret = dict_set_int64 (dict, size_key, + local->stripe_size); + ret = dict_set_int32 (dict, count_key, + local->call_count); + ret = dict_set_int32 (dict, index_key, index); + + STACK_WIND (frame, + stripe_mknod_ifreg_setxattr_cbk, + trav->xlator, + trav->xlator->fops->setxattr, + &local->loc, dict, 0); + + dict_unref (dict); + index++; + trav = trav->next; + } + } else { + /* Create itself has failed.. so return + without setxattring */ + loc_wipe (&local->loc); + STACK_UNWIND (frame, local->op_ret, local->op_errno, + local->inode, &local->stbuf); + } + } + + return 0; +} + + +/** + * stripe_mknod - + */ +int32_t +stripe_mknod (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode, + dev_t rdev) +{ + stripe_private_t *priv = this->private; + stripe_local_t *local = NULL; + xlator_list_t *trav = NULL; + + if (priv->first_child_down) { + gf_log (this->name, GF_LOG_WARNING, + "First node down, returning EIO"); + STACK_UNWIND (frame, -1, EIO, NULL, NULL); + return 0; + } + + if (S_ISREG(mode)) { + /* NOTE: on older kernels (older than 2.6.9), + creat() fops is sent as mknod() + open(). Hence handling + S_IFREG files is necessary */ + if (priv->nodes_down) { + gf_log (this->name, GF_LOG_WARNING, + "Some node down, returning EIO"); + STACK_UNWIND (frame, -1, EIO, loc->inode, NULL); + return 0; + } + + /* Initialization */ + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + local->op_ret = -1; + local->op_errno = ENOTCONN; + local->stripe_size = stripe_get_matching_bs (loc->path, + priv->pattern, + priv->block_size); + frame->local = local; + local->inode = loc->inode; + loc_copy (&local->loc, loc); + + /* Everytime in stripe lookup, all child nodes should + be looked up */ + local->call_count = + ((stripe_private_t *)this->private)->child_count; + + trav = this->children; + while (trav) { + STACK_WIND (frame, + stripe_mknod_ifreg_cbk, + trav->xlator, + trav->xlator->fops->mknod, + loc, mode, rdev); + trav = trav->next; + } + + /* This case is handled, no need to continue further. */ + return 0; + } + + + STACK_WIND (frame, + stripe_common_inode_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mknod, + loc, mode, rdev); + + return 0; +} + + +/** + * stripe_mkdir - + */ +int32_t +stripe_mkdir (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode) +{ + stripe_private_t *priv = this->private; + stripe_local_t *local = NULL; + xlator_list_t *trav = NULL; + + if (priv->first_child_down) { + gf_log (this->name, GF_LOG_WARNING, + "First node down, returning EIO"); + STACK_UNWIND (frame, -1, EIO, NULL, NULL); + return 0; + } + + /* Initialization */ + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + local->op_ret = -1; + local->call_count = priv->child_count; + frame->local = local; + + /* Everytime in stripe lookup, all child nodes should be looked up */ + trav = this->children; + while (trav) { + STACK_WIND (frame, + stripe_stack_unwind_inode_cbk, + trav->xlator, + trav->xlator->fops->mkdir, + loc, mode); + trav = trav->next; + } + + return 0; +} + + +/** + * stripe_symlink - + */ +int32_t +stripe_symlink (call_frame_t *frame, + xlator_t *this, + const char *linkpath, + loc_t *loc) +{ + stripe_private_t *priv = this->private; + + if (priv->first_child_down) { + gf_log (this->name, GF_LOG_WARNING, + "First node down, returning EIO"); + STACK_UNWIND (frame, -1, EIO, NULL, NULL); + return 0; + } + + /* send symlink to only first node */ + STACK_WIND (frame, + stripe_common_inode_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->symlink, + linkpath, loc); + + return 0; +} + +/** + * stripe_link - + */ +int32_t +stripe_link (call_frame_t *frame, + xlator_t *this, + loc_t *oldloc, + loc_t *newloc) +{ + int send_fop_to_all = 0; + stripe_private_t *priv = this->private; + stripe_local_t *local = NULL; + xlator_list_t *trav = this->children; + + STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (oldloc); + + if (priv->first_child_down) { + gf_log (this->name, GF_LOG_WARNING, + "First node down, returning EIO"); + STACK_UNWIND (frame, -1, EIO, NULL, NULL); + return 0; + } + + + if (S_ISREG (oldloc->inode->st_mode)) + send_fop_to_all = 1; + + if (!send_fop_to_all) { + STACK_WIND (frame, + stripe_common_inode_cbk, + trav->xlator, + trav->xlator->fops->link, + oldloc, newloc); + } else { + /* Initialization */ + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + local->op_ret = -1; + frame->local = local; + local->call_count = priv->child_count; + + /* Everytime in stripe lookup, all child + nodes should be looked up */ + while (trav) { + STACK_WIND (frame, + stripe_stack_unwind_inode_cbk, + trav->xlator, + trav->xlator->fops->link, + oldloc, newloc); + trav = trav->next; + } + } + + return 0; +} + +int32_t +stripe_create_fail_unlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + int32_t callcnt = 0; + fd_t *lfd = NULL; + stripe_local_t *local = frame->local; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + } + UNLOCK (&frame->lock); + + if (!callcnt) { + lfd = local->fd; + loc_wipe (&local->loc); + STACK_UNWIND (frame, local->op_ret, local->op_errno, + local->fd, local->inode, &local->stbuf); + fd_unref (lfd); + } + return 0; +} + + +/** + * stripe_create_setxattr_cbk - + */ +int32_t +stripe_create_setxattr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + fd_t *lfd = NULL; + int32_t callcnt = 0; + stripe_local_t *local = frame->local; + stripe_private_t *priv = this->private; + xlator_list_t *trav = this->children; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "%s returned error %s", + ((call_frame_t *)cookie)->this->name, + strerror (op_errno)); + local->op_ret = -1; + local->op_errno = op_errno; + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + if (local->op_ret == -1) { + local->call_count = priv->child_count; + while (trav) { + STACK_WIND (frame, + stripe_create_fail_unlink_cbk, + trav->xlator, + trav->xlator->fops->unlink, + &local->loc); + trav = trav->next; + } + + return 0; + } + + lfd = local->fd; + loc_wipe (&local->loc); + STACK_UNWIND (frame, local->op_ret, local->op_errno, + local->fd, local->inode, &local->stbuf); + fd_unref (lfd); + } + + return 0; +} + +/** + * stripe_create_cbk - + */ +int32_t +stripe_create_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd, + inode_t *inode, + struct stat *buf) +{ + int32_t callcnt = 0; + stripe_local_t *local = frame->local; + stripe_private_t *priv = this->private; + fd_t *lfd = NULL; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "%s returned error %s", + ((call_frame_t *)cookie)->this->name, + strerror (op_errno)); + local->failed = 1; + local->op_errno = op_errno; + } + + if (op_ret >= 0) { + local->op_ret = op_ret; + /* Get the mapping in inode private */ + /* Get the stat buf right */ + if (local->stbuf.st_blksize == 0) { + local->stbuf = *buf; + /* Because st_blocks gets added again */ + local->stbuf.st_blocks = 0; + } + + /* Always, pass the inode number of first + child to the above layer */ + if (FIRST_CHILD(this) == + ((call_frame_t *)cookie)->this) + local->stbuf.st_ino = buf->st_ino; + + local->stbuf.st_blocks += buf->st_blocks; + if (local->stbuf.st_size < buf->st_size) + local->stbuf.st_size = buf->st_size; + if (local->stbuf.st_blksize != buf->st_blksize) { + /* TODO: add to blocks in terms of + original block size */ + } + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + if (local->failed) + local->op_ret = -1; + + if (local->op_ret >= 0) { + fd_ctx_set (local->fd, this, local->stripe_size); + } + + if ((local->op_ret != -1) && + local->stripe_size && priv->xattr_supported) { + /* Send a setxattr request to nodes where + the files are created */ + int ret = 0; + int32_t index = 0; + char size_key[256] = {0,}; + char index_key[256] = {0,}; + char count_key[256] = {0,}; + xlator_list_t *trav = this->children; + dict_t *dict = NULL; + + sprintf (size_key, + "trusted.%s.stripe-size", this->name); + sprintf (count_key, + "trusted.%s.stripe-count", this->name); + sprintf (index_key, + "trusted.%s.stripe-index", this->name); + + local->call_count = priv->child_count; + + while (trav) { + dict = get_new_dict (); + dict_ref (dict); + + /* TODO: check return values */ + ret = dict_set_int64 (dict, size_key, + local->stripe_size); + ret = dict_set_int32 (dict, count_key, + local->call_count); + ret = dict_set_int32 (dict, index_key, index); + + STACK_WIND (frame, + stripe_create_setxattr_cbk, + trav->xlator, + trav->xlator->fops->setxattr, + &local->loc, + dict, + 0); + + dict_unref (dict); + index++; + trav = trav->next; + } + } else { + /* Create itself has failed.. so return + without setxattring */ + lfd = local->fd; + loc_wipe (&local->loc); + STACK_UNWIND (frame, local->op_ret, local->op_errno, + local->fd, local->inode, &local->stbuf); + + fd_unref (lfd); + } + } + + return 0; +} + + +/** + * stripe_create - If a block-size is specified for the 'name', create the + * file in all the child nodes. If not, create it in only first child. + * + * @name- complete path of the file to be created. + */ +int32_t +stripe_create (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flags, + mode_t mode, + fd_t *fd) +{ + stripe_private_t *priv = this->private; + stripe_local_t *local = NULL; + xlator_list_t *trav = NULL; + + /* files created in O_APPEND mode does not allow lseek() on fd */ + flags &= ~O_APPEND; + + if (priv->first_child_down || priv->nodes_down) { + gf_log (this->name, GF_LOG_WARNING, + "First node down, returning EIO"); + STACK_UNWIND (frame, -1, EIO, fd, loc->inode, NULL); + return 0; + } + + /* Initialization */ + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + local->op_ret = -1; + local->op_errno = ENOTCONN; + local->stripe_size = stripe_get_matching_bs (loc->path, + priv->pattern, + priv->block_size); + frame->local = local; + local->inode = loc->inode; + loc_copy (&local->loc, loc); + local->fd = fd_ref (fd); + + local->call_count = ((stripe_private_t *)this->private)->child_count; + + trav = this->children; + while (trav) { + STACK_WIND (frame, + stripe_create_cbk, + trav->xlator, + trav->xlator->fops->create, + loc, flags, mode, fd); + trav = trav->next; + } + + return 0; +} + +/** + * stripe_open_cbk - + */ +int32_t +stripe_open_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd) +{ + int32_t callcnt = 0; + stripe_local_t *local = frame->local; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret == -1) { + local->failed = 1; + gf_log (this->name, GF_LOG_WARNING, + "%s returned error %s", + ((call_frame_t *)cookie)->this->name, + strerror (op_errno)); + local->op_ret = -1; + local->op_errno = op_errno; + } + + if (op_ret >= 0) + local->op_ret = op_ret; + } + UNLOCK (&frame->lock); + + if (!callcnt) { + if (local->failed) + local->op_ret = -1; + + if (local->op_ret >= 0) { + fd_ctx_set (local->fd, this, local->stripe_size); + } + loc_wipe (&local->loc); + STACK_UNWIND (frame, local->op_ret, local->op_errno, fd); + } + + return 0; +} + + +/** + * stripe_getxattr_cbk - + */ +int32_t +stripe_open_getxattr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *dict) +{ + int32_t callcnt = 0; + stripe_local_t *local = frame->local; + xlator_list_t *trav = this->children; + stripe_private_t *priv = this->private; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "%s returned error %s", + ((call_frame_t *)cookie)->this->name, + strerror (op_errno)); + local->op_ret = -1; + local->op_errno = op_errno; + if (op_errno == ENOTCONN) + local->failed = 1; + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + if (!local->failed && (local->op_ret != -1)) { + /* If getxattr doesn't fails, call open */ + char size_key[256] = {0,}; + data_t *stripe_size_data = NULL; + + sprintf (size_key, + "trusted.%s.stripe-size", this->name); + stripe_size_data = dict_get (dict, size_key); + + if (stripe_size_data) { + local->stripe_size = + data_to_int64 (stripe_size_data); + /* + if (local->stripe_size != priv->block_size) { + gf_log (this->name, GF_LOG_WARNING, + "file(%s) is having different " + "block-size", local->loc.path); + } + */ + } else { + /* if the file was created using earlier + versions of stripe */ + gf_log (this->name, GF_LOG_CRITICAL, + "[CRITICAL] Seems like file(%s) " + "created using earlier version", + local->loc.path); + } + } + + local->call_count = priv->child_count; + + while (trav) { + STACK_WIND (frame, + stripe_open_cbk, + trav->xlator, + trav->xlator->fops->open, + &local->loc, local->flags, local->fd); + trav = trav->next; + } + } + + return 0; +} + +/** + * stripe_open - + */ +int32_t +stripe_open (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flags, + fd_t *fd) +{ + stripe_local_t *local = NULL; + stripe_private_t *priv = this->private; + xlator_list_t *trav = this->children; + + STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + /* files opened in O_APPEND mode does not allow lseek() on fd */ + flags &= ~O_APPEND; + + if (priv->first_child_down) { + gf_log (this->name, GF_LOG_WARNING, + "First node down, returning ENOTCONN"); + STACK_UNWIND (frame, -1, ENOTCONN, NULL); + return 0; + } + + /* Initialization */ + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + local->fd = fd; + frame->local = local; + local->inode = loc->inode; + loc_copy (&local->loc, loc); + + /* Striped files */ + local->flags = flags; + local->call_count = priv->child_count; + local->stripe_size = stripe_get_matching_bs (loc->path, + priv->pattern, + priv->block_size); + + if (priv->xattr_supported) { + while (trav) { + STACK_WIND (frame, + stripe_open_getxattr_cbk, + trav->xlator, + trav->xlator->fops->getxattr, + loc, NULL); + trav = trav->next; + } + } else { + while (trav) { + STACK_WIND (frame, + stripe_open_cbk, + trav->xlator, + trav->xlator->fops->open, + &local->loc, local->flags, local->fd); + trav = trav->next; + } + } + + return 0; +} + +/** + * stripe_opendir_cbk - + */ +int32_t +stripe_opendir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd) +{ + int32_t callcnt = 0; + stripe_local_t *local = frame->local; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "%s returned error %s", + ((call_frame_t *)cookie)->this->name, + strerror (op_errno)); + local->op_ret = -1; + local->failed = 1; + local->op_errno = op_errno; + } + + if (op_ret >= 0) + local->op_ret = op_ret; + } + UNLOCK (&frame->lock); + + if (!callcnt) { + STACK_UNWIND (frame, local->op_ret, + local->op_errno, local->fd); + } + + return 0; +} + + +/** + * stripe_opendir - + */ +int32_t +stripe_opendir (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + fd_t *fd) +{ + stripe_local_t *local = NULL; + stripe_private_t *priv = this->private; + xlator_list_t *trav = this->children; + + STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + if (priv->first_child_down) { + gf_log (this->name, GF_LOG_WARNING, + "First node down, returning EIO"); + STACK_UNWIND (frame, -1, EIO, NULL); + return 0; + } + + /* Initialization */ + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + frame->local = local; + local->inode = loc->inode; + local->fd = fd; + local->call_count = priv->child_count; + + while (trav) { + STACK_WIND (frame, + stripe_opendir_cbk, + trav->xlator, + trav->xlator->fops->opendir, + loc, fd); + trav = trav->next; + } + + return 0; +} + + +/** + * stripe_getxattr_cbk - + */ +int32_t +stripe_getxattr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *value) +{ + STACK_UNWIND (frame, op_ret, op_errno, value); + return 0; +} + + +/** + * stripe_getxattr - + */ +int32_t +stripe_getxattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + const char *name) +{ + STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + STACK_WIND (frame, + stripe_getxattr_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getxattr, + loc, name); + + return 0; +} + +/** + * stripe_removexattr - + */ +int32_t +stripe_removexattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + const char *name) +{ + stripe_private_t *priv = this->private; + + STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + if (priv->first_child_down) { + gf_log (this->name, GF_LOG_WARNING, + "First node down, returning ENOTCONN"); + STACK_UNWIND (frame, -1, ENOTCONN, NULL); + return 0; + } + + STACK_WIND (frame, + stripe_common_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->removexattr, + loc, name); + + return 0; +} + + +/** + * stripe_lk_cbk - + */ +int32_t +stripe_lk_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct flock *lock) +{ + int32_t callcnt = 0; + stripe_local_t *local = frame->local; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + if (op_ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "%s returned error %s", + ((call_frame_t *)cookie)->this->name, + strerror (op_errno)); + local->op_errno = op_errno; + if (op_errno == ENOTCONN) + local->failed = 1; + } + if (op_ret == 0 && local->op_ret == -1) { + /* First successful call, copy the *lock */ + local->op_ret = 0; + local->lock = *lock; + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + if (local->failed) + local->op_ret = -1; + STACK_UNWIND (frame, local->op_ret, + local->op_errno, &local->lock); + } + return 0; +} + + +/** + * stripe_lk - + */ +int32_t +stripe_lk (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t cmd, + struct flock *lock) +{ + stripe_local_t *local = NULL; + xlator_list_t *trav = this->children; + stripe_private_t *priv = this->private; + + STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (fd); + /* Initialization */ + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + local->op_ret = -1; + frame->local = local; + + local->call_count = priv->child_count; + + while (trav) { + STACK_WIND (frame, + stripe_lk_cbk, + trav->xlator, + trav->xlator->fops->lk, + fd, cmd, lock); + trav = trav->next; + } + + return 0; +} + +/** + * stripe_writedir - + */ +int32_t +stripe_setdents (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t flags, + dir_entry_t *entries, + int32_t count) +{ + stripe_local_t *local = NULL; + stripe_private_t *priv = this->private; + xlator_list_t *trav = this->children; + + STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (fd); + + /* Initialization */ + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + local->op_ret = -1; + frame->local = local; + local->call_count = priv->child_count; + + while (trav) { + STACK_WIND (frame, + stripe_stack_unwind_cbk, + trav->xlator, + trav->xlator->fops->setdents, + fd, flags, entries, count); + trav = trav->next; + } + + return 0; +} + + +/** + * stripe_flush - + */ +int32_t +stripe_flush (call_frame_t *frame, + xlator_t *this, + fd_t *fd) +{ + stripe_local_t *local = NULL; + stripe_private_t *priv = this->private; + xlator_list_t *trav = this->children; + + STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (fd); + + /* Initialization */ + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + local->op_ret = -1; + frame->local = local; + local->call_count = priv->child_count; + + while (trav) { + STACK_WIND (frame, + stripe_stack_unwind_cbk, + trav->xlator, + trav->xlator->fops->flush, + fd); + trav = trav->next; + } + + return 0; +} + + +/** + * stripe_close - + */ +int32_t +stripe_release (xlator_t *this, + fd_t *fd) +{ + return 0; +} + + +/** + * stripe_fsync - + */ +int32_t +stripe_fsync (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t flags) +{ + stripe_local_t *local = NULL; + stripe_private_t *priv = this->private; + xlator_list_t *trav = this->children; + + STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (fd); + + /* Initialization */ + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + local->op_ret = -1; + frame->local = local; + local->call_count = priv->child_count; + + while (trav) { + STACK_WIND (frame, + stripe_stack_unwind_cbk, + trav->xlator, + trav->xlator->fops->fsync, + fd, flags); + trav = trav->next; + } + + return 0; +} + + +/** + * stripe_fstat - + */ +int32_t +stripe_fstat (call_frame_t *frame, + xlator_t *this, + fd_t *fd) +{ + stripe_local_t *local = NULL; + stripe_private_t *priv = this->private; + xlator_list_t *trav = this->children; + + STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (fd); + + /* Initialization */ + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + local->op_ret = -1; + frame->local = local; + local->inode = fd->inode; + local->call_count = priv->child_count; + + while (trav) { + STACK_WIND (frame, + stripe_stack_unwind_buf_cbk, + trav->xlator, + trav->xlator->fops->fstat, + fd); + trav = trav->next; + } + + return 0; +} + + +/** + * stripe_fchmod - + */ +int32_t +stripe_fchmod (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + mode_t mode) +{ + stripe_local_t *local = NULL; + stripe_private_t *priv = this->private; + xlator_list_t *trav = this->children; + + STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (fd); + + /* Initialization */ + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + local->op_ret = -1; + frame->local = local; + local->inode = fd->inode; + local->call_count = priv->child_count; + + while (trav) { + STACK_WIND (frame, + stripe_stack_unwind_buf_cbk, + trav->xlator, + trav->xlator->fops->fchmod, + fd, mode); + trav = trav->next; + } + + return 0; +} + + +/** + * stripe_fchown - + */ +int32_t +stripe_fchown (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + uid_t uid, + gid_t gid) +{ + stripe_local_t *local = NULL; + stripe_private_t *priv = this->private; + xlator_list_t *trav = this->children; + + STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (fd); + + /* Initialization */ + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + local->op_ret = -1; + frame->local = local; + local->inode = fd->inode; + local->call_count = priv->child_count; + + while (trav) { + STACK_WIND (frame, + stripe_stack_unwind_buf_cbk, + trav->xlator, + trav->xlator->fops->fchown, + fd, uid, gid); + trav = trav->next; + } + + return 0; +} + + +/** + * stripe_ftruncate - + */ +int32_t +stripe_ftruncate (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + off_t offset) +{ + stripe_local_t *local = NULL; + stripe_private_t *priv = this->private; + xlator_list_t *trav = this->children; + + STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (fd); + + /* Initialization */ + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + local->op_ret = -1; + frame->local = local; + local->inode = fd->inode; + local->call_count = priv->child_count; + + while (trav) { + STACK_WIND (frame, + stripe_stack_unwind_buf_cbk, + trav->xlator, + trav->xlator->fops->ftruncate, + fd, offset); + trav = trav->next; + } + + return 0; +} + + +/** + * stripe_releasedir - + */ +int32_t +stripe_releasedir (xlator_t *this, + fd_t *fd) +{ + return 0; +} + + +/** + * stripe_fsyncdir - + */ +int32_t +stripe_fsyncdir (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t flags) +{ + stripe_local_t *local = NULL; + stripe_private_t *priv = this->private; + xlator_list_t *trav = this->children; + + STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (fd); + + /* Initialization */ + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + local->op_ret = -1; + frame->local = local; + local->call_count = priv->child_count; + + while (trav) { + STACK_WIND (frame, + stripe_stack_unwind_cbk, + trav->xlator, + trav->xlator->fops->fsyncdir, + fd, + flags); + trav = trav->next; + } + + return 0; +} + + +/** + * stripe_single_readv_cbk - This function is used as return fn, when the + * file name doesn't match the pattern specified for striping. + */ +int32_t +stripe_single_readv_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct iovec *vector, + int32_t count, + struct stat *stbuf) +{ + STACK_UNWIND (frame, op_ret, op_errno, vector, count, stbuf); + return 0; +} + +/** + * stripe_readv_cbk - get all the striped reads, and order it properly, send it + * to above layer after putting it in a single vector. + */ +int32_t +stripe_readv_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct iovec *vector, + int32_t count, + struct stat *stbuf) +{ + int32_t index = 0; + int32_t callcnt = 0; + call_frame_t *main_frame = NULL; + stripe_local_t *main_local = NULL; + stripe_local_t *local = frame->local; + + index = local->node_index; + main_frame = local->orig_frame; + main_local = main_frame->local; + + LOCK (&main_frame->lock); + { + main_local->replies[index].op_ret = op_ret; + main_local->replies[index].op_errno = op_errno; + if (op_ret >= 0) { + main_local->replies[index].stbuf = *stbuf; + main_local->replies[index].count = count; + main_local->replies[index].vector = + iov_dup (vector, count); + + if (frame->root->rsp_refs) + dict_copy (frame->root->rsp_refs, + main_frame->root->rsp_refs); + } + callcnt = ++main_local->call_count; + } + UNLOCK(&main_frame->lock); + + if (callcnt == main_local->wind_count) { + int32_t final_count = 0; + struct iovec *final_vec = NULL; + struct stat tmp_stbuf = {0,}; + dict_t *refs = main_frame->root->rsp_refs; + + op_ret = 0; + memcpy (&tmp_stbuf, &main_local->replies[0].stbuf, + sizeof (struct stat)); + for (index=0; index < main_local->wind_count; index++) { + /* TODO: check whether each stripe returned 'expected' + * number of bytes + */ + if (main_local->replies[index].op_ret == -1) { + op_ret = -1; + op_errno = main_local->replies[index].op_errno; + break; + } + op_ret += main_local->replies[index].op_ret; + final_count += main_local->replies[index].count; + /* TODO: Do I need to send anything more in stbuf? */ + if (tmp_stbuf.st_size < + main_local->replies[index].stbuf.st_size) { + tmp_stbuf.st_size = + main_local->replies[index].stbuf.st_size; + } + } + if (op_ret != -1) { + final_vec = CALLOC (final_count, + sizeof (struct iovec)); + ERR_ABORT (final_vec); + final_count = 0; + + for (index=0; + index < main_local->wind_count; index++) { + memcpy (final_vec + final_count, + main_local->replies[index].vector, + (main_local->replies[index].count * + sizeof (struct iovec))); + final_count += + main_local->replies[index].count; + + free (main_local->replies[index].vector); + } + } else { + final_vec = NULL; + final_count = 0; + } + /* */ + FREE (main_local->replies); + refs = main_frame->root->rsp_refs; + STACK_UNWIND (main_frame, op_ret, op_errno, + final_vec, final_count, &tmp_stbuf); + + dict_unref (refs); + if (final_vec) + free (final_vec); + } + + STACK_DESTROY (frame->root); + return 0; +} + +/** + * stripe_readv - + */ +int32_t +stripe_readv (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, + off_t offset) +{ + int32_t index = 0; + int32_t num_stripe = 0; + size_t frame_size = 0; + off_t rounded_end = 0; + uint64_t stripe_size = 0; + off_t rounded_start = 0; + off_t frame_offset = offset; + stripe_local_t *local = NULL; + call_frame_t *rframe = NULL; + stripe_local_t *rlocal = NULL; + xlator_list_t *trav = this->children; + stripe_private_t *priv = this->private; + + fd_ctx_get (fd, this, &stripe_size); + if (!stripe_size) { + STACK_UNWIND (frame, -1, EINVAL, NULL, 0, NULL); + return 0; + } + + /* The file is stripe across the child nodes. Send the read request + * to the child nodes appropriately after checking which region of + * the file is in which child node. Always '0-<stripe_size>' part of + * the file resides in the first child. + */ + rounded_start = floor (offset, stripe_size); + rounded_end = roof (offset+size, stripe_size); + num_stripe = (rounded_end - rounded_start) / stripe_size; + + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + local->wind_count = num_stripe; + frame->local = local; + frame->root->rsp_refs = dict_ref (get_new_dict ()); + + /* This is where all the vectors should be copied. */ + local->replies = CALLOC (1, num_stripe * + sizeof (struct readv_replies)); + ERR_ABORT (local->replies); + + for (index = 0; + index < ((offset / stripe_size) % priv->child_count); + index++) { + trav = trav->next; + } + + for (index = 0; index < num_stripe; index++) { + rframe = copy_frame (frame); + rlocal = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (rlocal); + + frame_size = min (roof (frame_offset+1, stripe_size), + (offset + size)) - frame_offset; + + rlocal->node_index = index; + rlocal->orig_frame = frame; + rframe->local = rlocal; + STACK_WIND (rframe, + stripe_readv_cbk, + trav->xlator, + trav->xlator->fops->readv, + fd, frame_size, frame_offset); + + frame_offset += frame_size; + + trav = trav->next ? trav->next : this->children; + } + + return 0; +} + + +/** + * stripe_writev_cbk - + */ +int32_t +stripe_writev_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *stbuf) +{ + int32_t callcnt = 0; + stripe_local_t *local = frame->local; + LOCK(&frame->lock); + { + callcnt = ++local->call_count; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "%s returned error %s", + ((call_frame_t *)cookie)->this->name, + strerror (op_errno)); + local->op_errno = op_errno; + local->op_ret = -1; + } + if (op_ret >= 0) { + local->op_ret += op_ret; + local->stbuf = *stbuf; + } + } + UNLOCK (&frame->lock); + + if ((callcnt == local->wind_count) && local->unwind) { + STACK_UNWIND (frame, local->op_ret, + local->op_errno, &local->stbuf); + } + return 0; +} + + +/** + * stripe_single_writev_cbk - + */ +int32_t +stripe_single_writev_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *stbuf) +{ + STACK_UNWIND (frame, op_ret, op_errno, stbuf); + return 0; +} +/** + * stripe_writev - + */ +int32_t +stripe_writev (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + struct iovec *vector, + int32_t count, + off_t offset) +{ + int32_t idx = 0; + int32_t total_size = 0; + int32_t offset_offset = 0; + int32_t remaining_size = 0; + int32_t tmp_count = count; + off_t fill_size = 0; + uint64_t stripe_size = 0; + struct iovec *tmp_vec = vector; + stripe_private_t *priv = this->private; + stripe_local_t *local = NULL; + xlator_list_t *trav = NULL; + + fd_ctx_get (fd, this, &stripe_size); + if (!stripe_size) { + STACK_UNWIND (frame, -1, EINVAL, NULL); + return 0; + } + + /* File has to be stripped across the child nodes */ + for (idx = 0; idx< count; idx ++) { + total_size += tmp_vec[idx].iov_len; + } + remaining_size = total_size; + + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + frame->local = local; + local->stripe_size = stripe_size; + + while (1) { + /* Send striped chunk of the vector to child + nodes appropriately. */ + trav = this->children; + + idx = (((offset + offset_offset) / + local->stripe_size) % priv->child_count); + while (idx) { + trav = trav->next; + idx--; + } + fill_size = (local->stripe_size - + ((offset + offset_offset) % local->stripe_size)); + if (fill_size > remaining_size) + fill_size = remaining_size; + + remaining_size -= fill_size; + + tmp_count = iov_subset (vector, count, offset_offset, + offset_offset + fill_size, NULL); + tmp_vec = CALLOC (tmp_count, sizeof (struct iovec)); + ERR_ABORT (tmp_vec); + tmp_count = iov_subset (vector, count, offset_offset, + offset_offset + fill_size, tmp_vec); + + local->wind_count++; + if (remaining_size == 0) + local->unwind = 1; + + STACK_WIND(frame, + stripe_writev_cbk, + trav->xlator, + trav->xlator->fops->writev, + fd, tmp_vec, tmp_count, offset + offset_offset); + FREE (tmp_vec); + offset_offset += fill_size; + if (remaining_size == 0) + break; + } + + return 0; +} + + + +/* Management operations */ + +/** + * stripe_stats_cbk - Add all the fields received from different clients. + * Once all the clients return, send stats to above layer. + * + */ +int32_t +stripe_stats_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct xlator_stats *stats) +{ + int32_t callcnt = 0; + stripe_local_t *local = frame->local; + + LOCK(&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "%s returned error %s", + ((call_frame_t *)cookie)->this->name, + strerror (op_errno)); + local->op_ret = -1; + local->op_errno = op_errno; + } + if (op_ret == 0) { + if (local->op_ret == -2) { + /* This is to make sure this is the + first time */ + local->stats = *stats; + local->op_ret = 0; + } else { + local->stats.nr_files += stats->nr_files; + local->stats.free_disk += stats->free_disk; + local->stats.disk_usage += stats->disk_usage; + local->stats.nr_clients += stats->nr_clients; + } + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + STACK_UNWIND (frame, local->op_ret, local->op_errno, + &local->stats); + } + + return 0; +} + +/** + * stripe_stats - + */ +int32_t +stripe_stats (call_frame_t *frame, + xlator_t *this, + int32_t flags) +{ + stripe_local_t *local = NULL; + xlator_list_t *trav = this->children; + + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + frame->local = local; + local->op_ret = -2; /* to be used as a flag in _cbk */ + local->call_count = ((stripe_private_t*)this->private)->child_count; + while (trav) { + STACK_WIND (frame, + stripe_stats_cbk, + trav->xlator, + trav->xlator->mops->stats, + flags); + trav = trav->next; + } + return 0; +} + +/** + * notify + */ +int32_t +notify (xlator_t *this, + int32_t event, + void *data, + ...) +{ + stripe_private_t *priv = this->private; + int down_client = 0; + int i = 0; + + if (!priv) + return 0; + + switch (event) + { + case GF_EVENT_CHILD_UP: + { + /* get an index number to set */ + for (i = 0; i < priv->child_count; i++) { + if (data == priv->xl_array[i]) + break; + } + priv->state[i] = 1; + for (i = 0; i < priv->child_count; i++) { + if (!priv->state[i]) + down_client++; + } + + LOCK (&priv->lock); + { + priv->nodes_down = down_client; + + if (data == FIRST_CHILD (this)) { + priv->first_child_down = 0; + default_notify (this, event, data); + } + } + UNLOCK (&priv->lock); + } + break; + case GF_EVENT_CHILD_DOWN: + { + /* get an index number to set */ + for (i = 0; i < priv->child_count; i++) { + if (data == priv->xl_array[i]) + break; + } + priv->state[i] = 0; + for (i = 0; i < priv->child_count; i++) { + if (!priv->state[i]) + down_client++; + } + + LOCK (&priv->lock); + { + priv->nodes_down = down_client; + + if (data == FIRST_CHILD (this)) { + priv->first_child_down = 1; + default_notify (this, event, data); + } + } + UNLOCK (&priv->lock); + } + break; + + default: + { + /* */ + default_notify (this, event, data); + } + break; + } + + return 0; +} +/** + * init - This function is called when xlator-graph gets initialized. + * The option given in volfiles are parsed here. + * @this - + */ +int32_t +init (xlator_t *this) +{ + stripe_private_t *priv = NULL; + xlator_list_t *trav = NULL; + data_t *data = NULL; + int32_t count = 0; + + trav = this->children; + while (trav) { + count++; + trav = trav->next; + } + + if (!count) { + gf_log (this->name, GF_LOG_ERROR, + "stripe configured without \"subvolumes\" option. " + "exiting"); + return -1; + } + + if (!this->parents) { + gf_log (this->name, GF_LOG_WARNING, + "dangling volume. check volfile "); + } + + priv = CALLOC (1, sizeof (stripe_private_t)); + ERR_ABORT (priv); + priv->xl_array = CALLOC (1, count * sizeof (xlator_t *)); + ERR_ABORT (priv->xl_array); + priv->child_count = count; + LOCK_INIT (&priv->lock); + + trav = this->children; + count = 0; + while (trav) { + priv->xl_array[count++] = trav->xlator; + trav = trav->next; + } + + if (count > 256) { + gf_log (this->name, GF_LOG_ERROR, + "maximum number of stripe subvolumes supported " + "is 256"); + return -1; + } + + priv->block_size = (128 * GF_UNIT_KB); + /* option stripe-pattern *avi:1GB,*pdf:4096 */ + data = dict_get (this->options, "block-size"); + if (!data) { + gf_log (this->name, GF_LOG_DEBUG, + "No \"option block-size <x>\" given, defaulting " + "to 128KB"); + } else { + char *tmp_str = NULL; + char *tmp_str1 = NULL; + char *dup_str = NULL; + char *stripe_str = NULL; + char *pattern = NULL; + char *num = NULL; + struct stripe_options *temp_stripeopt = NULL; + struct stripe_options *stripe_opt = NULL; + + /* Get the pattern for striping. + "option block-size *avi:10MB" etc */ + stripe_str = strtok_r (data->data, ",", &tmp_str); + while (stripe_str) { + dup_str = strdup (stripe_str); + stripe_opt = CALLOC (1, + sizeof (struct stripe_options)); + ERR_ABORT (stripe_opt); + pattern = strtok_r (dup_str, ":", &tmp_str1); + num = strtok_r (NULL, ":", &tmp_str1); + if (num && + (gf_string2bytesize (num, + &stripe_opt->block_size) + != 0)) { + gf_log (this->name, GF_LOG_ERROR, + "invalid number format \"%s\"", + num); + return -1; + } else if (!num && (gf_string2bytesize ( + pattern, + &stripe_opt->block_size) + != 0)) { + /* Possible that there is no pattern given */ + stripe_opt->block_size = (128 * GF_UNIT_KB); + pattern = "*"; + } + memcpy (stripe_opt->path_pattern, + pattern, strlen (pattern)); + + gf_log (this->name, GF_LOG_DEBUG, + "block-size : pattern %s : size %"PRId64, + stripe_opt->path_pattern, + stripe_opt->block_size); + + if (!priv->pattern) { + priv->pattern = stripe_opt; + } else { + temp_stripeopt = priv->pattern; + while (temp_stripeopt->next) + temp_stripeopt = temp_stripeopt->next; + temp_stripeopt->next = stripe_opt; + } + stripe_str = strtok_r (NULL, ",", &tmp_str); + } + } + + priv->xattr_supported = 1; + data = dict_get (this->options, "use-xattr"); + if (data) { + if (gf_string2boolean (data->data, + &priv->xattr_supported) == -1) { + gf_log (this->name, GF_LOG_ERROR, + "error setting hard check for extended " + "attribute"); + //return -1; + } + } + + /* notify related */ + priv->nodes_down = priv->child_count; + this->private = priv; + + return 0; +} + +/** + * fini - Free all the private variables + * @this - + */ +void +fini (xlator_t *this) +{ + stripe_private_t *priv = this->private; + struct stripe_options *prev = NULL; + struct stripe_options *trav = priv->pattern; + while (trav) { + prev = trav; + trav = trav->next; + FREE (prev); + } + FREE (priv->xl_array); + LOCK_DESTROY (&priv->lock); + FREE (priv); + return; +} + + +struct xlator_fops fops = { + .stat = stripe_stat, + .unlink = stripe_unlink, + .symlink = stripe_symlink, + .rename = stripe_rename, + .link = stripe_link, + .chmod = stripe_chmod, + .chown = stripe_chown, + .truncate = stripe_truncate, + .utimens = stripe_utimens, + .create = stripe_create, + .open = stripe_open, + .readv = stripe_readv, + .writev = stripe_writev, + .statfs = stripe_statfs, + .flush = stripe_flush, + .fsync = stripe_fsync, + .setxattr = stripe_setxattr, + .getxattr = stripe_getxattr, + .removexattr = stripe_removexattr, + .access = stripe_access, + .ftruncate = stripe_ftruncate, + .fstat = stripe_fstat, + .readlink = stripe_readlink, + .mkdir = stripe_mkdir, + .rmdir = stripe_rmdir, + .lk = stripe_lk, + .opendir = stripe_opendir, + .fsyncdir = stripe_fsyncdir, + .fchmod = stripe_fchmod, + .fchown = stripe_fchown, + .lookup = stripe_lookup, + .setdents = stripe_setdents, + .mknod = stripe_mknod, +}; + +struct xlator_mops mops = { + .stats = stripe_stats, +}; + +struct xlator_cbks cbks = { + .release = stripe_release, + .releasedir = stripe_releasedir +}; + + +struct volume_options options[] = { + { .key = {"block-size"}, + .type = GF_OPTION_TYPE_ANY + }, + { .key = {"use-xattr"}, + .type = GF_OPTION_TYPE_BOOL + }, + { .key = {NULL} }, +}; diff --git a/xlators/cluster/unify/Makefile.am b/xlators/cluster/unify/Makefile.am new file mode 100644 index 000000000..d471a3f92 --- /dev/null +++ b/xlators/cluster/unify/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/cluster/unify/src/Makefile.am b/xlators/cluster/unify/src/Makefile.am new file mode 100644 index 000000000..b9e6f63e9 --- /dev/null +++ b/xlators/cluster/unify/src/Makefile.am @@ -0,0 +1,16 @@ + +xlator_LTLIBRARIES = unify.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster + +unify_la_LDFLAGS = -module -avoidversion + +unify_la_SOURCES = unify.c unify-self-heal.c +unify_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = unify.h + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) + +CLEANFILES = + diff --git a/xlators/cluster/unify/src/unify-self-heal.c b/xlators/cluster/unify/src/unify-self-heal.c new file mode 100644 index 000000000..4885dd91a --- /dev/null +++ b/xlators/cluster/unify/src/unify-self-heal.c @@ -0,0 +1,1225 @@ +/* + Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +/** + * unify-self-heal.c : + * This file implements few functions which enables 'unify' translator + * to be consistent in its behaviour when + * > a node fails, + * > a node gets added, + * > a failed node comes back + * > a new namespace server is added (ie, an fresh namespace server). + * + * This functionality of 'unify' will enable glusterfs to support storage + * system failure, and maintain consistancy. This works both ways, ie, when + * an entry (either file or directory) is found on namespace server, and not + * on storage nodes, its created in storage nodes and vica-versa. + * + * The two fops, where it can be implemented are 'getdents ()' and 'lookup ()' + * + */ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "unify.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "common-utils.h" + +int32_t +unify_sh_getdents_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dir_entry_t *entry, + int32_t count); + +int32_t +unify_sh_ns_getdents_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dir_entry_t *entry, + int32_t count); + +int32_t +unify_bgsh_getdents_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dir_entry_t *entry, + int32_t count); + +int32_t +unify_bgsh_ns_getdents_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dir_entry_t *entry, + int32_t count); + +/** + * unify_local_wipe - free all the extra allocation of local->* here. + */ +static void +unify_local_wipe (unify_local_t *local) +{ + /* Free the strdup'd variables in the local structure */ + if (local->name) { + FREE (local->name); + } + + if (local->sh_struct) { + if (local->sh_struct->offset_list) + FREE (local->sh_struct->offset_list); + + if (local->sh_struct->entry_list) + FREE (local->sh_struct->entry_list); + + if (local->sh_struct->count_list) + FREE (local->sh_struct->count_list); + + FREE (local->sh_struct); + } + + loc_wipe (&local->loc1); + loc_wipe (&local->loc2); +} + +int32_t +unify_sh_setdents_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + int32_t callcnt = -1; + unify_local_t *local = frame->local; + inode_t *inode = NULL; + dict_t *tmp_dict = NULL; + dir_entry_t *prev, *entry, *trav; + + LOCK (&frame->lock); + { + /* if local->call_count == 0, that means, setdents on + * storagenodes is still pending. + */ + if (local->call_count) + callcnt = --local->call_count; + } + UNLOCK (&frame->lock); + + if (callcnt == 0) { + if (local->sh_struct->entry_list[0]) { + prev = entry = local->sh_struct->entry_list[0]; + if (!entry) + return 0; + trav = entry->next; + while (trav) { + prev->next = trav->next; + FREE (trav->name); + if (S_ISLNK (trav->buf.st_mode)) + FREE (trav->link); + FREE (trav); + trav = prev->next; + } + FREE (entry); + } + + if (!local->flags) { + if (local->sh_struct->count_list[0] >= + UNIFY_SELF_HEAL_GETDENTS_COUNT) { + /* count == size, that means, there are more entries + to read from */ + //local->call_count = 0; + local->sh_struct->offset_list[0] += + UNIFY_SELF_HEAL_GETDENTS_COUNT; + STACK_WIND (frame, + unify_sh_ns_getdents_cbk, + NS(this), + NS(this)->fops->getdents, + local->fd, + UNIFY_SELF_HEAL_GETDENTS_COUNT, + local->sh_struct->offset_list[0], + GF_GET_DIR_ONLY); + } + } else { + inode = local->loc1.inode; + fd_unref (local->fd); + tmp_dict = local->dict; + + unify_local_wipe (local); + + STACK_UNWIND (frame, local->op_ret, local->op_errno, + inode, &local->stbuf, local->dict); + if (tmp_dict) + dict_unref (local->dict); + } + } + + return 0; +} + + +int32_t +unify_sh_ns_getdents_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dir_entry_t *entry, + int32_t count) +{ + unify_local_t *local = frame->local; + unify_private_t *priv = this->private; + long index = 0; + unsigned long final = 0; + dir_entry_t *tmp = CALLOC (1, sizeof (dir_entry_t)); + + local->sh_struct->entry_list[0] = tmp; + local->sh_struct->count_list[0] = count; + if (entry) { + tmp->next = entry->next; + entry->next = NULL; + } + + if ((count < UNIFY_SELF_HEAL_GETDENTS_COUNT) || !entry) { + final = 1; + } + + LOCK (&frame->lock); + { + /* local->call_count will be '0' till now. make it 1 so, it + can be UNWIND'ed for the last call. */ + local->call_count = priv->child_count; + if (final) + local->flags = 1; + } + UNLOCK (&frame->lock); + + for (index = 0; index < priv->child_count; index++) + { + STACK_WIND_COOKIE (frame, + unify_sh_setdents_cbk, + (void *)index, + priv->xl_array[index], + priv->xl_array[index]->fops->setdents, + local->fd, GF_SET_DIR_ONLY, + local->sh_struct->entry_list[0], count); + } + + return 0; +} + +int32_t +unify_sh_ns_setdents_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + int32_t callcnt = -1; + unify_local_t *local = frame->local; + unify_private_t *priv = this->private; + long index = (long)cookie; + dir_entry_t *prev, *entry, *trav; + + LOCK (&frame->lock); + { + if (local->sh_struct->entry_list[index]) { + prev = entry = local->sh_struct->entry_list[index]; + trav = entry->next; + while (trav) { + prev->next = trav->next; + FREE (trav->name); + if (S_ISLNK (trav->buf.st_mode)) + FREE (trav->link); + FREE (trav); + trav = prev->next; + } + FREE (entry); + } + } + UNLOCK (&frame->lock); + + if (local->sh_struct->count_list[index] < + UNIFY_SELF_HEAL_GETDENTS_COUNT) { + LOCK (&frame->lock); + { + callcnt = --local->call_count; + } + UNLOCK (&frame->lock); + } else { + /* count == size, that means, there are more entries + to read from */ + local->sh_struct->offset_list[index] += + UNIFY_SELF_HEAL_GETDENTS_COUNT; + STACK_WIND_COOKIE (frame, + unify_sh_getdents_cbk, + cookie, + priv->xl_array[index], + priv->xl_array[index]->fops->getdents, + local->fd, + UNIFY_SELF_HEAL_GETDENTS_COUNT, + local->sh_struct->offset_list[index], + GF_GET_ALL); + + gf_log (this->name, GF_LOG_DEBUG, + "readdir on (%s) with offset %"PRId64"", + priv->xl_array[index]->name, + local->sh_struct->offset_list[index]); + } + + if (!callcnt) { + /* All storage nodes have done unified setdents on NS node. + * Now, do getdents from NS and do setdents on storage nodes. + */ + + /* sh_struct->offset_list is no longer required for + storage nodes now */ + local->sh_struct->offset_list[0] = 0; /* reset */ + + STACK_WIND (frame, + unify_sh_ns_getdents_cbk, + NS(this), + NS(this)->fops->getdents, + local->fd, + UNIFY_SELF_HEAL_GETDENTS_COUNT, + 0, /* In this call, do send '0' as offset */ + GF_GET_DIR_ONLY); + } + + return 0; +} + + +/** + * unify_sh_getdents_cbk - + */ +int32_t +unify_sh_getdents_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dir_entry_t *entry, + int32_t count) +{ + int32_t callcnt = -1; + unify_local_t *local = frame->local; + unify_private_t *priv = this->private; + long index = (long)cookie; + dir_entry_t *tmp = NULL; + + if (op_ret >= 0 && count > 0) { + /* There is some dentry found, just send the dentry to NS */ + tmp = CALLOC (1, sizeof (dir_entry_t)); + local->sh_struct->entry_list[index] = tmp; + local->sh_struct->count_list[index] = count; + if (entry) { + tmp->next = entry->next; + entry->next = NULL; + } + STACK_WIND_COOKIE (frame, + unify_sh_ns_setdents_cbk, + cookie, + NS(this), + NS(this)->fops->setdents, + local->fd, + GF_SET_IF_NOT_PRESENT, + local->sh_struct->entry_list[index], + count); + return 0; + } + + if (count < UNIFY_SELF_HEAL_GETDENTS_COUNT) { + LOCK (&frame->lock); + { + callcnt = --local->call_count; + } + UNLOCK (&frame->lock); + } else { + /* count == size, that means, there are more entries + to read from */ + local->sh_struct->offset_list[index] += + UNIFY_SELF_HEAL_GETDENTS_COUNT; + STACK_WIND_COOKIE (frame, + unify_sh_getdents_cbk, + cookie, + priv->xl_array[index], + priv->xl_array[index]->fops->getdents, + local->fd, + UNIFY_SELF_HEAL_GETDENTS_COUNT, + local->sh_struct->offset_list[index], + GF_GET_ALL); + + gf_log (this->name, GF_LOG_DEBUG, + "readdir on (%s) with offset %"PRId64"", + priv->xl_array[index]->name, + local->sh_struct->offset_list[index]); + } + + if (!callcnt) { + /* All storage nodes have done unified setdents on NS node. + * Now, do getdents from NS and do setdents on storage nodes. + */ + + /* sh_struct->offset_list is no longer required for + storage nodes now */ + local->sh_struct->offset_list[0] = 0; /* reset */ + + STACK_WIND (frame, + unify_sh_ns_getdents_cbk, + NS(this), + NS(this)->fops->getdents, + local->fd, + UNIFY_SELF_HEAL_GETDENTS_COUNT, + 0, /* In this call, do send '0' as offset */ + GF_GET_DIR_ONLY); + } + + return 0; +} + +/** + * unify_sh_opendir_cbk - + * + * @cookie: + */ +int32_t +unify_sh_opendir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd) +{ + int32_t callcnt = 0; + unify_local_t *local = frame->local; + unify_private_t *priv = this->private; + int16_t index = 0; + inode_t *inode = NULL; + dict_t *tmp_dict = NULL; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret >= 0) { + local->op_ret = op_ret; + } else { + gf_log (this->name, GF_LOG_WARNING, "failed"); + local->failed = 1; + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + local->call_count = priv->child_count + 1; + + if (!local->failed) { + /* send getdents() namespace after finishing + storage nodes */ + local->call_count--; + + fd_bind (fd); + + if (local->call_count) { + /* Used as the offset index. This list keeps + * track of offset sent to each node during + * STACK_WIND. + */ + local->sh_struct->offset_list = + calloc (priv->child_count, + sizeof (off_t)); + ERR_ABORT (local->sh_struct->offset_list); + + local->sh_struct->entry_list = + calloc (priv->child_count, + sizeof (dir_entry_t *)); + ERR_ABORT (local->sh_struct->entry_list); + + local->sh_struct->count_list = + calloc (priv->child_count, + sizeof (int)); + ERR_ABORT (local->sh_struct->count_list); + + /* Send getdents on all the fds */ + for (index = 0; + index < priv->child_count; index++) { + STACK_WIND_COOKIE (frame, + unify_sh_getdents_cbk, + (void *)(long)index, + priv->xl_array[index], + priv->xl_array[index]->fops->getdents, + local->fd, + UNIFY_SELF_HEAL_GETDENTS_COUNT, + 0, /* In this call, do send '0' as offset */ + GF_GET_ALL); + } + + /* did stack wind, so no need to unwind here */ + return 0; + } /* (local->call_count) */ + } /* (!local->failed) */ + + /* Opendir failed on one node. */ + inode = local->loc1.inode; + fd_unref (local->fd); + tmp_dict = local->dict; + + unify_local_wipe (local); + /* Only 'self-heal' failed, lookup() was successful. */ + local->op_ret = 0; + + /* This is lookup_cbk ()'s UNWIND. */ + STACK_UNWIND (frame, local->op_ret, local->op_errno, inode, + &local->stbuf, local->dict); + if (tmp_dict) + dict_unref (tmp_dict); + } + + return 0; +} + +/** + * gf_sh_checksum_cbk - + * + * @frame: frame used in lookup. get a copy of it, and use that copy. + * @this: pointer to unify xlator. + * @inode: pointer to inode, for which the consistency check is required. + * + */ +int32_t +unify_sh_checksum_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + uint8_t *file_checksum, + uint8_t *dir_checksum) +{ + unify_local_t *local = frame->local; + unify_private_t *priv = this->private; + int16_t index = 0; + int32_t callcnt = 0; + inode_t *inode = NULL; + dict_t *tmp_dict = NULL; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + if (op_ret >= 0) { + if (NS(this) == (xlator_t *)cookie) { + memcpy (local->sh_struct->ns_file_checksum, + file_checksum, ZR_FILENAME_MAX); + memcpy (local->sh_struct->ns_dir_checksum, + dir_checksum, ZR_FILENAME_MAX); + } else { + if (local->entry_count == 0) { + /* Initialize the dir_checksum to be + * used for comparision with other + * storage nodes. Should be done for + * the first successful call *only*. + */ + /* Using 'entry_count' as a flag */ + local->entry_count = 1; + memcpy (local->sh_struct->dir_checksum, + dir_checksum, ZR_FILENAME_MAX); + } + + /* Reply from the storage nodes */ + for (index = 0; + index < ZR_FILENAME_MAX; index++) { + /* Files should be present in + only one node */ + local->sh_struct->file_checksum[index] ^= file_checksum[index]; + + /* directory structure should be + same accross */ + if (local->sh_struct->dir_checksum[index] != dir_checksum[index]) + local->failed = 1; + } + } + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + for (index = 0; index < ZR_FILENAME_MAX ; index++) { + if (local->sh_struct->file_checksum[index] != + local->sh_struct->ns_file_checksum[index]) { + local->failed = 1; + break; + } + if (local->sh_struct->dir_checksum[index] != + local->sh_struct->ns_dir_checksum[index]) { + local->failed = 1; + break; + } + } + + if (local->failed) { + /* Log it, it should be a rare event */ + gf_log (this->name, GF_LOG_WARNING, + "Self-heal triggered on directory %s", + local->loc1.path); + + /* Any self heal will be done at directory level */ + local->call_count = 0; + local->op_ret = -1; + local->failed = 0; + + local->fd = fd_create (local->loc1.inode, + frame->root->pid); + + local->call_count = priv->child_count + 1; + + for (index = 0; + index < (priv->child_count + 1); index++) { + STACK_WIND_COOKIE (frame, + unify_sh_opendir_cbk, + priv->xl_array[index]->name, + priv->xl_array[index], + priv->xl_array[index]->fops->opendir, + &local->loc1, + local->fd); + } + /* opendir can be done on the directory */ + return 0; + } + + /* no mismatch */ + inode = local->loc1.inode; + tmp_dict = local->dict; + + unify_local_wipe (local); + + /* This is lookup_cbk ()'s UNWIND. */ + STACK_UNWIND (frame, + local->op_ret, + local->op_errno, + inode, + &local->stbuf, + local->dict); + if (tmp_dict) + dict_unref (tmp_dict); + } + + return 0; +} + +/* Foreground self-heal part over */ + +/* Background self-heal part */ + +int32_t +unify_bgsh_setdents_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + int32_t callcnt = -1; + unify_local_t *local = frame->local; + dir_entry_t *prev, *entry, *trav; + + LOCK (&frame->lock); + { + /* if local->call_count == 0, that means, setdents + on storagenodes is still pending. */ + if (local->call_count) + callcnt = --local->call_count; + } + UNLOCK (&frame->lock); + + + if (callcnt == 0) { + if (local->sh_struct->entry_list[0]) { + prev = entry = local->sh_struct->entry_list[0]; + trav = entry->next; + while (trav) { + prev->next = trav->next; + FREE (trav->name); + if (S_ISLNK (trav->buf.st_mode)) + FREE (trav->link); + FREE (trav); + trav = prev->next; + } + FREE (entry); + } + + if (!local->flags) { + if (local->sh_struct->count_list[0] >= + UNIFY_SELF_HEAL_GETDENTS_COUNT) { + /* count == size, that means, there are more + entries to read from */ + //local->call_count = 0; + local->sh_struct->offset_list[0] += + UNIFY_SELF_HEAL_GETDENTS_COUNT; + STACK_WIND (frame, + unify_bgsh_ns_getdents_cbk, + NS(this), + NS(this)->fops->getdents, + local->fd, + UNIFY_SELF_HEAL_GETDENTS_COUNT, + local->sh_struct->offset_list[0], + GF_GET_DIR_ONLY); + } + } else { + fd_unref (local->fd); + unify_local_wipe (local); + STACK_DESTROY (frame->root); + } + } + + return 0; +} + + +int32_t +unify_bgsh_ns_getdents_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dir_entry_t *entry, + int32_t count) +{ + unify_local_t *local = frame->local; + unify_private_t *priv = this->private; + long index = 0; + unsigned long final = 0; + dir_entry_t *tmp = CALLOC (1, sizeof (dir_entry_t)); + + local->sh_struct->entry_list[0] = tmp; + local->sh_struct->count_list[0] = count; + if (entry) { + tmp->next = entry->next; + entry->next = NULL; + } + + if ((count < UNIFY_SELF_HEAL_GETDENTS_COUNT) || !entry) { + final = 1; + } + + LOCK (&frame->lock); + { + /* local->call_count will be '0' till now. make it 1 so, + it can be UNWIND'ed for the last call. */ + local->call_count = priv->child_count; + if (final) + local->flags = 1; + } + UNLOCK (&frame->lock); + + for (index = 0; index < priv->child_count; index++) + { + STACK_WIND_COOKIE (frame, + unify_bgsh_setdents_cbk, + (void *)index, + priv->xl_array[index], + priv->xl_array[index]->fops->setdents, + local->fd, GF_SET_DIR_ONLY, + local->sh_struct->entry_list[0], count); + } + + return 0; +} + +int32_t +unify_bgsh_ns_setdents_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + int32_t callcnt = -1; + unify_local_t *local = frame->local; + unify_private_t *priv = this->private; + long index = (long)cookie; + dir_entry_t *prev, *entry, *trav; + + if (local->sh_struct->entry_list[index]) { + prev = entry = local->sh_struct->entry_list[index]; + if (!entry) + return 0; + trav = entry->next; + while (trav) { + prev->next = trav->next; + FREE (trav->name); + if (S_ISLNK (trav->buf.st_mode)) + FREE (trav->link); + FREE (trav); + trav = prev->next; + } + FREE (entry); + } + + if (local->sh_struct->count_list[index] < + UNIFY_SELF_HEAL_GETDENTS_COUNT) { + LOCK (&frame->lock); + { + callcnt = --local->call_count; + } + UNLOCK (&frame->lock); + } else { + /* count == size, that means, there are more entries + to read from */ + local->sh_struct->offset_list[index] += + UNIFY_SELF_HEAL_GETDENTS_COUNT; + STACK_WIND_COOKIE (frame, + unify_bgsh_getdents_cbk, + cookie, + priv->xl_array[index], + priv->xl_array[index]->fops->getdents, + local->fd, + UNIFY_SELF_HEAL_GETDENTS_COUNT, + local->sh_struct->offset_list[index], + GF_GET_ALL); + + gf_log (this->name, GF_LOG_DEBUG, + "readdir on (%s) with offset %"PRId64"", + priv->xl_array[index]->name, + local->sh_struct->offset_list[index]); + } + + if (!callcnt) { + /* All storage nodes have done unified setdents on NS node. + * Now, do getdents from NS and do setdents on storage nodes. + */ + + /* sh_struct->offset_list is no longer required for + storage nodes now */ + local->sh_struct->offset_list[0] = 0; /* reset */ + + STACK_WIND (frame, + unify_bgsh_ns_getdents_cbk, + NS(this), + NS(this)->fops->getdents, + local->fd, + UNIFY_SELF_HEAL_GETDENTS_COUNT, + 0, /* In this call, do send '0' as offset */ + GF_GET_DIR_ONLY); + } + + return 0; +} + + +/** + * unify_bgsh_getdents_cbk - + */ +int32_t +unify_bgsh_getdents_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dir_entry_t *entry, + int32_t count) +{ + int32_t callcnt = -1; + unify_local_t *local = frame->local; + unify_private_t *priv = this->private; + long index = (long)cookie; + dir_entry_t *tmp = NULL; + + if (op_ret >= 0 && count > 0) { + /* There is some dentry found, just send the dentry to NS */ + tmp = CALLOC (1, sizeof (dir_entry_t)); + local->sh_struct->entry_list[index] = tmp; + local->sh_struct->count_list[index] = count; + if (entry) { + tmp->next = entry->next; + entry->next = NULL; + } + STACK_WIND_COOKIE (frame, + unify_bgsh_ns_setdents_cbk, + cookie, + NS(this), + NS(this)->fops->setdents, + local->fd, + GF_SET_IF_NOT_PRESENT, + local->sh_struct->entry_list[index], + count); + return 0; + } + + if (count < UNIFY_SELF_HEAL_GETDENTS_COUNT) { + LOCK (&frame->lock); + { + callcnt = --local->call_count; + } + UNLOCK (&frame->lock); + } else { + /* count == size, that means, there are more entries to read from */ + local->sh_struct->offset_list[index] += + UNIFY_SELF_HEAL_GETDENTS_COUNT; + + STACK_WIND_COOKIE (frame, + unify_bgsh_getdents_cbk, + cookie, + priv->xl_array[index], + priv->xl_array[index]->fops->getdents, + local->fd, + UNIFY_SELF_HEAL_GETDENTS_COUNT, + local->sh_struct->offset_list[index], + GF_GET_ALL); + + gf_log (this->name, GF_LOG_DEBUG, + "readdir on (%s) with offset %"PRId64"", + priv->xl_array[index]->name, + local->sh_struct->offset_list[index]); + } + + if (!callcnt) { + /* All storage nodes have done unified setdents on NS node. + * Now, do getdents from NS and do setdents on storage nodes. + */ + + /* sh_struct->offset_list is no longer required for + storage nodes now */ + local->sh_struct->offset_list[0] = 0; /* reset */ + + STACK_WIND (frame, + unify_bgsh_ns_getdents_cbk, + NS(this), + NS(this)->fops->getdents, + local->fd, + UNIFY_SELF_HEAL_GETDENTS_COUNT, + 0, /* In this call, do send '0' as offset */ + GF_GET_DIR_ONLY); + } + + return 0; +} + +/** + * unify_bgsh_opendir_cbk - + * + * @cookie: + */ +int32_t +unify_bgsh_opendir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd) +{ + unify_local_t *local = frame->local; + unify_private_t *priv = this->private; + int32_t callcnt = 0; + int16_t index = 0; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret >= 0) { + local->op_ret = op_ret; + } else { + local->failed = 1; + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + local->call_count = priv->child_count + 1; + + if (!local->failed) { + /* send getdents() namespace after finishing + storage nodes */ + local->call_count--; + callcnt = local->call_count; + + fd_bind (fd); + + if (local->call_count) { + /* Used as the offset index. This list keeps + track of offset sent to each node during + STACK_WIND. */ + local->sh_struct->offset_list = + calloc (priv->child_count, + sizeof (off_t)); + ERR_ABORT (local->sh_struct->offset_list); + + local->sh_struct->entry_list = + calloc (priv->child_count, + sizeof (dir_entry_t *)); + ERR_ABORT (local->sh_struct->entry_list); + + local->sh_struct->count_list = + calloc (priv->child_count, + sizeof (int)); + ERR_ABORT (local->sh_struct->count_list); + + /* Send getdents on all the fds */ + for (index = 0; + index < priv->child_count; index++) { + STACK_WIND_COOKIE (frame, + unify_bgsh_getdents_cbk, + (void *)(long)index, + priv->xl_array[index], + priv->xl_array[index]->fops->getdents, + local->fd, + UNIFY_SELF_HEAL_GETDENTS_COUNT, + 0, /* In this call, do send '0' as offset */ + GF_GET_ALL); + } + /* did a stack wind, so no need to unwind here */ + return 0; + } /* (local->call_count) */ + } /* (!local->failed) */ + + /* Opendir failed on one node. */ + fd_unref (local->fd); + + unify_local_wipe (local); + STACK_DESTROY (frame->root); + } + + return 0; +} + +/** + * gf_bgsh_checksum_cbk - + * + * @frame: frame used in lookup. get a copy of it, and use that copy. + * @this: pointer to unify xlator. + * @inode: pointer to inode, for which the consistency check is required. + * + */ +int32_t +unify_bgsh_checksum_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + uint8_t *file_checksum, + uint8_t *dir_checksum) +{ + unify_local_t *local = frame->local; + unify_private_t *priv = this->private; + int16_t index = 0; + int32_t callcnt = 0; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + if (op_ret >= 0) { + if (NS(this) == (xlator_t *)cookie) { + memcpy (local->sh_struct->ns_file_checksum, + file_checksum, ZR_FILENAME_MAX); + memcpy (local->sh_struct->ns_dir_checksum, + dir_checksum, ZR_FILENAME_MAX); + } else { + if (local->entry_count == 0) { + /* Initialize the dir_checksum to be + * used for comparision with other + * storage nodes. Should be done for + * the first successful call *only*. + */ + /* Using 'entry_count' as a flag */ + local->entry_count = 1; + memcpy (local->sh_struct->dir_checksum, + dir_checksum, ZR_FILENAME_MAX); + } + + /* Reply from the storage nodes */ + for (index = 0; + index < ZR_FILENAME_MAX; index++) { + /* Files should be present in only + one node */ + local->sh_struct->file_checksum[index] ^= file_checksum[index]; + + /* directory structure should be same + accross */ + if (local->sh_struct->dir_checksum[index] != dir_checksum[index]) + local->failed = 1; + } + } + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + for (index = 0; index < ZR_FILENAME_MAX ; index++) { + if (local->sh_struct->file_checksum[index] != + local->sh_struct->ns_file_checksum[index]) { + local->failed = 1; + break; + } + if (local->sh_struct->dir_checksum[index] != + local->sh_struct->ns_dir_checksum[index]) { + local->failed = 1; + break; + } + } + + if (local->failed) { + /* Log it, it should be a rare event */ + gf_log (this->name, GF_LOG_WARNING, + "Self-heal triggered on directory %s", + local->loc1.path); + + /* Any self heal will be done at the directory level */ + local->op_ret = -1; + local->failed = 0; + + local->fd = fd_create (local->loc1.inode, + frame->root->pid); + local->call_count = priv->child_count + 1; + + for (index = 0; + index < (priv->child_count + 1); index++) { + STACK_WIND_COOKIE (frame, + unify_bgsh_opendir_cbk, + priv->xl_array[index]->name, + priv->xl_array[index], + priv->xl_array[index]->fops->opendir, + &local->loc1, + local->fd); + } + + /* opendir can be done on the directory */ + return 0; + } + + /* no mismatch */ + unify_local_wipe (local); + STACK_DESTROY (frame->root); + } + + return 0; +} + +/* Background self-heal part over */ + + + + +/** + * zr_unify_self_heal - + * + * @frame: frame used in lookup. get a copy of it, and use that copy. + * @this: pointer to unify xlator. + * @inode: pointer to inode, for which the consistency check is required. + * + */ +int32_t +zr_unify_self_heal (call_frame_t *frame, + xlator_t *this, + unify_local_t *local) +{ + unify_private_t *priv = this->private; + call_frame_t *bg_frame = NULL; + unify_local_t *bg_local = NULL; + inode_t *tmp_inode = NULL; + dict_t *tmp_dict = NULL; + int16_t index = 0; + + if (local->inode_generation < priv->inode_generation) { + /* Any self heal will be done at the directory level */ + /* Update the inode's generation to the current generation + value. */ + local->inode_generation = priv->inode_generation; + inode_ctx_put (local->loc1.inode, this, + (uint64_t)(long)local->inode_generation); + + if (priv->self_heal == ZR_UNIFY_FG_SELF_HEAL) { + local->op_ret = 0; + local->failed = 0; + local->call_count = priv->child_count + 1; + local->sh_struct = + calloc (1, sizeof (struct unify_self_heal_struct)); + + /* +1 is for NS */ + for (index = 0; + index < (priv->child_count + 1); index++) { + STACK_WIND_COOKIE (frame, + unify_sh_checksum_cbk, + priv->xl_array[index], + priv->xl_array[index], + priv->xl_array[index]->fops->checksum, + &local->loc1, + 0); + } + + /* Self-heal in foreground, hence no need + to UNWIND here */ + return 0; + } + + /* Self Heal done in background */ + bg_frame = copy_frame (frame); + INIT_LOCAL (bg_frame, bg_local); + loc_copy (&bg_local->loc1, &local->loc1); + bg_local->op_ret = 0; + bg_local->failed = 0; + bg_local->call_count = priv->child_count + 1; + bg_local->sh_struct = + calloc (1, sizeof (struct unify_self_heal_struct)); + + /* +1 is for NS */ + for (index = 0; index < (priv->child_count + 1); index++) { + STACK_WIND_COOKIE (bg_frame, + unify_bgsh_checksum_cbk, + priv->xl_array[index], + priv->xl_array[index], + priv->xl_array[index]->fops->checksum, + &bg_local->loc1, + 0); + } + } + + /* generation number matches, self heal already done or + * self heal done in background: just do STACK_UNWIND + */ + tmp_inode = local->loc1.inode; + tmp_dict = local->dict; + + unify_local_wipe (local); + + /* This is lookup_cbk ()'s UNWIND. */ + STACK_UNWIND (frame, + local->op_ret, + local->op_errno, + tmp_inode, + &local->stbuf, + local->dict); + + if (tmp_dict) + dict_unref (tmp_dict); + + return 0; +} + diff --git a/xlators/cluster/unify/src/unify.c b/xlators/cluster/unify/src/unify.c new file mode 100644 index 000000000..e2a5e14b1 --- /dev/null +++ b/xlators/cluster/unify/src/unify.c @@ -0,0 +1,4451 @@ +/* + Copyright (c) 2006, 2007, 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +/** + * xlators/cluster/unify: + * - This xlator is one of the main translator in GlusterFS, which + * actually does the clustering work of the file system. One need to + * understand that, unify assumes file to be existing in only one of + * the child node, and directories to be present on all the nodes. + * + * NOTE: + * Now, unify has support for global namespace, which is used to keep a + * global view of fs's namespace tree. The stat for directories are taken + * just from the namespace, where as for files, just 'st_ino' is taken from + * Namespace node, and other stat info is taken from the actual storage node. + * Also Namespace node helps to keep consistant inode for files across + * glusterfs (re-)mounts. + */ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "unify.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "defaults.h" +#include "common-utils.h" +#include <signal.h> +#include <libgen.h> +#include "compat-errno.h" +#include "compat.h" + +#define UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR(_loc) do { \ + if (!(_loc && _loc->inode)) { \ + STACK_UNWIND (frame, -1, EINVAL, NULL, NULL, NULL); \ + return 0; \ + } \ +} while(0) + + +#define UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR(_fd) do { \ + if (!(_fd && !fd_ctx_get (_fd, this, NULL))) { \ + STACK_UNWIND (frame, -1, EBADFD, NULL, NULL); \ + return 0; \ + } \ +} while(0) + +#define UNIFY_CHECK_FD_AND_UNWIND_ON_ERR(_fd) do { \ + if (!_fd) { \ + STACK_UNWIND (frame, -1, EBADFD, NULL, NULL); \ + return 0; \ + } \ +} while(0) + +/** + * unify_local_wipe - free all the extra allocation of local->* here. + */ +static void +unify_local_wipe (unify_local_t *local) +{ + /* Free the strdup'd variables in the local structure */ + if (local->name) { + FREE (local->name); + } + loc_wipe (&local->loc1); + loc_wipe (&local->loc2); +} + + + +/* + * unify_normalize_stats - + */ +void +unify_normalize_stats (struct statvfs *buf, + unsigned long bsize, + unsigned long frsize) +{ + double factor; + + if (buf->f_bsize != bsize) { + factor = ((double) buf->f_bsize) / bsize; + buf->f_bsize = bsize; + buf->f_bfree = (fsblkcnt_t) (factor * buf->f_bfree); + buf->f_bavail = (fsblkcnt_t) (factor * buf->f_bavail); + } + + if (buf->f_frsize != frsize) { + factor = ((double) buf->f_frsize) / frsize; + buf->f_frsize = frsize; + buf->f_blocks = (fsblkcnt_t) (factor * buf->f_blocks); + } +} + + +xlator_t * +unify_loc_subvol (loc_t *loc, xlator_t *this) +{ + unify_private_t *priv = NULL; + xlator_t *subvol = NULL; + int16_t *list = NULL; + long index = 0; + xlator_t *subvol_i = NULL; + int ret = 0; + uint64_t tmp_list = 0; + + priv = this->private; + subvol = NS (this); + + if (!S_ISDIR (loc->inode->st_mode)) { + ret = inode_ctx_get (loc->inode, this, &tmp_list); + list = (int16_t *)(long)tmp_list; + if (!list) + goto out; + + for (index = 0; list[index] != -1; index++) { + subvol_i = priv->xl_array[list[index]]; + if (subvol_i != NS (this)) { + subvol = subvol_i; + break; + } + } + } +out: + return subvol; +} + + + +/** + * unify_statfs_cbk - + */ +int32_t +unify_statfs_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct statvfs *stbuf) +{ + int32_t callcnt = 0; + struct statvfs *dict_buf = NULL; + unsigned long bsize; + unsigned long frsize; + unify_local_t *local = (unify_local_t *)frame->local; + call_frame_t *prev_frame = cookie; + + LOCK (&frame->lock); + { + if (op_ret >= 0) { + /* when a call is successfull, add it to local->dict */ + dict_buf = &local->statvfs_buf; + + if (dict_buf->f_bsize != 0) { + bsize = max (dict_buf->f_bsize, + stbuf->f_bsize); + + frsize = max (dict_buf->f_frsize, + stbuf->f_frsize); + unify_normalize_stats(dict_buf, bsize, frsize); + unify_normalize_stats(stbuf, bsize, frsize); + } else { + dict_buf->f_bsize = stbuf->f_bsize; + dict_buf->f_frsize = stbuf->f_frsize; + } + + dict_buf->f_blocks += stbuf->f_blocks; + dict_buf->f_bfree += stbuf->f_bfree; + dict_buf->f_bavail += stbuf->f_bavail; + dict_buf->f_files += stbuf->f_files; + dict_buf->f_ffree += stbuf->f_ffree; + dict_buf->f_favail += stbuf->f_favail; + dict_buf->f_fsid = stbuf->f_fsid; + dict_buf->f_flag = stbuf->f_flag; + dict_buf->f_namemax = stbuf->f_namemax; + local->op_ret = op_ret; + } else { + /* fop on storage node has failed due to some error */ + if (op_errno != ENOTCONN) { + gf_log (this->name, GF_LOG_ERROR, + "child(%s): %s", + prev_frame->this->name, + strerror (op_errno)); + } + local->op_errno = op_errno; + } + callcnt = --local->call_count; + } + UNLOCK (&frame->lock); + + if (!callcnt) { + STACK_UNWIND (frame, local->op_ret, local->op_errno, + &local->statvfs_buf); + } + + return 0; +} + +/** + * unify_statfs - + */ +int32_t +unify_statfs (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + unify_local_t *local = NULL; + xlator_list_t *trav = this->children; + + INIT_LOCAL (frame, local); + local->call_count = ((unify_private_t *)this->private)->child_count; + + while(trav) { + STACK_WIND (frame, + unify_statfs_cbk, + trav->xlator, + trav->xlator->fops->statfs, + loc); + trav = trav->next; + } + + return 0; +} + +/** + * unify_buf_cbk - + */ +int32_t +unify_buf_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + int32_t callcnt = 0; + unify_private_t *priv = this->private; + unify_local_t *local = frame->local; + call_frame_t *prev_frame = cookie; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "%s(): child(%s): path(%s): %s", + gf_fop_list[frame->root->op], + prev_frame->this->name, + (local->loc1.path)?local->loc1.path:"", + strerror (op_errno)); + + local->op_errno = op_errno; + if ((op_errno == ENOENT) && priv->optimist) + local->op_ret = 0; + } + + if (op_ret >= 0) { + local->op_ret = 0; + + if (NS (this) == prev_frame->this) { + local->st_ino = buf->st_ino; + /* If the entry is directory, get the stat + from NS node */ + if (S_ISDIR (buf->st_mode) || + !local->stbuf.st_blksize) { + local->stbuf = *buf; + } + } + + if ((!S_ISDIR (buf->st_mode)) && + (NS (this) != prev_frame->this)) { + /* If file, take the stat info from Storage + node. */ + local->stbuf = *buf; + } + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + /* If the inode number is not filled, operation should + fail */ + if (!local->st_ino) + local->op_ret = -1; + + local->stbuf.st_ino = local->st_ino; + unify_local_wipe (local); + STACK_UNWIND (frame, local->op_ret, local->op_errno, + &local->stbuf); + } + + return 0; +} + +#define check_if_dht_linkfile(s) ((s->st_mode & ~S_IFMT) == S_ISVTX) + +/** + * unify_lookup_cbk - + */ +int32_t +unify_lookup_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf, + dict_t *dict) +{ + int32_t callcnt = 0; + unify_private_t *priv = this->private; + unify_local_t *local = frame->local; + inode_t *tmp_inode = NULL; + dict_t *local_dict = NULL; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret == -1) { + if ((op_errno != ENOTCONN) && (op_errno != ENOENT)) { + gf_log (this->name, GF_LOG_ERROR, + "child(%s): path(%s): %s", + priv->xl_array[(long)cookie]->name, + local->loc1.path, strerror (op_errno)); + local->op_errno = op_errno; + local->failed = 1; + + } else if (local->revalidate && + !(priv->optimist && (op_errno == ENOENT))) { + + gf_log (this->name, + (op_errno == ENOTCONN) ? + GF_LOG_DEBUG:GF_LOG_ERROR, + "child(%s): path(%s): %s", + priv->xl_array[(long)cookie]->name, + local->loc1.path, strerror (op_errno)); + local->op_errno = op_errno; + local->failed = 1; + } + } + + if (op_ret == 0) { + local->op_ret = 0; + + if (check_if_dht_linkfile(buf)) { + gf_log (this->name, GF_LOG_CRITICAL, + "file %s may be DHT link file on %s, " + "make sure the backend is not shared " + "between unify and DHT", + local->loc1.path, + priv->xl_array[(long)cookie]->name); + } + + if (local->stbuf.st_mode && local->stbuf.st_blksize) { + /* make sure we already have a stbuf + stored in local->stbuf */ + if (S_ISDIR (local->stbuf.st_mode) && + !S_ISDIR (buf->st_mode)) { + gf_log (this->name, GF_LOG_CRITICAL, + "[CRITICAL] '%s' is directory " + "on namespace, non-directory " + "on node '%s', returning EIO", + local->loc1.path, + priv->xl_array[(long)cookie]->name); + local->return_eio = 1; + } + if (!S_ISDIR (local->stbuf.st_mode) && + S_ISDIR (buf->st_mode)) { + gf_log (this->name, GF_LOG_CRITICAL, + "[CRITICAL] '%s' is directory " + "on node '%s', non-directory " + "on namespace, returning EIO", + local->loc1.path, + priv->xl_array[(long)cookie]->name); + local->return_eio = 1; + } + } + + if (!local->revalidate && !S_ISDIR (buf->st_mode)) { + /* This is the first time lookup on file*/ + if (!local->list) { + /* list is not allocated, allocate + the max possible range */ + local->list = CALLOC (1, 2 * (priv->child_count + 2)); + if (!local->list) { + gf_log (this->name, + GF_LOG_CRITICAL, + "Not enough memory"); + STACK_UNWIND (frame, -1, + ENOMEM, inode, + NULL, NULL); + return 0; + } + } + /* update the index of the list */ + local->list [local->index++] = + (int16_t)(long)cookie; + } + + if ((!local->dict) && dict && + (priv->xl_array[(long)cookie] != NS(this))) { + local->dict = dict_ref (dict); + } + + /* index of NS node is == total child count */ + if (priv->child_count == (int16_t)(long)cookie) { + /* Take the inode number from namespace */ + local->st_ino = buf->st_ino; + if (S_ISDIR (buf->st_mode) || + !(local->stbuf.st_blksize)) { + local->stbuf = *buf; + } + } else if (!S_ISDIR (buf->st_mode)) { + /* If file, then get the stat from + storage node */ + local->stbuf = *buf; + } + + if (local->st_nlink < buf->st_nlink) { + local->st_nlink = buf->st_nlink; + } + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + local_dict = local->dict; + if (local->return_eio) { + gf_log (this->name, GF_LOG_CRITICAL, + "[CRITICAL] Unable to fix the path (%s) with " + "self-heal, try manual verification. " + "returning EIO.", local->loc1.path); + unify_local_wipe (local); + STACK_UNWIND (frame, -1, EIO, inode, NULL, NULL); + if (local_dict) { + dict_unref (local_dict); + } + return 0; + } + + if (!local->stbuf.st_blksize) { + /* Inode not present */ + local->op_ret = -1; + } else { + if (!local->revalidate && + !S_ISDIR (local->stbuf.st_mode)) { + /* If its a file, big array is useless, + allocate the smaller one */ + int16_t *list = NULL; + list = CALLOC (1, 2 * (local->index + 1)); + ERR_ABORT (list); + memcpy (list, local->list, 2 * local->index); + /* Make the end of the list as -1 */ + FREE (local->list); + local->list = list; + local->list [local->index] = -1; + /* Update the inode's ctx with proper array */ + /* TODO: log on failure */ + inode_ctx_put (local->loc1.inode, this, + (uint64_t)(long)local->list); + } + + if (S_ISDIR(local->loc1.inode->st_mode)) { + /* lookup is done for directory */ + if (local->failed && priv->self_heal) { + /* Triggering self-heal */ + /* means, self-heal required for this + inode */ + local->inode_generation = 0; + priv->inode_generation++; + } + } else { + local->stbuf.st_ino = local->st_ino; + } + + local->stbuf.st_nlink = local->st_nlink; + } + if (local->op_ret == -1) { + if (!local->revalidate && local->list) + FREE (local->list); + } + + if ((local->op_ret >= 0) && local->failed && + local->revalidate) { + /* Done revalidate, but it failed */ + if (op_errno != ENOTCONN) { + gf_log (this->name, GF_LOG_ERROR, + "Revalidate failed for path(%s): %s", + local->loc1.path, strerror (op_errno)); + } + local->op_ret = -1; + } + + if ((priv->self_heal && !priv->optimist) && + (!local->revalidate && (local->op_ret == 0) && + S_ISDIR(local->stbuf.st_mode))) { + /* Let the self heal be done here */ + zr_unify_self_heal (frame, this, local); + local_dict = NULL; + } else { + /* either no self heal, or op_ret == -1 (failure) */ + tmp_inode = local->loc1.inode; + unify_local_wipe (local); + STACK_UNWIND (frame, local->op_ret, local->op_errno, + tmp_inode, &local->stbuf, local->dict); + } + if (local_dict) { + dict_unref (local_dict); + } + } + + return 0; +} + +/** + * unify_lookup - + */ +int32_t +unify_lookup (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + dict_t *xattr_req) +{ + unify_local_t *local = NULL; + unify_private_t *priv = this->private; + int16_t *list = NULL; + long index = 0; + + if (!(loc && loc->inode)) { + gf_log (this->name, GF_LOG_ERROR, + "%s: Argument not right", loc?loc->path:"(null)"); + STACK_UNWIND (frame, -1, EINVAL, NULL, NULL, NULL); + return 0; + } + + /* Initialization */ + INIT_LOCAL (frame, local); + loc_copy (&local->loc1, loc); + if (local->loc1.path == NULL) { + gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O"); + STACK_UNWIND (frame, -1, ENOMEM, loc->inode, NULL, NULL); + return 0; + } + + if (!inode_ctx_get (loc->inode, this, NULL) && + loc->inode->st_mode && + !S_ISDIR (loc->inode->st_mode)) { + uint64_t tmp_list = 0; + /* check if revalidate or fresh lookup */ + inode_ctx_get (loc->inode, this, &tmp_list); + local->list = (int16_t *)(long)tmp_list; + } + + if (local->list) { + list = local->list; + for (index = 0; list[index] != -1; index++); + if (index != 2) { + if (index < 2) { + gf_log (this->name, GF_LOG_ERROR, + "returning ESTALE for %s: file " + "count is %ld", loc->path, index); + /* Print where all the file is present */ + for (index = 0; + local->list[index] != -1; index++) { + gf_log (this->name, GF_LOG_ERROR, + "%s: found on %s", loc->path, + priv->xl_array[list[index]]->name); + } + unify_local_wipe (local); + STACK_UNWIND (frame, -1, ESTALE, + NULL, NULL, NULL); + return 0; + } else { + /* There are more than 2 presences */ + /* Just log and continue */ + gf_log (this->name, GF_LOG_ERROR, + "%s: file count is %ld", + loc->path, index); + /* Print where all the file is present */ + for (index = 0; + local->list[index] != -1; index++) { + gf_log (this->name, GF_LOG_ERROR, + "%s: found on %s", loc->path, + priv->xl_array[list[index]]->name); + } + } + } + + /* is revalidate */ + local->revalidate = 1; + + for (index = 0; list[index] != -1; index++) + local->call_count++; + + for (index = 0; list[index] != -1; index++) { + char need_break = (list[index+1] == -1); + STACK_WIND_COOKIE (frame, + unify_lookup_cbk, + (void *)(long)list[index], //cookie + priv->xl_array [list[index]], + priv->xl_array [list[index]]->fops->lookup, + loc, + xattr_req); + if (need_break) + break; + } + } else { + if (loc->inode->st_mode) { + if (inode_ctx_get (loc->inode, this, NULL)) { + inode_ctx_get (loc->inode, this, + &local->inode_generation); + } + } + /* This is first call, there is no list */ + /* call count should be all child + 1 namespace */ + local->call_count = priv->child_count + 1; + + for (index = 0; index <= priv->child_count; index++) { + STACK_WIND_COOKIE (frame, + unify_lookup_cbk, + (void *)index, //cookie + priv->xl_array[index], + priv->xl_array[index]->fops->lookup, + loc, + xattr_req); + } + } + + return 0; +} + +/** + * unify_stat - if directory, get the stat directly from NameSpace child. + * if file, check for a hint and send it only there (also to NS). + * if its a fresh stat, then do it on all the nodes. + * + * NOTE: for all the call, sending cookie as xlator pointer, which will be + * used in cbk. + */ +int32_t +unify_stat (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + unify_local_t *local = NULL; + unify_private_t *priv = this->private; + int16_t index = 0; + int16_t *list = NULL; + uint64_t tmp_list = 0; + + UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + /* Initialization */ + INIT_LOCAL (frame, local); + loc_copy (&local->loc1, loc); + if (local->loc1.path == NULL) { + gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O"); + STACK_UNWIND (frame, -1, ENOMEM, NULL); + return 0; + } + local->st_ino = loc->inode->ino; + if (S_ISDIR (loc->inode->st_mode)) { + /* Directory */ + local->call_count = 1; + STACK_WIND (frame, unify_buf_cbk, NS(this), + NS(this)->fops->stat, loc); + } else { + /* File */ + inode_ctx_get (loc->inode, this, &tmp_list); + list = (int16_t *)(long)tmp_list; + + for (index = 0; list[index] != -1; index++) + local->call_count++; + + for (index = 0; list[index] != -1; index++) { + char need_break = (list[index+1] == -1); + STACK_WIND (frame, + unify_buf_cbk, + priv->xl_array[list[index]], + priv->xl_array[list[index]]->fops->stat, + loc); + if (need_break) + break; + } + } + + return 0; +} + +/** + * unify_access_cbk - + */ +int32_t +unify_access_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + + +/** + * unify_access - Send request to only namespace, which has all the + * attributes set for the file. + */ +int32_t +unify_access (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t mask) +{ + UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + STACK_WIND (frame, + unify_access_cbk, + NS(this), + NS(this)->fops->access, + loc, + mask); + + return 0; +} + +int32_t +unify_mkdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf) +{ + int32_t callcnt = 0; + unify_private_t *priv = this->private; + unify_local_t *local = frame->local; + inode_t *tmp_inode = NULL; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if ((op_ret == -1) && !(priv->optimist && + (op_errno == ENOENT || + op_errno == EEXIST))) { + /* TODO: Decrement the inode_generation of + * this->inode's parent inode, hence the missing + * directory is created properly by self-heal. + * Currently, there is no way to get the parent + * inode directly. + */ + gf_log (this->name, GF_LOG_ERROR, + "child(%s): path(%s): %s", + priv->xl_array[(long)cookie]->name, + local->loc1.path, strerror (op_errno)); + if (op_errno != EEXIST) + local->failed = 1; + local->op_errno = op_errno; + } + + if (op_ret >= 0) + local->op_ret = 0; + + } + UNLOCK (&frame->lock); + + if (!callcnt) { + if (!local->failed) { + inode_ctx_put (local->loc1.inode, this, + priv->inode_generation); + } + + tmp_inode = local->loc1.inode; + unify_local_wipe (local); + + STACK_UNWIND (frame, local->op_ret, local->op_errno, + tmp_inode, &local->stbuf); + } + + return 0; +} + +/** + * unify_ns_mkdir_cbk - + */ +int32_t +unify_ns_mkdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf) +{ + unify_private_t *priv = this->private; + unify_local_t *local = frame->local; + long index = 0; + + if (op_ret == -1) { + /* No need to send mkdir request to other servers, + * as namespace action failed + */ + gf_log (this->name, GF_LOG_ERROR, + "namespace: path(%s): %s", + local->name, strerror (op_errno)); + unify_local_wipe (local); + STACK_UNWIND (frame, op_ret, op_errno, inode, NULL); + return 0; + } + + /* Create one inode for this entry */ + local->op_ret = 0; + local->stbuf = *buf; + + local->call_count = priv->child_count; + + /* Send mkdir request to all the nodes now */ + for (index = 0; index < priv->child_count; index++) { + STACK_WIND_COOKIE (frame, + unify_mkdir_cbk, + (void *)index, //cookie + priv->xl_array[index], + priv->xl_array[index]->fops->mkdir, + &local->loc1, + local->mode); + } + + return 0; +} + + +/** + * unify_mkdir - + */ +int32_t +unify_mkdir (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode) +{ + unify_local_t *local = NULL; + + /* Initialization */ + INIT_LOCAL (frame, local); + local->mode = mode; + + loc_copy (&local->loc1, loc); + + if (local->loc1.path == NULL) { + gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O"); + STACK_UNWIND (frame, -1, ENOMEM, NULL, NULL); + return 0; + } + + STACK_WIND (frame, + unify_ns_mkdir_cbk, + NS(this), + NS(this)->fops->mkdir, + loc, + mode); + return 0; +} + +/** + * unify_rmdir_cbk - + */ +int32_t +unify_rmdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + int32_t callcnt = 0; + unify_private_t *priv = this->private; + unify_local_t *local = frame->local; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + if (op_ret == 0 || (priv->optimist && (op_errno == ENOENT))) + local->op_ret = 0; + if (op_ret == -1) + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + if (!callcnt) { + unify_local_wipe (local); + STACK_UNWIND (frame, local->op_ret, local->op_errno); + } + + return 0; +} + +/** + * unify_ns_rmdir_cbk - + */ +int32_t +unify_ns_rmdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + int16_t index = 0; + unify_private_t *priv = this->private; + unify_local_t *local = frame->local; + + if (op_ret == -1) { + /* No need to send rmdir request to other servers, + * as namespace action failed + */ + gf_log (this->name, + ((op_errno != ENOTEMPTY) ? + GF_LOG_ERROR : GF_LOG_DEBUG), + "namespace: path(%s): %s", + local->loc1.path, strerror (op_errno)); + unify_local_wipe (local); + STACK_UNWIND (frame, op_ret, op_errno); + return 0; + } + + local->call_count = priv->child_count; + + for (index = 0; index < priv->child_count; index++) { + STACK_WIND (frame, + unify_rmdir_cbk, + priv->xl_array[index], + priv->xl_array[index]->fops->rmdir, + &local->loc1); + } + + return 0; +} + +/** + * unify_rmdir - + */ +int32_t +unify_rmdir (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + unify_local_t *local = NULL; + + UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + /* Initialization */ + INIT_LOCAL (frame, local); + + loc_copy (&local->loc1, loc); + if (local->loc1.path == NULL) { + gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O"); + STACK_UNWIND (frame, -1, ENOMEM); + return 0; + } + + STACK_WIND (frame, + unify_ns_rmdir_cbk, + NS(this), + NS(this)->fops->rmdir, + loc); + + return 0; +} + +/** + * unify_open_cbk - + */ +int32_t +unify_open_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd) +{ + int32_t callcnt = 0; + unify_local_t *local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret >= 0) { + local->op_ret = op_ret; + if (NS(this) != (xlator_t *)cookie) { + /* Store child node's ptr, used in + all the f*** / FileIO calls */ + fd_ctx_set (fd, this, (uint64_t)(long)cookie); + } + } + if (op_ret == -1) { + local->op_errno = op_errno; + local->failed = 1; + } + callcnt = --local->call_count; + } + UNLOCK (&frame->lock); + + if (!callcnt) { + if ((local->failed == 1) && (local->op_ret >= 0)) { + local->call_count = 1; + /* return -1 to user */ + local->op_ret = -1; + //local->op_errno = EIO; + + if (!fd_ctx_get (local->fd, this, NULL)) { + gf_log (this->name, GF_LOG_ERROR, + "Open success on child node, " + "failed on namespace"); + } else { + gf_log (this->name, GF_LOG_ERROR, + "Open success on namespace, " + "failed on child node"); + } + } + + unify_local_wipe (local); + STACK_UNWIND (frame, local->op_ret, + local->op_errno, local->fd); + } + + return 0; +} + +#ifdef GF_DARWIN_HOST_OS +/** + * unify_create_lookup_cbk - + */ +int32_t +unify_open_lookup_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf, + dict_t *dict) +{ + int32_t callcnt = 0; + int16_t index = 0; + unify_private_t *priv = this->private; + unify_local_t *local = frame->local; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + if ((op_ret == -1) && (op_errno != ENOENT)) { + gf_log (this->name, GF_LOG_ERROR, + "child(%s): path(%s): %s", + priv->xl_array[(long)cookie]->name, + local->loc1.path, strerror (op_errno)); + local->op_errno = op_errno; + } + + if (op_ret >= 0) { + local->op_ret = op_ret; + local->index++; + if (NS(this) == priv->xl_array[(long)cookie]) { + local->list[0] = (int16_t)(long)cookie; + } else { + local->list[1] = (int16_t)(long)cookie; + } + if (S_ISDIR (buf->st_mode)) + local->failed = 1; + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + int16_t file_list[3] = {0,}; + local->op_ret = -1; + + file_list[0] = local->list[0]; + file_list[1] = local->list[1]; + file_list[2] = -1; + + if (local->index != 2) { + /* Lookup failed, can't do open */ + gf_log (this->name, GF_LOG_ERROR, + "%s: present on %d nodes", + local->name, local->index); + + if (local->index < 2) { + unify_local_wipe (local); + gf_log (this->name, GF_LOG_ERROR, + "returning as file found on less " + "than 2 nodes"); + STACK_UNWIND (frame, local->op_ret, + local->op_errno, local->fd); + return 0; + } + } + + if (local->failed) { + /* Open on directory, return EISDIR */ + unify_local_wipe (local); + STACK_UNWIND (frame, -1, EISDIR, local->fd); + return 0; + } + + /* Everything is perfect :) */ + local->call_count = 2; + + for (index = 0; file_list[index] != -1; index++) { + char need_break = (file_list[index+1] == -1); + STACK_WIND_COOKIE (frame, + unify_open_cbk, + priv->xl_array[file_list[index]], + priv->xl_array[file_list[index]], + priv->xl_array[file_list[index]]->fops->open, + &local->loc1, + local->flags, + local->fd); + if (need_break) + break; + } + } + + return 0; +} + + +int32_t +unify_open_readlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + const char *path) +{ + int16_t index = 0; + unify_private_t *priv = this->private; + unify_local_t *local = frame->local; + + if (op_ret == -1) { + STACK_UNWIND (frame, -1, ENOENT); + return 0; + } + + if (path[0] == '/') { + local->name = strdup (path); + ERR_ABORT (local->name); + } else { + char *tmp_str = strdup (local->loc1.path); + char *tmp_base = dirname (tmp_str); + local->name = CALLOC (1, ZR_PATH_MAX); + strcpy (local->name, tmp_base); + strncat (local->name, "/", 1); + strcat (local->name, path); + FREE (tmp_str); + } + + local->list = CALLOC (1, sizeof (int16_t) * 3); + ERR_ABORT (local->list); + local->call_count = priv->child_count + 1; + local->op_ret = -1; + for (index = 0; index <= priv->child_count; index++) { + /* Send the lookup to all the nodes including namespace */ + STACK_WIND_COOKIE (frame, + unify_open_lookup_cbk, + (void *)(long)index, + priv->xl_array[index], + priv->xl_array[index]->fops->lookup, + &local->loc1, + NULL); + } + + return 0; +} +#endif /* GF_DARWIN_HOST_OS */ + +/** + * unify_open - + */ +int32_t +unify_open (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flags, + fd_t *fd) +{ + unify_private_t *priv = this->private; + unify_local_t *local = NULL; + int16_t *list = NULL; + int16_t index = 0; + int16_t file_list[3] = {0,}; + uint64_t tmp_list = 0; + + UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + /* Init */ + INIT_LOCAL (frame, local); + loc_copy (&local->loc1, loc); + local->fd = fd; + local->flags = flags; + inode_ctx_get (loc->inode, this, &tmp_list); + list = (int16_t *)(long)tmp_list; + + local->list = list; + file_list[0] = priv->child_count; /* Thats namespace */ + file_list[2] = -1; + for (index = 0; list[index] != -1; index++) { + local->call_count++; + if (list[index] != priv->child_count) + file_list[1] = list[index]; + } + + if (local->call_count != 2) { + /* If the lookup was done for file */ + gf_log (this->name, GF_LOG_ERROR, + "%s: entry_count is %d", + loc->path, local->call_count); + for (index = 0; local->list[index] != -1; index++) + gf_log (this->name, GF_LOG_ERROR, "%s: found on %s", + loc->path, priv->xl_array[list[index]]->name); + + if (local->call_count < 2) { + gf_log (this->name, GF_LOG_ERROR, + "returning EIO as file found on onlyone node"); + STACK_UNWIND (frame, -1, EIO, fd); + return 0; + } + } + +#ifdef GF_DARWIN_HOST_OS + /* Handle symlink here */ + if (S_ISLNK (loc->inode->st_mode)) { + /* Callcount doesn't matter here */ + STACK_WIND (frame, + unify_open_readlink_cbk, + NS(this), + NS(this)->fops->readlink, + loc, ZR_PATH_MAX); + return 0; + } +#endif /* GF_DARWIN_HOST_OS */ + + local->call_count = 2; + for (index = 0; file_list[index] != -1; index++) { + char need_break = (file_list[index+1] == -1); + STACK_WIND_COOKIE (frame, + unify_open_cbk, + priv->xl_array[file_list[index]], //cookie + priv->xl_array[file_list[index]], + priv->xl_array[file_list[index]]->fops->open, + loc, + flags, + fd); + if (need_break) + break; + } + + return 0; +} + + +int32_t +unify_create_unlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + unify_local_t *local = frame->local; + inode_t *inode = local->loc1.inode; + + unify_local_wipe (local); + + STACK_UNWIND (frame, local->op_ret, local->op_errno, local->fd, + inode, &local->stbuf); + + return 0; +} + +/** + * unify_create_open_cbk - + */ +int32_t +unify_create_open_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd) +{ + int ret = 0; + int32_t callcnt = 0; + unify_local_t *local = frame->local; + inode_t *inode = NULL; + xlator_t *child = NULL; + uint64_t tmp_value = 0; + + LOCK (&frame->lock); + { + if (op_ret >= 0) { + local->op_ret = op_ret; + if (NS(this) != (xlator_t *)cookie) { + /* Store child node's ptr, used in all + the f*** / FileIO calls */ + /* TODO: log on failure */ + ret = fd_ctx_get (fd, this, &tmp_value); + cookie = (void *)(long)tmp_value; + } else { + /* NOTE: open successful on namespace. + * fd's ctx can be used to identify open + * failure on storage subvolume. cool + * ide ;) */ + local->failed = 0; + } + } else { + gf_log (this->name, GF_LOG_ERROR, + "child(%s): path(%s): %s", + ((xlator_t *)cookie)->name, + local->loc1.path, strerror (op_errno)); + local->op_errno = op_errno; + local->failed = 1; + } + callcnt = --local->call_count; + } + UNLOCK (&frame->lock); + + if (!callcnt) { + if (local->failed == 1 && (local->op_ret >= 0)) { + local->call_count = 1; + /* return -1 to user */ + local->op_ret = -1; + local->op_errno = EIO; + local->fd = fd; + local->call_count = 1; + + if (!fd_ctx_get (local->fd, this, &tmp_value)) { + child = (xlator_t *)(long)tmp_value; + + gf_log (this->name, GF_LOG_ERROR, + "Create success on child node, " + "failed on namespace"); + + STACK_WIND (frame, + unify_create_unlink_cbk, + child, + child->fops->unlink, + &local->loc1); + } else { + gf_log (this->name, GF_LOG_ERROR, + "Create success on namespace, " + "failed on child node"); + + STACK_WIND (frame, + unify_create_unlink_cbk, + NS(this), + NS(this)->fops->unlink, + &local->loc1); + } + return 0; + } + inode = local->loc1.inode; + unify_local_wipe (local); + STACK_UNWIND (frame, local->op_ret, local->op_errno, fd, + inode, &local->stbuf); + } + return 0; +} + +/** + * unify_create_lookup_cbk - + */ +int32_t +unify_create_lookup_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf, + dict_t *dict) +{ + int32_t callcnt = 0; + int16_t index = 0; + unify_private_t *priv = this->private; + unify_local_t *local = frame->local; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "child(%s): path(%s): %s", + priv->xl_array[(long)cookie]->name, + local->loc1.path, strerror (op_errno)); + local->op_errno = op_errno; + local->failed = 1; + } + + if (op_ret >= 0) { + local->op_ret = op_ret; + local->list[local->index++] = (int16_t)(long)cookie; + if (NS(this) == priv->xl_array[(long)cookie]) { + local->st_ino = buf->st_ino; + } else { + local->stbuf = *buf; + } + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + int16_t *list = local->list; + int16_t file_list[3] = {0,}; + local->op_ret = -1; + + local->list [local->index] = -1; + file_list[0] = list[0]; + file_list[1] = list[1]; + file_list[2] = -1; + + local->stbuf.st_ino = local->st_ino; + /* TODO: log on failure */ + inode_ctx_put (local->loc1.inode, this, + (uint64_t)(long)local->list); + + if (local->index != 2) { + /* Lookup failed, can't do open */ + gf_log (this->name, GF_LOG_ERROR, + "%s: present on %d nodes", + local->loc1.path, local->index); + file_list[0] = priv->child_count; + for (index = 0; list[index] != -1; index++) { + gf_log (this->name, GF_LOG_ERROR, + "%s: found on %s", local->loc1.path, + priv->xl_array[list[index]]->name); + if (list[index] != priv->child_count) + file_list[1] = list[index]; + } + + if (local->index < 2) { + unify_local_wipe (local); + gf_log (this->name, GF_LOG_ERROR, + "returning EIO as file found on " + "only one node"); + STACK_UNWIND (frame, -1, EIO, + local->fd, inode, NULL); + return 0; + } + } + /* Everything is perfect :) */ + local->call_count = 2; + + for (index = 0; file_list[index] != -1; index++) { + char need_break = (file_list[index+1] == -1); + STACK_WIND_COOKIE (frame, + unify_create_open_cbk, + priv->xl_array[file_list[index]], + priv->xl_array[file_list[index]], + priv->xl_array[file_list[index]]->fops->open, + &local->loc1, + local->flags, + local->fd); + if (need_break) + break; + } + } + + return 0; +} + + +/** + * unify_create_cbk - + */ +int32_t +unify_create_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd, + inode_t *inode, + struct stat *buf) +{ + int ret = 0; + unify_local_t *local = frame->local; + call_frame_t *prev_frame = cookie; + inode_t *tmp_inode = NULL; + + if (op_ret == -1) { + /* send unlink () on Namespace */ + local->op_errno = op_errno; + local->op_ret = -1; + local->call_count = 1; + gf_log (this->name, GF_LOG_ERROR, + "create failed on %s (file %s, error %s), " + "sending unlink to namespace", + prev_frame->this->name, + local->loc1.path, strerror (op_errno)); + + STACK_WIND (frame, + unify_create_unlink_cbk, + NS(this), + NS(this)->fops->unlink, + &local->loc1); + + return 0; + } + + if (op_ret >= 0) { + local->op_ret = op_ret; + local->stbuf = *buf; + /* Just inode number should be from NS node */ + local->stbuf.st_ino = local->st_ino; + + /* TODO: log on failure */ + ret = fd_ctx_set (fd, this, (uint64_t)(long)prev_frame->this); + } + + tmp_inode = local->loc1.inode; + unify_local_wipe (local); + STACK_UNWIND (frame, local->op_ret, local->op_errno, local->fd, + tmp_inode, &local->stbuf); + + return 0; +} + +/** + * unify_ns_create_cbk - + * + */ +int32_t +unify_ns_create_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd, + inode_t *inode, + struct stat *buf) +{ + struct sched_ops *sched_ops = NULL; + xlator_t *sched_xl = NULL; + unify_local_t *local = frame->local; + unify_private_t *priv = this->private; + int16_t *list = NULL; + int16_t index = 0; + + if (op_ret == -1) { + /* No need to send create request to other servers, as + namespace action failed. Handle exclusive create here. */ + if ((op_errno != EEXIST) || + ((op_errno == EEXIST) && + ((local->flags & O_EXCL) == O_EXCL))) { + /* If its just a create call without O_EXCL, + don't do this */ + gf_log (this->name, GF_LOG_ERROR, + "namespace: path(%s): %s", + local->loc1.path, strerror (op_errno)); + unify_local_wipe (local); + STACK_UNWIND (frame, op_ret, op_errno, fd, inode, buf); + return 0; + } + } + + if (op_ret >= 0) { + /* Get the inode number from the NS node */ + local->st_ino = buf->st_ino; + + local->op_ret = -1; + + /* Start the mapping list */ + list = CALLOC (1, sizeof (int16_t) * 3); + ERR_ABORT (list); + inode_ctx_put (inode, this, (uint64_t)(long)list); + list[0] = priv->child_count; + list[2] = -1; + + /* This means, file doesn't exist anywhere in the Filesystem */ + sched_ops = priv->sched_ops; + + /* Send create request to the scheduled node now */ + sched_xl = sched_ops->schedule (this, local->loc1.path); + if (sched_xl == NULL) + { + /* send unlink () on Namespace */ + local->op_errno = ENOTCONN; + local->op_ret = -1; + local->call_count = 1; + gf_log (this->name, GF_LOG_ERROR, + "no node online to schedule create:(file %s) " + "sending unlink to namespace", + (local->loc1.path)?local->loc1.path:""); + + STACK_WIND (frame, + unify_create_unlink_cbk, + NS(this), + NS(this)->fops->unlink, + &local->loc1); + + return 0; + } + + for (index = 0; index < priv->child_count; index++) + if (sched_xl == priv->xl_array[index]) + break; + list[1] = index; + + STACK_WIND (frame, unify_create_cbk, + sched_xl, sched_xl->fops->create, + &local->loc1, local->flags, local->mode, fd); + } else { + /* File already exists, and there is no O_EXCL flag */ + + gf_log (this->name, GF_LOG_DEBUG, + "File(%s) already exists on namespace, sending " + "open instead", local->loc1.path); + + local->list = CALLOC (1, sizeof (int16_t) * 3); + ERR_ABORT (local->list); + local->call_count = priv->child_count + 1; + local->op_ret = -1; + for (index = 0; index <= priv->child_count; index++) { + /* Send lookup() to all nodes including namespace */ + STACK_WIND_COOKIE (frame, + unify_create_lookup_cbk, + (void *)(long)index, + priv->xl_array[index], + priv->xl_array[index]->fops->lookup, + &local->loc1, + NULL); + } + } + return 0; +} + +/** + * unify_create - create a file in global namespace first, so other + * clients can see them. Create the file in storage nodes in background. + */ +int32_t +unify_create (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flags, + mode_t mode, + fd_t *fd) +{ + unify_local_t *local = NULL; + + /* Initialization */ + INIT_LOCAL (frame, local); + local->mode = mode; + local->flags = flags; + local->fd = fd; + + loc_copy (&local->loc1, loc); + if (local->loc1.path == NULL) { + gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O"); + STACK_UNWIND (frame, -1, ENOMEM, fd, loc->inode, NULL); + return 0; + } + + STACK_WIND (frame, + unify_ns_create_cbk, + NS(this), + NS(this)->fops->create, + loc, + flags | O_EXCL, + mode, + fd); + + return 0; +} + + +/** + * unify_opendir_cbk - + */ +int32_t +unify_opendir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd) +{ + STACK_UNWIND (frame, op_ret, op_errno, fd); + + return 0; +} + +/** + * unify_opendir - + */ +int32_t +unify_opendir (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + fd_t *fd) +{ + UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + STACK_WIND (frame, unify_opendir_cbk, + NS(this), NS(this)->fops->opendir, loc, fd); + + return 0; +} + + +/** + * unify_chmod - + */ +int32_t +unify_chmod (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode) +{ + unify_local_t *local = NULL; + unify_private_t *priv = this->private; + int32_t index = 0; + int32_t callcnt = 0; + uint64_t tmp_list = 0; + + UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + /* Initialization */ + INIT_LOCAL (frame, local); + + loc_copy (&local->loc1, loc); + local->st_ino = loc->inode->ino; + + if (S_ISDIR (loc->inode->st_mode)) { + local->call_count = priv->child_count + 1; + + for (index = 0; index < (priv->child_count + 1); index++) { + STACK_WIND (frame, + unify_buf_cbk, + priv->xl_array[index], + priv->xl_array[index]->fops->chmod, + loc, mode); + } + } else { + inode_ctx_get (loc->inode, this, &tmp_list); + local->list = (int16_t *)(long)tmp_list; + + for (index = 0; local->list[index] != -1; index++) { + local->call_count++; + callcnt++; + } + + for (index = 0; local->list[index] != -1; index++) { + STACK_WIND (frame, + unify_buf_cbk, + priv->xl_array[local->list[index]], + priv->xl_array[local->list[index]]->fops->chmod, + loc, + mode); + if (!--callcnt) + break; + } + } + + return 0; +} + +/** + * unify_chown - + */ +int32_t +unify_chown (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + uid_t uid, + gid_t gid) +{ + unify_local_t *local = NULL; + unify_private_t *priv = this->private; + int32_t index = 0; + int32_t callcnt = 0; + uint64_t tmp_list = 0; + + UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + /* Initialization */ + INIT_LOCAL (frame, local); + loc_copy (&local->loc1, loc); + local->st_ino = loc->inode->ino; + + if (S_ISDIR (loc->inode->st_mode)) { + local->call_count = priv->child_count + 1; + + for (index = 0; index < (priv->child_count + 1); index++) { + STACK_WIND (frame, + unify_buf_cbk, + priv->xl_array[index], + priv->xl_array[index]->fops->chown, + loc, uid, gid); + } + } else { + inode_ctx_get (loc->inode, this, &tmp_list); + local->list = (int16_t *)(long)tmp_list; + + for (index = 0; local->list[index] != -1; index++) { + local->call_count++; + callcnt++; + } + + for (index = 0; local->list[index] != -1; index++) { + STACK_WIND (frame, + unify_buf_cbk, + priv->xl_array[local->list[index]], + priv->xl_array[local->list[index]]->fops->chown, + loc, uid, gid); + if (!--callcnt) + break; + } + } + + return 0; +} + + +/** + * unify_truncate_cbk - + */ +int32_t +unify_truncate_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + int32_t callcnt = 0; + unify_private_t *priv = this->private; + unify_local_t *local = frame->local; + call_frame_t *prev_frame = cookie; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "child(%s): path(%s): %s", + prev_frame->this->name, + (local->loc1.path)?local->loc1.path:"", + strerror (op_errno)); + local->op_errno = op_errno; + if (!((op_errno == ENOENT) && priv->optimist)) + local->op_ret = -1; + } + + if (op_ret >= 0) { + if (NS (this) == prev_frame->this) { + local->st_ino = buf->st_ino; + /* If the entry is directory, get the + stat from NS node */ + if (S_ISDIR (buf->st_mode) || + !local->stbuf.st_blksize) { + local->stbuf = *buf; + } + } + + if ((!S_ISDIR (buf->st_mode)) && + (NS (this) != prev_frame->this)) { + /* If file, take the stat info from + Storage node. */ + local->stbuf = *buf; + } + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + if (local->st_ino) + local->stbuf.st_ino = local->st_ino; + else + local->op_ret = -1; + unify_local_wipe (local); + STACK_UNWIND (frame, local->op_ret, local->op_errno, + &local->stbuf); + } + + return 0; +} + +/** + * unify_truncate - + */ +int32_t +unify_truncate (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + off_t offset) +{ + unify_local_t *local = NULL; + unify_private_t *priv = this->private; + int32_t index = 0; + int32_t callcnt = 0; + uint64_t tmp_list = 0; + + UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + /* Initialization */ + INIT_LOCAL (frame, local); + loc_copy (&local->loc1, loc); + local->st_ino = loc->inode->ino; + + if (S_ISDIR (loc->inode->st_mode)) { + local->call_count = 1; + + STACK_WIND (frame, + unify_buf_cbk, + NS(this), + NS(this)->fops->stat, + loc); + } else { + local->op_ret = 0; + inode_ctx_get (loc->inode, this, &tmp_list); + local->list = (int16_t *)(long)tmp_list; + + for (index = 0; local->list[index] != -1; index++) { + local->call_count++; + callcnt++; + } + + /* Don't send truncate to NS node */ + STACK_WIND (frame, unify_truncate_cbk, NS(this), + NS(this)->fops->stat, loc); + callcnt--; + + for (index = 0; local->list[index] != -1; index++) { + if (NS(this) != priv->xl_array[local->list[index]]) { + STACK_WIND (frame, + unify_truncate_cbk, + priv->xl_array[local->list[index]], + priv->xl_array[local->list[index]]->fops->truncate, + loc, + offset); + if (!--callcnt) + break; + } + } + } + + return 0; +} + +/** + * unify_utimens - + */ +int32_t +unify_utimens (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + struct timespec tv[2]) +{ + unify_local_t *local = NULL; + unify_private_t *priv = this->private; + int32_t index = 0; + int32_t callcnt = 0; + uint64_t tmp_list = 0; + + UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + /* Initialization */ + INIT_LOCAL (frame, local); + loc_copy (&local->loc1, loc); + local->st_ino = loc->inode->ino; + + if (S_ISDIR (loc->inode->st_mode)) { + local->call_count = priv->child_count + 1; + + for (index = 0; index < (priv->child_count + 1); index++) { + STACK_WIND (frame, + unify_buf_cbk, + priv->xl_array[index], + priv->xl_array[index]->fops->utimens, + loc, tv); + } + } else { + inode_ctx_get (loc->inode, this, &tmp_list); + local->list = (int16_t *)(long)tmp_list; + + for (index = 0; local->list[index] != -1; index++) { + local->call_count++; + callcnt++; + } + + for (index = 0; local->list[index] != -1; index++) { + STACK_WIND (frame, + unify_buf_cbk, + priv->xl_array[local->list[index]], + priv->xl_array[local->list[index]]->fops->utimens, + loc, + tv); + if (!--callcnt) + break; + } + } + + return 0; +} + +/** + * unify_readlink_cbk - + */ +int32_t +unify_readlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + const char *path) +{ + STACK_UNWIND (frame, op_ret, op_errno, path); + return 0; +} + +/** + * unify_readlink - Read the link only from the storage node. + */ +int32_t +unify_readlink (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + size_t size) +{ + unify_private_t *priv = this->private; + int32_t entry_count = 0; + int16_t *list = NULL; + int16_t index = 0; + uint64_t tmp_list = 0; + + UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + inode_ctx_get (loc->inode, this, &tmp_list); + list = (int16_t *)(long)tmp_list; + + for (index = 0; list[index] != -1; index++) + entry_count++; + + if (entry_count >= 2) { + for (index = 0; list[index] != -1; index++) { + if (priv->xl_array[list[index]] != NS(this)) { + STACK_WIND (frame, + unify_readlink_cbk, + priv->xl_array[list[index]], + priv->xl_array[list[index]]->fops->readlink, + loc, + size); + break; + } + } + } else { + gf_log (this->name, GF_LOG_ERROR, + "returning ENOENT, no softlink files found " + "on storage node"); + STACK_UNWIND (frame, -1, ENOENT, NULL); + } + + return 0; +} + + +/** + * unify_unlink_cbk - + */ +int32_t +unify_unlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + int32_t callcnt = 0; + unify_private_t *priv = this->private; + unify_local_t *local = frame->local; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + if (op_ret == 0 || ((op_errno == ENOENT) && priv->optimist)) + local->op_ret = 0; + if (op_ret == -1) + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + if (!callcnt) { + unify_local_wipe (local); + STACK_UNWIND (frame, local->op_ret, local->op_errno); + } + + return 0; +} + + +/** + * unify_unlink - + */ +int32_t +unify_unlink (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + unify_private_t *priv = this->private; + unify_local_t *local = NULL; + int16_t *list = NULL; + int16_t index = 0; + uint64_t tmp_list = 0; + + UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + /* Initialization */ + INIT_LOCAL (frame, local); + loc_copy (&local->loc1, loc); + + inode_ctx_get (loc->inode, this, &tmp_list); + list = (int16_t *)(long)tmp_list; + + for (index = 0; list[index] != -1; index++) + local->call_count++; + + if (local->call_count) { + for (index = 0; list[index] != -1; index++) { + char need_break = (list[index+1] == -1); + STACK_WIND (frame, + unify_unlink_cbk, + priv->xl_array[list[index]], + priv->xl_array[list[index]]->fops->unlink, + loc); + if (need_break) + break; + } + } else { + gf_log (this->name, GF_LOG_ERROR, + "%s: returning ENOENT", loc->path); + STACK_UNWIND (frame, -1, ENOENT); + } + + return 0; +} + + +/** + * unify_readv_cbk - + */ +int32_t +unify_readv_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct iovec *vector, + int32_t count, + struct stat *stbuf) +{ + STACK_UNWIND (frame, op_ret, op_errno, vector, count, stbuf); + return 0; +} + +/** + * unify_readv - + */ +int32_t +unify_readv (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, + off_t offset) +{ + UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd); + xlator_t *child = NULL; + uint64_t tmp_child = 0; + + fd_ctx_get (fd, this, &tmp_child); + child = (xlator_t *)(long)tmp_child; + + STACK_WIND (frame, + unify_readv_cbk, + child, + child->fops->readv, + fd, + size, + offset); + + + return 0; +} + +/** + * unify_writev_cbk - + */ +int32_t +unify_writev_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *stbuf) +{ + STACK_UNWIND (frame, op_ret, op_errno, stbuf); + return 0; +} + +/** + * unify_writev - + */ +int32_t +unify_writev (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + struct iovec *vector, + int32_t count, + off_t off) +{ + UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd); + xlator_t *child = NULL; + uint64_t tmp_child = 0; + + fd_ctx_get (fd, this, &tmp_child); + child = (xlator_t *)(long)tmp_child; + + STACK_WIND (frame, + unify_writev_cbk, + child, + child->fops->writev, + fd, + vector, + count, + off); + + return 0; +} + +/** + * unify_ftruncate - + */ +int32_t +unify_ftruncate (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + off_t offset) +{ + xlator_t *child = NULL; + unify_local_t *local = NULL; + uint64_t tmp_child = 0; + + UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR(fd); + + /* Initialization */ + INIT_LOCAL (frame, local); + local->op_ret = 0; + + fd_ctx_get (fd, this, &tmp_child); + child = (xlator_t *)(long)tmp_child; + + local->call_count = 2; + + STACK_WIND (frame, unify_truncate_cbk, + child, child->fops->ftruncate, + fd, offset); + + STACK_WIND (frame, unify_truncate_cbk, + NS(this), NS(this)->fops->fstat, + fd); + + return 0; +} + + +/** + * unify_fchmod - + */ +int32_t +unify_fchmod (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + mode_t mode) +{ + unify_local_t *local = NULL; + xlator_t *child = NULL; + uint64_t tmp_child = 0; + + UNIFY_CHECK_FD_AND_UNWIND_ON_ERR(fd); + + /* Initialization */ + INIT_LOCAL (frame, local); + local->st_ino = fd->inode->ino; + + if (!fd_ctx_get (fd, this, &tmp_child)) { + /* If its set, then its file */ + child = (xlator_t *)(long)tmp_child; + + local->call_count = 2; + + STACK_WIND (frame, unify_buf_cbk, child, + child->fops->fchmod, fd, mode); + + STACK_WIND (frame, unify_buf_cbk, NS(this), + NS(this)->fops->fchmod, fd, mode); + + } else { + /* this is an directory */ + local->call_count = 1; + + STACK_WIND (frame, unify_buf_cbk, + NS(this), NS(this)->fops->fchmod, fd, mode); + } + + return 0; +} + +/** + * unify_fchown - + */ +int32_t +unify_fchown (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + uid_t uid, + gid_t gid) +{ + unify_local_t *local = NULL; + xlator_t *child = NULL; + uint64_t tmp_child = 0; + + UNIFY_CHECK_FD_AND_UNWIND_ON_ERR(fd); + + /* Initialization */ + INIT_LOCAL (frame, local); + local->st_ino = fd->inode->ino; + + if (!fd_ctx_get (fd, this, &tmp_child)) { + /* If its set, then its file */ + child = (xlator_t *)(long)tmp_child; + + local->call_count = 2; + + STACK_WIND (frame, unify_buf_cbk, child, + child->fops->fchown, fd, uid, gid); + + STACK_WIND (frame, unify_buf_cbk, NS(this), + NS(this)->fops->fchown, fd, uid, gid); + } else { + local->call_count = 1; + + STACK_WIND (frame, unify_buf_cbk, + NS(this), NS(this)->fops->fchown, + fd, uid, gid); + } + + return 0; +} + +/** + * unify_flush_cbk - + */ +int32_t +unify_flush_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + +/** + * unify_flush - + */ +int32_t +unify_flush (call_frame_t *frame, + xlator_t *this, + fd_t *fd) +{ + UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd); + xlator_t *child = NULL; + uint64_t tmp_child = 0; + + fd_ctx_get (fd, this, &tmp_child); + child = (xlator_t *)(long)tmp_child; + + STACK_WIND (frame, unify_flush_cbk, child, + child->fops->flush, fd); + + return 0; +} + + +/** + * unify_fsync_cbk - + */ +int32_t +unify_fsync_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + +/** + * unify_fsync - + */ +int32_t +unify_fsync (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t flags) +{ + UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd); + xlator_t *child = NULL; + uint64_t tmp_child = 0; + + fd_ctx_get (fd, this, &tmp_child); + child = (xlator_t *)(long)tmp_child; + + STACK_WIND (frame, unify_fsync_cbk, child, + child->fops->fsync, fd, flags); + + return 0; +} + +/** + * unify_fstat - Send fstat FOP to Namespace only if its directory, and to + * both namespace and the storage node if its a file. + */ +int32_t +unify_fstat (call_frame_t *frame, + xlator_t *this, + fd_t *fd) +{ + unify_local_t *local = NULL; + xlator_t *child = NULL; + uint64_t tmp_child = 0; + + UNIFY_CHECK_FD_AND_UNWIND_ON_ERR(fd); + + INIT_LOCAL (frame, local); + local->st_ino = fd->inode->ino; + + if (!fd_ctx_get (fd, this, &tmp_child)) { + /* If its set, then its file */ + child = (xlator_t *)(long)tmp_child; + local->call_count = 2; + + STACK_WIND (frame, unify_buf_cbk, child, + child->fops->fstat, fd); + + STACK_WIND (frame, unify_buf_cbk, NS(this), + NS(this)->fops->fstat, fd); + + } else { + /* this is an directory */ + local->call_count = 1; + STACK_WIND (frame, unify_buf_cbk, NS(this), + NS(this)->fops->fstat, fd); + } + + return 0; +} + +/** + * unify_getdents_cbk - + */ +int32_t +unify_getdents_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dir_entry_t *entry, + int32_t count) +{ + STACK_UNWIND (frame, op_ret, op_errno, entry, count); + return 0; +} + +/** + * unify_getdents - send the FOP request to all the nodes. + */ +int32_t +unify_getdents (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, + off_t offset, + int32_t flag) +{ + UNIFY_CHECK_FD_AND_UNWIND_ON_ERR (fd); + + STACK_WIND (frame, unify_getdents_cbk, NS(this), + NS(this)->fops->getdents, fd, size, offset, flag); + + return 0; +} + + +/** + * unify_readdir_cbk - + */ +int32_t +unify_readdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + gf_dirent_t *buf) +{ + STACK_UNWIND (frame, op_ret, op_errno, buf); + + return 0; +} + +/** + * unify_readdir - send the FOP request to all the nodes. + */ +int32_t +unify_readdir (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, + off_t offset) +{ + UNIFY_CHECK_FD_AND_UNWIND_ON_ERR (fd); + + STACK_WIND (frame, unify_readdir_cbk, NS(this), + NS(this)->fops->readdir, fd, size, offset); + + return 0; +} + + +/** + * unify_fsyncdir_cbk - + */ +int32_t +unify_fsyncdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, op_ret, op_errno); + + return 0; +} + +/** + * unify_fsyncdir - + */ +int32_t +unify_fsyncdir (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t flags) +{ + UNIFY_CHECK_FD_AND_UNWIND_ON_ERR (fd); + + STACK_WIND (frame, unify_fsyncdir_cbk, + NS(this), NS(this)->fops->fsyncdir, fd, flags); + + return 0; +} + +/** + * unify_lk_cbk - UNWIND frame with the proper return arguments. + */ +int32_t +unify_lk_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct flock *lock) +{ + STACK_UNWIND (frame, op_ret, op_errno, lock); + return 0; +} + +/** + * unify_lk - Send it to all the storage nodes, (should be 1) which has file. + */ +int32_t +unify_lk (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t cmd, + struct flock *lock) +{ + UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd); + xlator_t *child = NULL; + uint64_t tmp_child = 0; + + fd_ctx_get (fd, this, &tmp_child); + child = (xlator_t *)(long)tmp_child; + + STACK_WIND (frame, unify_lk_cbk, child, + child->fops->lk, fd, cmd, lock); + + return 0; +} + + +int32_t +unify_setxattr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno); + +static int32_t +unify_setxattr_file_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + unify_private_t *private = this->private; + unify_local_t *local = frame->local; + xlator_t *sched_xl = NULL; + struct sched_ops *sched_ops = NULL; + + if (op_ret == -1) { + if (!ENOTSUP) + gf_log (this->name, GF_LOG_ERROR, + "setxattr with XATTR_CREATE on ns: " + "path(%s) key(%s): %s", + local->loc1.path, local->name, + strerror (op_errno)); + unify_local_wipe (local); + STACK_UNWIND (frame, op_ret, op_errno); + return 0; + } + + LOCK (&frame->lock); + { + local->failed = 0; + local->op_ret = 0; + local->op_errno = 0; + local->call_count = 1; + } + UNLOCK (&frame->lock); + + /* schedule XATTR_CREATE on one of the child node */ + sched_ops = private->sched_ops; + + /* Send create request to the scheduled node now */ + sched_xl = sched_ops->schedule (this, local->name); + if (!sched_xl) { + STACK_UNWIND (frame, -1, ENOTCONN); + return 0; + } + + STACK_WIND (frame, + unify_setxattr_cbk, + sched_xl, + sched_xl->fops->setxattr, + &local->loc1, + local->dict, + local->flags); + return 0; +} + +/** + * unify_setxattr_cbk - When all the child nodes return, UNWIND frame. + */ +int32_t +unify_setxattr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + int32_t callcnt = 0; + unify_local_t *local = frame->local; + call_frame_t *prev_frame = cookie; + dict_t *dict = NULL; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret == -1) { + gf_log (this->name, (((op_errno == ENOENT) || + (op_errno == ENOTSUP))? + GF_LOG_DEBUG : GF_LOG_ERROR), + "child(%s): path(%s): %s", + prev_frame->this->name, + (local->loc1.path)?local->loc1.path:"", + strerror (op_errno)); + if (local->failed == -1) { + local->failed = 1; + } + local->op_errno = op_errno; + } else { + local->failed = 0; + local->op_ret = op_ret; + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + if (local->failed && local->name && + ZR_FILE_CONTENT_REQUEST(local->name)) { + dict = get_new_dict (); + dict_set (dict, local->dict->members_list->key, + data_from_dynptr(NULL, 0)); + dict_ref (dict); + + local->call_count = 1; + + STACK_WIND (frame, + unify_setxattr_file_cbk, + NS(this), + NS(this)->fops->setxattr, + &local->loc1, + dict, + XATTR_CREATE); + + dict_unref (dict); + return 0; + } + + unify_local_wipe (local); + STACK_UNWIND (frame, local->op_ret, local->op_errno); + } + + return 0; +} + +/** + * unify_sexattr - This function should be sent to all the storage nodes, + * which contains the file, (excluding namespace). + */ +int32_t +unify_setxattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + dict_t *dict, + int32_t flags) +{ + unify_private_t *priv = this->private; + unify_local_t *local = NULL; + int16_t *list = NULL; + int16_t index = 0; + int32_t call_count = 0; + uint64_t tmp_list = 0; + data_pair_t *trav = dict->members_list; + + UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + /* Initialization */ + INIT_LOCAL (frame, local); + local->failed = -1; + loc_copy (&local->loc1, loc); + + if (S_ISDIR (loc->inode->st_mode)) { + + if (trav && trav->key && ZR_FILE_CONTENT_REQUEST(trav->key)) { + /* direct the storage xlators to change file + content only if file exists */ + local->flags = flags; + local->dict = dict; + local->name = strdup (trav->key); + flags |= XATTR_REPLACE; + } + + local->call_count = priv->child_count; + for (index = 0; index < priv->child_count; index++) { + STACK_WIND (frame, + unify_setxattr_cbk, + priv->xl_array[index], + priv->xl_array[index]->fops->setxattr, + loc, dict, flags); + } + return 0; + } + + inode_ctx_get (loc->inode, this, &tmp_list); + list = (int16_t *)(long)tmp_list; + + for (index = 0; list[index] != -1; index++) { + if (NS(this) != priv->xl_array[list[index]]) { + local->call_count++; + call_count++; + } + } + + if (local->call_count) { + for (index = 0; list[index] != -1; index++) { + if (priv->xl_array[list[index]] != NS(this)) { + STACK_WIND (frame, + unify_setxattr_cbk, + priv->xl_array[list[index]], + priv->xl_array[list[index]]->fops->setxattr, + loc, + dict, + flags); + if (!--call_count) + break; + } + } + return 0; + } + + /* No entry in storage nodes */ + gf_log (this->name, GF_LOG_DEBUG, + "returning ENOENT, file not found on storage node."); + STACK_UNWIND (frame, -1, ENOENT); + + return 0; +} + + +/** + * unify_getxattr_cbk - This function is called from only one child, so, no + * need of any lock or anything else, just send it to above layer + */ +int32_t +unify_getxattr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *value) +{ + int32_t callcnt = 0; + dict_t *local_value = NULL; + unify_local_t *local = frame->local; + call_frame_t *prev_frame = cookie; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret == -1) { + local->op_errno = op_errno; + gf_log (this->name, + (((op_errno == ENOENT) || + (op_errno == ENODATA) || + (op_errno == ENOTSUP)) ? + GF_LOG_DEBUG : GF_LOG_ERROR), + "child(%s): path(%s): %s", + prev_frame->this->name, + (local->loc1.path)?local->loc1.path:"", + strerror (op_errno)); + } else { + if (!local->dict) + local->dict = dict_ref (value); + local->op_ret = op_ret; + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + local_value = local->dict; + local->dict = NULL; + + STACK_UNWIND (frame, local->op_ret, local->op_errno, + local_value); + + if (local_value) + dict_unref (local_value); + } + + return 0; +} + + +/** + * unify_getxattr - This FOP is sent to only the storage node. + */ +int32_t +unify_getxattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + const char *name) +{ + unify_private_t *priv = this->private; + int16_t *list = NULL; + int16_t index = 0; + int16_t count = 0; + unify_local_t *local = NULL; + uint64_t tmp_list = 0; + + UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + INIT_LOCAL (frame, local); + + if (S_ISDIR (loc->inode->st_mode)) { + local->call_count = priv->child_count; + for (index = 0; index < priv->child_count; index++) + STACK_WIND (frame, + unify_getxattr_cbk, + priv->xl_array[index], + priv->xl_array[index]->fops->getxattr, + loc, + name); + return 0; + } + + inode_ctx_get (loc->inode, this, &tmp_list); + list = (int16_t *)(long)tmp_list; + + for (index = 0; list[index] != -1; index++) { + if (NS(this) != priv->xl_array[list[index]]) { + local->call_count++; + count++; + } + } + + if (count) { + for (index = 0; list[index] != -1; index++) { + if (priv->xl_array[list[index]] != NS(this)) { + STACK_WIND (frame, + unify_getxattr_cbk, + priv->xl_array[list[index]], + priv->xl_array[list[index]]->fops->getxattr, + loc, + name); + if (!--count) + break; + } + } + } else { + dict_t *tmp_dict = get_new_dict (); + gf_log (this->name, GF_LOG_DEBUG, + "%s: returning ENODATA, no file found on storage node", + loc->path); + STACK_UNWIND (frame, -1, ENODATA, tmp_dict); + dict_destroy (tmp_dict); + } + + return 0; +} + +/** + * unify_removexattr_cbk - Wait till all the child node returns the call + * and then UNWIND to above layer. + */ +int32_t +unify_removexattr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + int32_t callcnt = 0; + unify_local_t *local = frame->local; + call_frame_t *prev_frame = cookie; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + if (op_ret == -1) { + local->op_errno = op_errno; + if (op_errno != ENOTSUP) + gf_log (this->name, GF_LOG_ERROR, + "child(%s): path(%s): %s", + prev_frame->this->name, + local->loc1.path, strerror (op_errno)); + } else { + local->op_ret = op_ret; + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + STACK_UNWIND (frame, local->op_ret, local->op_errno); + } + + return 0; +} + +/** + * unify_removexattr - Send it to all the child nodes which has the files. + */ +int32_t +unify_removexattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + const char *name) +{ + unify_private_t *priv = this->private; + unify_local_t *local = NULL; + int16_t *list = NULL; + int16_t index = 0; + int32_t call_count = 0; + uint64_t tmp_list = 0; + + UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + /* Initialization */ + INIT_LOCAL (frame, local); + + if (S_ISDIR (loc->inode->st_mode)) { + local->call_count = priv->child_count; + for (index = 0; index < priv->child_count; index++) + STACK_WIND (frame, + unify_removexattr_cbk, + priv->xl_array[index], + priv->xl_array[index]->fops->removexattr, + loc, + name); + + return 0; + } + + inode_ctx_get (loc->inode, this, &tmp_list); + list = (int16_t *)(long)tmp_list; + + for (index = 0; list[index] != -1; index++) { + if (NS(this) != priv->xl_array[list[index]]) { + local->call_count++; + call_count++; + } + } + + if (local->call_count) { + for (index = 0; list[index] != -1; index++) { + if (priv->xl_array[list[index]] != NS(this)) { + STACK_WIND (frame, + unify_removexattr_cbk, + priv->xl_array[list[index]], + priv->xl_array[list[index]]->fops->removexattr, + loc, + name); + if (!--call_count) + break; + } + } + return 0; + } + + gf_log (this->name, GF_LOG_DEBUG, + "%s: returning ENOENT, not found on storage node.", loc->path); + STACK_UNWIND (frame, -1, ENOENT); + + return 0; +} + + +int32_t +unify_mknod_unlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + unify_local_t *local = frame->local; + + if (op_ret == -1) + gf_log (this->name, GF_LOG_ERROR, + "%s: %s", local->loc1.path, strerror (op_errno)); + + unify_local_wipe (local); + /* No log required here as this -1 is for mknod call */ + STACK_UNWIND (frame, -1, local->op_errno, NULL, NULL); + return 0; +} + +/** + * unify_mknod_cbk - + */ +int32_t +unify_mknod_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf) +{ + unify_local_t *local = frame->local; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "mknod failed on storage node, sending unlink to " + "namespace"); + local->op_errno = op_errno; + STACK_WIND (frame, + unify_mknod_unlink_cbk, + NS(this), + NS(this)->fops->unlink, + &local->loc1); + return 0; + } + + local->stbuf = *buf; + local->stbuf.st_ino = local->st_ino; + unify_local_wipe (local); + STACK_UNWIND (frame, op_ret, op_errno, inode, &local->stbuf); + return 0; +} + +/** + * unify_ns_mknod_cbk - + */ +int32_t +unify_ns_mknod_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf) +{ + struct sched_ops *sched_ops = NULL; + xlator_t *sched_xl = NULL; + unify_local_t *local = frame->local; + unify_private_t *priv = this->private; + int16_t *list = NULL; + int16_t index = 0; + call_frame_t *prev_frame = cookie; + + if (op_ret == -1) { + /* No need to send mknod request to other servers, + * as namespace action failed + */ + gf_log (this->name, GF_LOG_ERROR, + "child(%s): path(%s): %s", + prev_frame->this->name, local->loc1.path, + strerror (op_errno)); + unify_local_wipe (local); + STACK_UNWIND (frame, op_ret, op_errno, inode, buf); + return 0; + } + + /* Create one inode for this entry */ + local->op_ret = 0; + local->stbuf = *buf; + local->st_ino = buf->st_ino; + + list = CALLOC (1, sizeof (int16_t) * 3); + ERR_ABORT (list); + list[0] = priv->child_count; + list[2] = -1; + inode_ctx_put (inode, this, (uint64_t)(long)list); + + sched_ops = priv->sched_ops; + + /* Send mknod request to scheduled node now */ + sched_xl = sched_ops->schedule (this, local->loc1.path); + if (!sched_xl) { + gf_log (this->name, GF_LOG_ERROR, + "mknod failed on storage node, no node online " + "at the moment, sending unlink to NS"); + local->op_errno = ENOTCONN; + STACK_WIND (frame, + unify_mknod_unlink_cbk, + NS(this), + NS(this)->fops->unlink, + &local->loc1); + + return 0; + } + + for (index = 0; index < priv->child_count; index++) + if (sched_xl == priv->xl_array[index]) + break; + list[1] = index; + + STACK_WIND (frame, unify_mknod_cbk, + sched_xl, sched_xl->fops->mknod, + &local->loc1, local->mode, local->dev); + + return 0; +} + +/** + * unify_mknod - Create a device on namespace first, and later create on + * the storage node. + */ +int32_t +unify_mknod (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode, + dev_t rdev) +{ + unify_local_t *local = NULL; + + /* Initialization */ + INIT_LOCAL (frame, local); + local->mode = mode; + local->dev = rdev; + loc_copy (&local->loc1, loc); + if (local->loc1.path == NULL) { + gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O"); + STACK_UNWIND (frame, -1, ENOMEM, loc->inode, NULL); + return 0; + } + + STACK_WIND (frame, + unify_ns_mknod_cbk, + NS(this), + NS(this)->fops->mknod, + loc, + mode, + rdev); + + return 0; +} + +int32_t +unify_symlink_unlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + unify_local_t *local = frame->local; + if (op_ret == -1) + gf_log (this->name, GF_LOG_ERROR, + "%s: %s", local->loc1.path, strerror (op_errno)); + + unify_local_wipe (local); + STACK_UNWIND (frame, -1, local->op_errno, NULL, NULL); + return 0; +} + +/** + * unify_symlink_cbk - + */ +int32_t +unify_symlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf) +{ + unify_local_t *local = frame->local; + + if (op_ret == -1) { + /* Symlink on storage node failed, hence send unlink + to the NS node */ + local->op_errno = op_errno; + gf_log (this->name, GF_LOG_ERROR, + "symlink on storage node failed, sending unlink " + "to namespace"); + + STACK_WIND (frame, + unify_symlink_unlink_cbk, + NS(this), + NS(this)->fops->unlink, + &local->loc1); + + return 0; + } + + local->stbuf = *buf; + local->stbuf.st_ino = local->st_ino; + unify_local_wipe (local); + STACK_UNWIND (frame, op_ret, op_errno, inode, &local->stbuf); + + return 0; +} + +/** + * unify_ns_symlink_cbk - + */ +int32_t +unify_ns_symlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf) +{ + + struct sched_ops *sched_ops = NULL; + xlator_t *sched_xl = NULL; + int16_t *list = NULL; + unify_local_t *local = frame->local; + unify_private_t *priv = this->private; + int16_t index = 0; + + if (op_ret == -1) { + /* No need to send symlink request to other servers, + * as namespace action failed + */ + gf_log (this->name, GF_LOG_ERROR, + "namespace: path(%s): %s", + local->loc1.path, strerror (op_errno)); + unify_local_wipe (local); + STACK_UNWIND (frame, op_ret, op_errno, NULL, buf); + return 0; + } + + /* Create one inode for this entry */ + local->op_ret = 0; + local->st_ino = buf->st_ino; + + /* Start the mapping list */ + + list = CALLOC (1, sizeof (int16_t) * 3); + ERR_ABORT (list); + list[0] = priv->child_count; //namespace's index + list[2] = -1; + inode_ctx_put (inode, this, (uint64_t)(long)list); + + sched_ops = priv->sched_ops; + + /* Send symlink request to all the nodes now */ + sched_xl = sched_ops->schedule (this, local->loc1.path); + if (!sched_xl) { + /* Symlink on storage node failed, hence send unlink + to the NS node */ + local->op_errno = ENOTCONN; + gf_log (this->name, GF_LOG_ERROR, + "symlink on storage node failed, no node online, " + "sending unlink to namespace"); + + STACK_WIND (frame, + unify_symlink_unlink_cbk, + NS(this), + NS(this)->fops->unlink, + &local->loc1); + + return 0; + } + + for (index = 0; index < priv->child_count; index++) + if (sched_xl == priv->xl_array[index]) + break; + list[1] = index; + + STACK_WIND (frame, + unify_symlink_cbk, + sched_xl, + sched_xl->fops->symlink, + local->name, + &local->loc1); + + return 0; +} + +/** + * unify_symlink - + */ +int32_t +unify_symlink (call_frame_t *frame, + xlator_t *this, + const char *linkpath, + loc_t *loc) +{ + unify_local_t *local = NULL; + + /* Initialization */ + INIT_LOCAL (frame, local); + loc_copy (&local->loc1, loc); + local->name = strdup (linkpath); + + if ((local->name == NULL) || + (local->loc1.path == NULL)) { + gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O"); + STACK_UNWIND (frame, -1, ENOMEM, loc->inode, NULL); + return 0; + } + + STACK_WIND (frame, + unify_ns_symlink_cbk, + NS(this), + NS(this)->fops->symlink, + linkpath, + loc); + + return 0; +} + + +int32_t +unify_rename_unlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + int32_t callcnt = 0; + unify_local_t *local = frame->local; + call_frame_t *prev_frame = cookie; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "child(%s): path(%s -> %s): %s", + prev_frame->this->name, + local->loc1.path, local->loc2.path, + strerror (op_errno)); + + } + LOCK (&frame->lock); + { + callcnt = --local->call_count; + } + UNLOCK (&frame->lock); + + if (!callcnt) { + local->stbuf.st_ino = local->st_ino; + unify_local_wipe (local); + STACK_UNWIND (frame, local->op_ret, local->op_errno, + &local->stbuf); + } + return 0; +} + +int32_t +unify_ns_rename_undo_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + unify_local_t *local = frame->local; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "namespace: path(%s -> %s): %s", + local->loc1.path, local->loc2.path, + strerror (op_errno)); + } + + local->stbuf.st_ino = local->st_ino; + unify_local_wipe (local); + STACK_UNWIND (frame, local->op_ret, local->op_errno, &local->stbuf); + return 0; +} + +int32_t +unify_rename_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + int32_t index = 0; + int32_t callcnt = 0; + int16_t *list = NULL; + unify_private_t *priv = this->private; + unify_local_t *local = frame->local; + call_frame_t *prev_frame = cookie; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + if (op_ret >= 0) { + if (!S_ISDIR (buf->st_mode)) + local->stbuf = *buf; + local->op_ret = op_ret; + } else { + gf_log (this->name, GF_LOG_ERROR, + "child(%s): path(%s -> %s): %s", + prev_frame->this->name, + local->loc1.path, local->loc2.path, + strerror (op_errno)); + local->op_errno = op_errno; + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + local->stbuf.st_ino = local->st_ino; + if (S_ISDIR (local->loc1.inode->st_mode)) { + unify_local_wipe (local); + STACK_UNWIND (frame, local->op_ret, local->op_errno, &local->stbuf); + return 0; + } + + if (local->op_ret == -1) { + /* TODO: check this logic */ + + /* Rename failed in storage node, successful on NS, + * hence, rename back the entries in NS */ + /* NOTE: this will be done only if the destination + * doesn't exists, if the destination exists, the + * job of correcting NS is left to self-heal + */ + if (!local->index) { + loc_t tmp_oldloc = { + /* its actual 'newloc->path' */ + .path = local->loc2.path, + .inode = local->loc1.inode, + .parent = local->loc2.parent + }; + + loc_t tmp_newloc = { + /* Actual 'oldloc->path' */ + .path = local->loc1.path, + .parent = local->loc1.parent + }; + + gf_log (this->name, GF_LOG_ERROR, + "rename succussful on namespace, on " + "stroage node failed, reverting back"); + + STACK_WIND (frame, + unify_ns_rename_undo_cbk, + NS(this), + NS(this)->fops->rename, + &tmp_oldloc, + &tmp_newloc); + return 0; + } + } else { + /* Rename successful on storage nodes */ + + int32_t idx = 0; + int16_t *tmp_list = NULL; + uint64_t tmp_list_int64 = 0; + if (local->loc2.inode) { + inode_ctx_get (local->loc2.inode, + this, &tmp_list_int64); + list = (int16_t *)(long)tmp_list_int64; + + } + + if (list) { + for (index = 0; list[index] != -1; index++); + tmp_list = CALLOC (1, index * 2); + memcpy (tmp_list, list, index * 2); + + for (index = 0; list[index] != -1; index++) { + /* TODO: Check this logic. */ + /* If the destination file exists in + * the same storage node where we sent + * 'rename' call, no need to send + * unlink + */ + for (idx = 0; + local->list[idx] != -1; idx++) { + if (tmp_list[index] == local->list[idx]) { + tmp_list[index] = priv->child_count; + continue; + } + } + + if (NS(this) != priv->xl_array[tmp_list[index]]) { + local->call_count++; + callcnt++; + } + } + + if (local->call_count) { + if (callcnt > 1) + gf_log (this->name, + GF_LOG_ERROR, + "%s->%s: more (%d) " + "subvolumes have the " + "newloc entry", + local->loc1.path, + local->loc2.path, + callcnt); + + for (index=0; + tmp_list[index] != -1; index++) { + if (NS(this) != priv->xl_array[tmp_list[index]]) { + STACK_WIND (frame, + unify_rename_unlink_cbk, + priv->xl_array[tmp_list[index]], + priv->xl_array[tmp_list[index]]->fops->unlink, + &local->loc2); + if (!--callcnt) + break; + } + } + + FREE (tmp_list); + return 0; + } + if (tmp_list) + FREE (tmp_list); + } + } + + /* Need not send 'unlink' to storage node */ + unify_local_wipe (local); + STACK_UNWIND (frame, local->op_ret, + local->op_errno, &local->stbuf); + } + + return 0; +} + +int32_t +unify_ns_rename_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + int32_t index = 0; + int32_t callcnt = 0; + int16_t *list = NULL; + unify_private_t *priv = this->private; + unify_local_t *local = frame->local; + + if (op_ret == -1) { + /* Free local->new_inode */ + gf_log (this->name, GF_LOG_ERROR, + "namespace: path(%s -> %s): %s", + local->loc1.path, local->loc2.path, + strerror (op_errno)); + + unify_local_wipe (local); + STACK_UNWIND (frame, op_ret, op_errno, buf); + return 0; + } + + local->stbuf = *buf; + local->st_ino = buf->st_ino; + + /* Everything is fine. */ + if (S_ISDIR (buf->st_mode)) { + local->call_count = priv->child_count; + for (index=0; index < priv->child_count; index++) { + STACK_WIND (frame, + unify_rename_cbk, + priv->xl_array[index], + priv->xl_array[index]->fops->rename, + &local->loc1, + &local->loc2); + } + + return 0; + } + + local->call_count = 0; + /* send rename */ + list = local->list; + for (index=0; list[index] != -1; index++) { + if (NS(this) != priv->xl_array[list[index]]) { + local->call_count++; + callcnt++; + } + } + + if (local->call_count) { + for (index=0; list[index] != -1; index++) { + if (NS(this) != priv->xl_array[list[index]]) { + STACK_WIND (frame, + unify_rename_cbk, + priv->xl_array[list[index]], + priv->xl_array[list[index]]->fops->rename, + &local->loc1, + &local->loc2); + if (!--callcnt) + break; + } + } + } else { + /* file doesn't seem to be present in storage nodes */ + gf_log (this->name, GF_LOG_CRITICAL, + "CRITICAL: source file not in storage node, " + "rename successful on namespace :O"); + unify_local_wipe (local); + STACK_UNWIND (frame, -1, EIO, NULL); + } + return 0; +} + + +/** + * unify_rename - One of the tricky function. The deadliest of all :O + */ +int32_t +unify_rename (call_frame_t *frame, + xlator_t *this, + loc_t *oldloc, + loc_t *newloc) +{ + unify_local_t *local = NULL; + uint64_t tmp_list = 0; + + /* Initialization */ + INIT_LOCAL (frame, local); + loc_copy (&local->loc1, oldloc); + loc_copy (&local->loc2, newloc); + + if ((local->loc1.path == NULL) || + (local->loc2.path == NULL)) { + gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O"); + STACK_UNWIND (frame, -1, ENOMEM, NULL); + return 0; + } + + inode_ctx_get (oldloc->inode, this, &tmp_list); + local->list = (int16_t *)(long)tmp_list; + + STACK_WIND (frame, + unify_ns_rename_cbk, + NS(this), + NS(this)->fops->rename, + oldloc, + newloc); + return 0; +} + +/** + * unify_link_cbk - + */ +int32_t +unify_link_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf) +{ + unify_local_t *local = frame->local; + + if (op_ret >= 0) + local->stbuf = *buf; + local->stbuf.st_ino = local->st_ino; + + unify_local_wipe (local); + STACK_UNWIND (frame, op_ret, op_errno, inode, &local->stbuf); + + return 0; +} + +/** + * unify_ns_link_cbk - + */ +int32_t +unify_ns_link_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf) +{ + unify_private_t *priv = this->private; + unify_local_t *local = frame->local; + int16_t *list = local->list; + int16_t index = 0; + + if (op_ret == -1) { + /* No need to send link request to other servers, + * as namespace action failed + */ + gf_log (this->name, GF_LOG_ERROR, + "namespace: path(%s -> %s): %s", + local->loc1.path, local->loc2.path, + strerror (op_errno)); + unify_local_wipe (local); + STACK_UNWIND (frame, op_ret, op_errno, inode, buf); + return 0; + } + + /* Update inode for this entry */ + local->op_ret = 0; + local->st_ino = buf->st_ino; + + /* Send link request to the node now */ + for (index = 0; list[index] != -1; index++) { + char need_break = (list[index+1] == -1); + if (priv->xl_array[list[index]] != NS (this)) { + STACK_WIND (frame, + unify_link_cbk, + priv->xl_array[list[index]], + priv->xl_array[list[index]]->fops->link, + &local->loc1, + &local->loc2); + } + if (need_break) + break; + } + + return 0; +} + +/** + * unify_link - + */ +int32_t +unify_link (call_frame_t *frame, + xlator_t *this, + loc_t *oldloc, + loc_t *newloc) +{ + unify_local_t *local = NULL; + uint64_t tmp_list = 0; + + UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (oldloc); + UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (newloc); + + /* Initialization */ + INIT_LOCAL (frame, local); + + loc_copy (&local->loc1, oldloc); + loc_copy (&local->loc2, newloc); + + inode_ctx_get (oldloc->inode, this, &tmp_list); + local->list = (int16_t *)(long)tmp_list; + + STACK_WIND (frame, + unify_ns_link_cbk, + NS(this), + NS(this)->fops->link, + oldloc, + newloc); + + return 0; +} + + +/** + * unify_checksum_cbk - + */ +int32_t +unify_checksum_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + uint8_t *fchecksum, + uint8_t *dchecksum) +{ + STACK_UNWIND (frame, op_ret, op_errno, fchecksum, dchecksum); + + return 0; +} + +/** + * unify_checksum - + */ +int32_t +unify_checksum (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flag) +{ + STACK_WIND (frame, + unify_checksum_cbk, + NS(this), + NS(this)->fops->checksum, + loc, + flag); + + return 0; +} + + +/** + * unify_finodelk_cbk - + */ +int +unify_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + +/** + * unify_finodelk + */ +int +unify_finodelk (call_frame_t *frame, xlator_t *this, + fd_t *fd, int cmd, struct flock *flock) +{ + UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd); + xlator_t *child = NULL; + uint64_t tmp_child = 0; + + fd_ctx_get (fd, this, &tmp_child); + child = (xlator_t *)(long)tmp_child; + + STACK_WIND (frame, unify_finodelk_cbk, + child, child->fops->finodelk, + fd, cmd, flock); + + return 0; +} + + + +/** + * unify_fentrylk_cbk - + */ +int +unify_fentrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + +/** + * unify_fentrylk + */ +int +unify_fentrylk (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *basename, + entrylk_cmd cmd, entrylk_type type) + +{ + UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd); + xlator_t *child = NULL; + uint64_t tmp_child = 0; + + fd_ctx_get (fd, this, &tmp_child); + child = (xlator_t *)(long)tmp_child; + + STACK_WIND (frame, unify_fentrylk_cbk, + child, child->fops->fentrylk, + fd, basename, cmd, type); + + return 0; +} + + + +/** + * unify_fxattrop_cbk - + */ +int +unify_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xattr) +{ + STACK_UNWIND (frame, op_ret, op_errno, xattr); + return 0; +} + +/** + * unify_fxattrop + */ +int +unify_fxattrop (call_frame_t *frame, xlator_t *this, + fd_t *fd, gf_xattrop_flags_t optype, dict_t *xattr) +{ + UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd); + xlator_t *child = NULL; + uint64_t tmp_child = 0; + + fd_ctx_get (fd, this, &tmp_child); + child = (xlator_t *)(long)tmp_child; + + STACK_WIND (frame, unify_fxattrop_cbk, + child, child->fops->fxattrop, + fd, optype, xattr); + + return 0; +} + + +/** + * unify_inodelk_cbk - + */ +int +unify_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + + +/** + * unify_inodelk + */ +int +unify_inodelk (call_frame_t *frame, xlator_t *this, + loc_t *loc, int cmd, struct flock *flock) +{ + xlator_t *child = NULL; + + child = unify_loc_subvol (loc, this); + + STACK_WIND (frame, unify_inodelk_cbk, + child, child->fops->inodelk, + loc, cmd, flock); + + return 0; +} + + + +/** + * unify_entrylk_cbk - + */ +int +unify_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + +/** + * unify_entrylk + */ +int +unify_entrylk (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *basename, + entrylk_cmd cmd, entrylk_type type) + +{ + xlator_t *child = NULL; + + child = unify_loc_subvol (loc, this); + + STACK_WIND (frame, unify_entrylk_cbk, + child, child->fops->entrylk, + loc, basename, cmd, type); + + return 0; +} + + + +/** + * unify_xattrop_cbk - + */ +int +unify_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xattr) +{ + STACK_UNWIND (frame, op_ret, op_errno, xattr); + return 0; +} + +/** + * unify_xattrop + */ +int +unify_xattrop (call_frame_t *frame, xlator_t *this, + loc_t *loc, gf_xattrop_flags_t optype, dict_t *xattr) +{ + xlator_t *child = NULL; + + child = unify_loc_subvol (loc, this); + + STACK_WIND (frame, unify_xattrop_cbk, + child, child->fops->xattrop, + loc, optype, xattr); + + return 0; +} + + +/** + * notify + */ +int32_t +notify (xlator_t *this, + int32_t event, + void *data, + ...) +{ + unify_private_t *priv = this->private; + struct sched_ops *sched = NULL; + + if (!priv) { + return 0; + } + + sched = priv->sched_ops; + if (!sched) { + gf_log (this->name, GF_LOG_CRITICAL, "No scheduler :O"); + raise (SIGTERM); + return 0; + } + if (priv->namespace == data) { + if (event == GF_EVENT_CHILD_UP) { + sched->notify (this, event, data); + } + return 0; + } + + switch (event) + { + case GF_EVENT_CHILD_UP: + { + /* Call scheduler's update () to enable it for scheduling */ + sched->notify (this, event, data); + + LOCK (&priv->lock); + { + /* Increment the inode's generation, which is + used for self_heal */ + ++priv->inode_generation; + ++priv->num_child_up; + } + UNLOCK (&priv->lock); + + if (!priv->is_up) { + default_notify (this, event, data); + priv->is_up = 1; + } + } + break; + case GF_EVENT_CHILD_DOWN: + { + /* Call scheduler's update () to disable the child node + * for scheduling + */ + sched->notify (this, event, data); + LOCK (&priv->lock); + { + --priv->num_child_up; + } + UNLOCK (&priv->lock); + + if (priv->num_child_up == 0) { + /* Send CHILD_DOWN to upper layer */ + default_notify (this, event, data); + priv->is_up = 0; + } + } + break; + + default: + { + default_notify (this, event, data); + } + break; + } + + return 0; +} + +/** + * init - This function is called first in the xlator, while initializing. + * All the config file options are checked and appropriate flags are set. + * + * @this - + */ +int32_t +init (xlator_t *this) +{ + int32_t ret = 0; + int32_t count = 0; + data_t *scheduler = NULL; + data_t *data = NULL; + xlator_t *ns_xl = NULL; + xlator_list_t *trav = NULL; + xlator_list_t *xlparent = NULL; + xlator_list_t *parent = NULL; + unify_private_t *_private = NULL; + + /* Check for number of child nodes, if there is no child nodes, exit */ + if (!this->children) { + gf_log (this->name, GF_LOG_ERROR, + "No child nodes specified. check \"subvolumes \" " + "option in volfile"); + return -1; + } + + if (!this->parents) { + gf_log (this->name, GF_LOG_WARNING, + "dangling volume. check volfile "); + } + + /* Check for 'scheduler' in volume */ + scheduler = dict_get (this->options, "scheduler"); + if (!scheduler) { + gf_log (this->name, GF_LOG_ERROR, + "\"option scheduler <x>\" is missing in volfile"); + return -1; + } + + /* Setting "option namespace <node>" */ + data = dict_get (this->options, "namespace"); + if(!data) { + gf_log (this->name, GF_LOG_CRITICAL, + "namespace option not specified, Exiting"); + return -1; + } + /* Search namespace in the child node, if found, exit */ + trav = this->children; + while (trav) { + if (strcmp (trav->xlator->name, data->data) == 0) + break; + trav = trav->next; + } + if (trav) { + gf_log (this->name, GF_LOG_CRITICAL, + "namespace node used as a subvolume, Exiting"); + return -1; + } + + /* Search for the namespace node, if found, continue */ + ns_xl = this->next; + while (ns_xl) { + if (strcmp (ns_xl->name, data->data) == 0) + break; + ns_xl = ns_xl->next; + } + if (!ns_xl) { + gf_log (this->name, GF_LOG_CRITICAL, + "namespace node not found in volfile, Exiting"); + return -1; + } + + gf_log (this->name, GF_LOG_DEBUG, + "namespace node specified as %s", data->data); + + _private = CALLOC (1, sizeof (*_private)); + ERR_ABORT (_private); + _private->sched_ops = get_scheduler (this, scheduler->data); + if (!_private->sched_ops) { + gf_log (this->name, GF_LOG_CRITICAL, + "Error while loading scheduler. Exiting"); + FREE (_private); + return -1; + } + + if (ns_xl->parents) { + gf_log (this->name, GF_LOG_CRITICAL, + "Namespace node should not be a child of any other node. Exiting"); + FREE (_private); + return -1; + } + + _private->namespace = ns_xl; + + /* update _private structure */ + { + count = 0; + trav = this->children; + /* Get the number of child count */ + while (trav) { + count++; + trav = trav->next; + } + + gf_log (this->name, GF_LOG_DEBUG, + "Child node count is %d", count); + + _private->child_count = count; + if (count == 1) { + /* TODO: Should I error out here? */ + gf_log (this->name, GF_LOG_CRITICAL, + "WARNING: You have defined only one " + "\"subvolumes\" for unify volume. It may not " + "be the desired config, review your volume " + "volfile. If this is how you are testing it," + " you may hit some performance penalty"); + } + + _private->xl_array = CALLOC (1, + sizeof (xlator_t) * (count + 1)); + ERR_ABORT (_private->xl_array); + + count = 0; + trav = this->children; + while (trav) { + _private->xl_array[count++] = trav->xlator; + trav = trav->next; + } + _private->xl_array[count] = _private->namespace; + + /* self-heal part, start with generation '1' */ + _private->inode_generation = 1; + /* Because, Foreground part is tested well */ + _private->self_heal = ZR_UNIFY_FG_SELF_HEAL; + data = dict_get (this->options, "self-heal"); + if (data) { + if (strcasecmp (data->data, "off") == 0) + _private->self_heal = ZR_UNIFY_SELF_HEAL_OFF; + + if (strcasecmp (data->data, "foreground") == 0) + _private->self_heal = ZR_UNIFY_FG_SELF_HEAL; + + if (strcasecmp (data->data, "background") == 0) + _private->self_heal = ZR_UNIFY_BG_SELF_HEAL; + } + + /* optimist - ask bulde for more about it */ + data = dict_get (this->options, "optimist"); + if (data) { + if (gf_string2boolean (data->data, + &_private->optimist) == -1) { + gf_log (this->name, GF_LOG_ERROR, + "optimist excepts only boolean " + "options"); + } + } + + LOCK_INIT (&_private->lock); + } + + /* Now that everything is fine. */ + this->private = (void *)_private; + { + /* Initialize scheduler, if everything else is successful */ + ret = _private->sched_ops->init (this); + if (ret == -1) { + gf_log (this->name, GF_LOG_CRITICAL, + "Initializing scheduler failed, Exiting"); + FREE (_private); + return -1; + } + + ret = 0; + + /* This section is required because some fops may look + * for 'xl->parent' variable + */ + xlparent = CALLOC (1, sizeof (*xlparent)); + xlparent->xlator = this; + if (!ns_xl->parents) { + ns_xl->parents = xlparent; + } else { + parent = ns_xl->parents; + while (parent->next) + parent = parent->next; + parent->next = xlparent; + } + /* Initialize the namespace volume */ + if (!ns_xl->ready) { + ret = xlator_tree_init (ns_xl); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "initializing namespace node failed, " + "Exiting"); + FREE (_private); + return -1; + } + } + } + + /* Tell namespace node that init is done */ + ns_xl->notify (ns_xl, GF_EVENT_PARENT_UP, this); + + return 0; +} + +/** + * fini - Free all the allocated memory + */ +void +fini (xlator_t *this) +{ + unify_private_t *priv = this->private; + priv->sched_ops->fini (this); + this->private = NULL; + LOCK_DESTROY (&priv->lock); + FREE (priv->xl_array); + FREE (priv); + return; +} + + +struct xlator_fops fops = { + .stat = unify_stat, + .chmod = unify_chmod, + .readlink = unify_readlink, + .mknod = unify_mknod, + .mkdir = unify_mkdir, + .unlink = unify_unlink, + .rmdir = unify_rmdir, + .symlink = unify_symlink, + .rename = unify_rename, + .link = unify_link, + .chown = unify_chown, + .truncate = unify_truncate, + .create = unify_create, + .open = unify_open, + .readv = unify_readv, + .writev = unify_writev, + .statfs = unify_statfs, + .flush = unify_flush, + .fsync = unify_fsync, + .setxattr = unify_setxattr, + .getxattr = unify_getxattr, + .removexattr = unify_removexattr, + .opendir = unify_opendir, + .readdir = unify_readdir, + .fsyncdir = unify_fsyncdir, + .access = unify_access, + .ftruncate = unify_ftruncate, + .fstat = unify_fstat, + .lk = unify_lk, + .fchown = unify_fchown, + .fchmod = unify_fchmod, + .utimens = unify_utimens, + .lookup = unify_lookup, + .getdents = unify_getdents, + .checksum = unify_checksum, + .inodelk = unify_inodelk, + .finodelk = unify_finodelk, + .entrylk = unify_entrylk, + .fentrylk = unify_fentrylk, + .xattrop = unify_xattrop, + .fxattrop = unify_fxattrop +}; + +struct xlator_mops mops = { +}; + +struct xlator_cbks cbks = { +}; + +struct volume_options options[] = { + { .key = { "namespace" }, + .type = GF_OPTION_TYPE_XLATOR + }, + { .key = { "scheduler" }, + .value = { "alu", "rr", "random", "nufa", "switch" }, + .type = GF_OPTION_TYPE_STR + }, + { .key = {"self-heal"}, + .value = { "foreground", "background", "off" }, + .type = GF_OPTION_TYPE_STR + }, + /* TODO: remove it some time later */ + { .key = {"optimist"}, + .type = GF_OPTION_TYPE_BOOL + }, + + { .key = {NULL} }, +}; diff --git a/xlators/cluster/unify/src/unify.h b/xlators/cluster/unify/src/unify.h new file mode 100644 index 000000000..bc18dc53f --- /dev/null +++ b/xlators/cluster/unify/src/unify.h @@ -0,0 +1,132 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#ifndef _UNIFY_H +#define _UNIFY_H + +#include "scheduler.h" +#include "list.h" + +#define MAX_DIR_ENTRY_STRING (32 * 1024) + +#define ZR_UNIFY_SELF_HEAL_OFF 0 +#define ZR_UNIFY_FG_SELF_HEAL 1 +#define ZR_UNIFY_BG_SELF_HEAL 2 + +/* Sometimes one should use completely random numbers.. its good :p */ +#define UNIFY_SELF_HEAL_GETDENTS_COUNT 1024 + +#define NS(xl) (((unify_private_t *)xl->private)->namespace) + +/* This is used to allocate memory for local structure */ +#define INIT_LOCAL(fr, loc) \ +do { \ + loc = CALLOC (1, sizeof (unify_local_t)); \ + ERR_ABORT (loc); \ + if (!loc) { \ + STACK_UNWIND (fr, -1, ENOMEM); \ + return 0; \ + } \ + fr->local = loc; \ + loc->op_ret = -1; \ + loc->op_errno = ENOENT; \ +} while (0) + + + +struct unify_private { + /* Update this structure depending on requirement */ + void *scheduler; /* THIS SHOULD BE THE FIRST VARIABLE, + if xlator is using scheduler */ + struct sched_ops *sched_ops; /* Scheduler options */ + xlator_t *namespace; /* ptr to namespace xlator */ + xlator_t **xl_array; + gf_boolean_t optimist; + int16_t child_count; + int16_t num_child_up; + uint8_t self_heal; + uint8_t is_up; + uint64_t inode_generation; + gf_lock_t lock; +}; +typedef struct unify_private unify_private_t; + +struct unify_self_heal_struct { + uint8_t dir_checksum[ZR_FILENAME_MAX]; + uint8_t ns_dir_checksum[ZR_FILENAME_MAX]; + uint8_t file_checksum[ZR_FILENAME_MAX]; + uint8_t ns_file_checksum[ZR_FILENAME_MAX]; + off_t *offset_list; + int *count_list; + dir_entry_t **entry_list; +}; + + +struct _unify_local_t { + int32_t call_count; + int32_t op_ret; + int32_t op_errno; + mode_t mode; + off_t offset; + dev_t dev; + uid_t uid; + gid_t gid; + int32_t flags; + int32_t entry_count; + int32_t count; // dir_entry_t count; + fd_t *fd; + struct stat stbuf; + struct statvfs statvfs_buf; + struct timespec tv[2]; + char *name; + int32_t revalidate; + + ino_t st_ino; + nlink_t st_nlink; + + dict_t *dict; + + int16_t *list; + int16_t *new_list; /* Used only in case of rename */ + int16_t index; + + int32_t failed; + int32_t return_eio; /* Used in case of different st-mode + present for a given path */ + + uint64_t inode_generation; /* used to store the per directory + * inode_generation. Got from inode's ctx + * of directory inodes + */ + + struct unify_self_heal_struct *sh_struct; + loc_t loc1, loc2; +}; +typedef struct _unify_local_t unify_local_t; + +int32_t zr_unify_self_heal (call_frame_t *frame, + xlator_t *this, + unify_local_t *local); + +#endif /* _UNIFY_H */ |